forked from shanealynn/python_batch_geocode
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpython_batch_geocoding.py
More file actions
214 lines (184 loc) · 9.54 KB
/
python_batch_geocoding.py
File metadata and controls
214 lines (184 loc) · 9.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
Python script for batch geocoding of addresses using the Google Geocoding API.
This script allows for massive lists of addresses to be geocoded for free by pausing when the
geocoder hits the free rate limit set by Google (2500 per day). If you have an API key for paid
geocoding from Google, set it in the API key section.
Addresses for geocoding can be specified in a list of strings "addresses". In this script, addresses
come from a csv file with a column "Address". Adjust the code to your own requirements as needed.
After every 500 successful geocode operations, a temporary file with results is recorded in case of
script failure / loss of connection later.
Addresses and data are held in memory, so this script may need to be adjusted to process files line
by line if you are processing millions of entries.
Shane Lynn
5th November 2016
Updated with Google Web lookup
Saurabh Gupta
January 2018
"""
import pandas as pd
import requests
import logging
import time
import os
#------------------ CONFIGURATION -------------------------------
# Set your Google API key here, or (preferably) export it in the GOOGLE_API_KEY environment
# variable, which takes precedence over the value written in this file.
# Even if using the free 2500 queries a day, it's worth getting an API key since the rate limit is 50 / second.
# With API_KEY = None, you will run into a 2 second delay every 10 requests or so.
# With a "Google Maps Geocoding API" key from https://console.developers.google.com/apis/,
# the daily limit will be 2500, but at a much faster rate.
# Example: API_KEY = 'AIzaSyC9azed9tLdjpZNjg2_kVePWvMIBq154eA'
# NOTE(security): the literal fallback below is a credential stored in source control. It is kept
# only so existing runs behave as before; rotate it and rely on the environment variable instead.
API_KEY = os.environ.get('GOOGLE_API_KEY') or 'AIzaSyD0H_G1JKCgklUtvDFVcdoMtto3ooyalZ8'
# Backoff time sets how many minutes to wait between google pings when your API limit is hit
BACKOFF_TIME = 30
# Set your output file name here.
output_filename = 'data/output.csv'
# Set your input file here
input_filename = "data/input.csv"
# Specify the column name in your input data that contains addresses here
address_column_name = "Address"
# Label written for the index column of the output (and temporary) csv files
index_label = "SNo"
# Write temp file after this many records
temp_file_dump_after = 500
# Write counters after this many records
counts_show_after = 100
# Return Full Google Results? If True, full JSON results from Google are included in output
RETURN_FULL_RESULTS = False
# The level of the logger
log_level = logging.DEBUG
#------------------ Logging -------------------------------------
# Console handler: emits records at the configured level to the default stream.
ch = logging.StreamHandler()
ch.setLevel(log_level)
# Module logger: filtered at the same level and wired to the console handler above.
logger = logging.getLogger(__name__)
logger.setLevel(log_level)
logger.addHandler(ch)
#------------------ DATA LOADING --------------------------------
# Load the whole input csv into a Pandas Dataframe (everything is held in memory).
data = pd.read_csv(input_filename, encoding='utf8')
if address_column_name in data.columns:
    # Flatten the configured address column into a plain list,
    # one entry per input row, ready to be geocoded in order.
    addresses = data[address_column_name].tolist()
else:
    # Without the address column there is nothing to geocode - stop immediately.
    raise ValueError("Missing Address column in input data")
#------------------ FUNCTION DEFINITIONS ------------------------
# Note that the results are a little different because of personalized and local search results.
def lookup_web(address, output, logger):
    """
    Look up an address with the websearch utility and annotate the geocode output with it.

    The number of web results is stored to give a confidence number for the address, and any
    spelling suggestion google has for the address is stored as well.

    @param address: String address to search the web for.
    @param output: Dict of geocode results for the address. It is updated in place.
    @param logger: Logger used for the debug message.
    @return: The same `output` dict, with 'number_of_web_results' always set (None when the
             search gave no count) and 'correction' set only when a suggestion was returned.
    """
    # Imported here so the websearch module is only required when this function is actually called.
    from websearch import search_item

    # NOTE(review): search_item is presumably dict-like (it is read with .get below) - confirm.
    # An empty / None answer is treated as "no information" instead of raising AttributeError.
    web_answer = search_item(address) or {}
    logger.debug("Web Look Done")

    correction = web_answer.get("correction")
    if correction:
        output["correction"] = correction
    output['number_of_web_results'] = web_answer.get("formattedTotalResults")
    return output
def get_google_results(address, api_key=None, return_full_response=False, timeout=30):
    """
    Get geocode results from Google Maps Geocoding API.

    Note, that in the case of multiple google geocode results, this function returns details of the FIRST result.

    @param address: String address as accurate as possible. For Example "18 Grafton Street, Dublin, Ireland"
    @param api_key: String API key if present from google.
                    If supplied, requests will use your allowance from the Google API. If not, you
                    will be limited to the free usage of 2500 requests per day.
    @param return_full_response: Boolean to indicate if you'd like to return the full response from google. This
                    is useful if you'd like additional location details for storage or parsing later.
    @param timeout: Seconds to wait for Google to answer before requests raises a timeout error,
                    so a stalled connection cannot hang the whole batch.
    @return: Dict with the keys formatted_address, latitude, longitude, accuracy, google_place_id,
             type, postcode, input_string, number_of_results and status (plus 'response' when
             return_full_response is set). The location fields are None when nothing was found.
    """
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"
    # Let requests build the query string: characters such as '&', '#', '+' or non-ASCII text in
    # the address are percent-encoded instead of being pasted raw into the URL and corrupting it.
    params = {"address": str(address)}
    if api_key is not None:
        params["key"] = api_key

    # Ping google for the results; they come back as JSON - convert to dict using requests functionality
    response = requests.get(geocode_url, params=params, timeout=timeout)
    results = response.json()

    # An error payload may carry no (or an empty) 'results' list - treat both as "nothing found".
    matches = results.get('results') or []

    if not matches:
        output = {
            "formatted_address": None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "google_place_id": None,
            "type": None,
            "postcode": None,
        }
    else:
        answer = matches[0]
        # Fall back to empty containers so one missing section cannot crash the lookup.
        geometry = answer.get('geometry') or {}
        location = geometry.get('location') or {}
        output = {
            "formatted_address": answer.get('formatted_address'),
            "latitude": location.get('lat'),
            "longitude": location.get('lng'),
            "accuracy": geometry.get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types') or []),
            "postcode": ",".join(
                component.get('long_name', '')
                for component in (answer.get('address_components') or [])
                if 'postal_code' in (component.get('types') or [])
            ),
        }

    # Append some other details:
    output['input_string'] = address
    output['number_of_results'] = len(matches)
    output['status'] = results.get('status')
    if return_full_response:
        output['response'] = results

    return output
#------------------ PROCESSING LOOP -----------------------------
# Ensure, before we start, that the API key is ok/valid, and internet access is ok
test_result = get_google_results("London, England", API_KEY, RETURN_FULL_RESULTS)
if (test_result['status'] != 'OK') or (test_result['formatted_address'] != 'London, UK'):
    logger.warning("There was an error when testing the Google Geocoder.")
    raise ConnectionError('Problem with test results from Google Geocode - check your API key and internet connection.')

# Temporary backup file that receives the partial results while the loop is running
temp_filename = "{}_bak".format(output_filename)

# Create a list to hold results
results = []

# Go through each address in turn
for address in addresses:
    # Keep asking until we hold a usable answer for this address (or give up on it)
    geocode_result = None
    while geocode_result is None:
        # Geocode the address with google
        try:
            candidate = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception as e:
            logger.exception(e)
            logger.error("Major error with {}".format(address))
            logger.error("Skipping!")
            # Really skip: leave the retry loop with no result, so nothing stale or
            # undefined from a previous address is inspected or stored below.
            break

        # If we're over the API limit, backoff for a while and try again later.
        if candidate['status'] == 'OVER_QUERY_LIMIT':
            logger.info("Hit Query Limit! Backing off for a bit.")
            time.sleep(BACKOFF_TIME * 60)  # BACKOFF_TIME is in minutes
            continue

        geocode_result = candidate

    if geocode_result is None:
        # The address was skipped after an error - move on to the next one.
        continue

    # If we're ok with API use, save the results
    # Note that the results might be empty / non-ok - log this
    if geocode_result['status'] != 'OK':
        logger.warning("Error geocoding {}: {}".format(address, geocode_result['status']))
        logger.info("Looking up google web custom search for address: {}".format(address))
        try:
            geocode_result = lookup_web(address, geocode_result, logger)
        except Exception as e:
            # The web lookup only adds extra hints; a failure there must not abort the
            # whole batch or lose the geocode answer we already have.
            logger.exception(e)
            logger.error("Web lookup failed for {}".format(address))
        logger.debug("{} results found".format(geocode_result.get("number_of_web_results")))
    else:
        logger.debug("Geocoded: {}: {}".format(address, geocode_result['status']))
    results.append(geocode_result)

    # Print status every counts_show_after addresses
    if len(results) % counts_show_after == 0:
        logger.info("Completed {} of {} address".format(len(results), len(addresses)))

    # Every temp_file_dump_after addresses, save progress to file (in case of a failure so you have something!)
    if len(results) % temp_file_dump_after == 0:
        logger.debug("Writing Temp File")
        pd.DataFrame(results).to_csv(temp_filename, encoding='utf8', index_label=index_label)

# All done
logger.info("Finished geocoding all addresses")

# Write the full results to csv using the pandas library.
logger.info("Writing Output File")
pd.DataFrame(results).to_csv(output_filename, encoding='utf8', index_label=index_label)

# Cleanup if needed and if actual file has a modified timestamp after the temp file
logger.debug("Cleanup Temp File")
if os.path.isfile(temp_filename) and os.path.isfile(output_filename):
    if os.path.getmtime(output_filename) > os.path.getmtime(temp_filename):
        os.remove(temp_filename)
    else:
        logger.debug("It seems temp file is newer than output file. Output file needs checking")