Here we review different techniques for handling errors when geocoding addresses.
import os

from geopy import GoogleV3

# Prefer a key supplied through the GOOGLE_API_KEY environment variable so the
# code can run with your own credentials without editing the source.
# NOTE(review): the literal fallback is a credential committed to the file;
# it should be revoked and removed once the environment variable is in use.
api_key = (
    os.environ.get('GOOGLE_API_KEY')
    or 'AIzaSyDNqc0tWzXHx_wIp1w75-XTcCk4BSphB5w')
# Short alias for the bound geocoding method used by the rest of the file.
geocode = GoogleV3(api_key).geocode
If an address is incomplete, the geocoder may not be able to resolve it to a unique location.
# Street address only: no city, state or zip code.
address = '236-238 25TH STREET'
# True only when the geocoder returned a result for the bare street address.
geocode(address) is not None
# The same street address with a city and state appended.
address = '236-238 25TH STREET, BROOKLYN, NY'
geocode(address)
# The same street address again, but in a different city.
address = '236-238 25TH STREET, BRONX, NY'
geocode(address)
You can use the usaddress package to detect whether the city is missing.
import subprocess
import sys

# Install usaddress into the interpreter that is running this code.
# ``sys.executable -m pip`` avoids picking up an unrelated ``pip`` from PATH.
# ``check_call`` raises CalledProcessError on a non-zero exit status, so the
# install no longer sits inside an ``assert`` (which ``python -O`` removes
# together with the command it wraps).
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'usaddress'])
# Start again from the street address that has no city, state or zip code.
address = '236-238 25TH STREET'
address

import usaddress

# usaddress.parse yields (token, label) pairs for the address string.
parts = usaddress.parse(address)
parts

# Index the tokens by label; a repeated label keeps only its last token,
# which is enough for the presence checks below.
value_by_type = {label: token for token, label in parts}
value_by_type

missing_place, missing_state, missing_zip = (
    label not in value_by_type
    for label in ('PlaceName', 'StateName', 'ZipCode'))
# Only when the address carries no locality information at all do we append
# a default region.
if missing_place and missing_state and missing_zip:
    address = address + ', Brooklyn, NY'
address

geocode(address)
import os

from geopy import GoogleV3

# Prefer a key supplied through the GOOGLE_API_KEY environment variable so the
# code can run with your own credentials without editing the source.
# NOTE(review): the literal fallback is a credential committed to the file;
# it should be revoked and removed once the environment variable is in use.
api_key = (
    os.environ.get('GOOGLE_API_KEY')
    or 'AIzaSyDNqc0tWzXHx_wIp1w75-XTcCk4BSphB5w')
# Short alias for the bound geocoding method used by the rest of the file.
geocode = GoogleV3(api_key).geocode
import subprocess
import sys

# Install usaddress into the interpreter that is running this code.
# ``sys.executable -m pip`` avoids picking up an unrelated ``pip`` from PATH.
# ``check_call`` raises CalledProcessError on a non-zero exit status, so the
# install no longer sits inside an ``assert`` (which ``python -O`` removes
# together with the command it wraps).
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'usaddress'])
import numpy as np
from usaddress import parse as parse_address
def fix_address(address, default_region):
    """Append ``default_region`` to ``address`` when it names no locality.

    The address is tagged with ``parse_address``. If none of the resulting
    labels is ``PlaceName``, ``StateName`` or ``ZipCode``, the address is
    returned with ``', ' + default_region`` appended; otherwise it is
    returned unchanged.
    """
    labels_found = {label for _token, label in parse_address(address)}
    has_locality = not labels_found.isdisjoint(
        ('PlaceName', 'StateName', 'ZipCode'))
    if has_locality:
        return address
    return address + ', ' + default_region
def get_location(row):
    """Geocode one table row and return a copy with coordinates added.

    The row's ``'address'`` value is completed with ``fix_address`` (default
    region ``'New York, NY'``) and passed to ``geocode``.

    Returns a copy of ``row`` with ``'longitude'`` and ``'latitude'`` set to
    the geocoded coordinates, or to NaN when the geocoder returns ``None``.
    Every row therefore comes back with the same fields, so the result of
    ``DataFrame.apply(get_location, axis=1)`` has the same columns no matter
    which rows fail, and failed rows keep their address. Drop the failures
    afterwards with ``dropna(subset=['longitude', 'latitude'])``.
    """
    address = fix_address(row['address'], default_region='New York, NY')
    location = geocode(address)
    # Work on a copy: functions handed to DataFrame.apply should not mutate
    # the object they are given.
    located_row = row.copy()
    if location is None:
        located_row['longitude'] = np.nan
        located_row['latitude'] = np.nan
    else:
        located_row['longitude'] = location.longitude
        located_row['latitude'] = location.latitude
    return located_row
import pandas as pd
# Sample input with a single 'address' column. It mixes complete addresses,
# one without a city, and a non-address string ('abcdefg'), presumably to
# exercise the failure path.
address_table = pd.DataFrame({
    'address': [
        '118 West 22nd Street',
        '415 E 71st St, New York, NY',
        'abcdefg',
        '65-60 Kissena Blvd, Flushing, NY',
    ],
})
address_table

# Geocode row by row; axis='columns' is the named form of axis=1.
geolocated_table = address_table.apply(get_location, axis='columns')
geolocated_table

# Keep only the rows that have both coordinates.
clean_table = geolocated_table.dropna(subset=['latitude', 'longitude'])
clean_table
Although DataFrame.apply is recommended because it is more flexible, you can also build a pandas.Series when you want to define a new column but some values are missing.
import pandas as pd
from shapely.geometry import Point
# Collect WKT strings keyed by row index; rows that fail to geocode are
# simply left out of the mapping.
wkt_by_index = {}
for index, address in address_table['address'].items():
    location = geocode(address)
    if location:
        point = Point(location.longitude, location.latitude)
        wkt_by_index[index] = point.wkt
# Assigning a Series aligns on the index, so rows without an entry get a
# missing value in the new column.
address_table['wkt'] = pd.Series(wkt_by_index)
address_table