# CrossCompute
ready_table_path = 'Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
search_radius_in_miles = 0.5
industry_select = """
Manufacturing
Manufacturing
Wholesale/Warehouse/Distribution
Commercial
Landlord
Public Benefit Corp
Other
"""
program_select = """
ICIP
City/State
ICAP
ICIP
IDA
Relocator
Tenant
"""
target_folder = '/tmp'
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import subprocess
subprocess.call('pip install geopandas'.split())
subprocess.call('pip install dill'.split())
matplotlib.rcParams['figure.figsize'] = (10,10)
<font color=blue>Tasks: <br> 1. Feature selection: Company Name, Address, Industry, Program, Savings<br> 2. Smooth out the time factor by calculating "Periodic Savings over Months" from the effective date (rounded to month) to 2017-12-31
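<font color=blue>For illustration only, a minimal sketch of the time smoothing in task 2, mirroring the month conversion used in the code below; the effective date and dollar amount are made up for the example.
# Hypothetical example: $12,000 saved since an effective date of 01/15/2017
import pandas as pd
import numpy as np
example_effective_date = pd.to_datetime('01/15/2017', format='%m/%d/%Y')
example_month_count = int((pd.to_datetime('2017-12-31') - example_effective_date) / np.timedelta64(1, 'M'))
example_periodic_savings = round(12000 / example_month_count, 2)  # 11 months -> about 1090.91 per month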
# Prepare building data
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv?accessType=DOWNLOAD'
building_table = pd.read_csv(url1, na_values = 'n/a')
# Rename features for better readability
building_table = building_table.rename(columns={'Company Type': 'Program',
'Savings from begining receiving benefits': 'Savings'})
# Feature selection
# Come back to add features later if necessary
building_table = building_table[['Company Name', 'Address','Industry','Program',
'Effective Date','Savings',
'Borough','Latitude','Longitude']]
building_table[:3]
len(building_table)
# Drop rows when lonlat is NULL
building_table.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
len(building_table)
building_table.dtypes
from datetime import date
building_table['Effective Date'] = pd.to_datetime(building_table['Effective Date'].str.strip(), format='%m/%d/%Y')
building_table['Month Count'] = ((pd.to_datetime('2017-12-31') - building_table['Effective Date']) / np.timedelta64(1, 'M'))
building_table['Month Count'] = building_table['Month Count'].astype(int)
# Periodic Savings over months starting from effective date (round to month) to 2017-12-31
building_table['Periodic Savings over Months'] = (building_table['Savings'] / building_table['Month Count']).apply(lambda x: round(x, 2))
# Preview the table without the raw Savings column (not assigned, so building_table keeps Savings)
building_table.drop(['Savings'  # ,'Effective Date','Month Count'
                     ], axis=1)
building_table.head()
from shapely.geometry import Point
# Combining Latitude and Longitude to create company geo coordinates:
building_table['WKT'] = building_table[['Longitude', 'Latitude']].values.tolist()
# Convert each coordinate pair to a Point and store its WKT string
building_table['WKT'] = building_table['WKT'].apply(lambda xy: Point(xy).wkt)
building_table[:3]
# Choose a reference location and parse it back from WKT
from shapely import wkt
g = wkt.loads(building_table.iloc[0]['WKT'])
g.wkt
<font color=blue>Note: Building table feature selection is ready at this point.<br>The time factor is smoothed out. Please refer to the feature "Periodic Savings over Months" as a reliable statistic for all participants.<br>
<font color=blue>Tasks: <br>1. Feature selection: Longitude, Latitude, status, tree_id, to later count trees around each location in the building table<br> 2. Define a function to expedite data processing for the large dataset
# Load function
def load(
endpoint_url,
selected_columns=None,
buffer_size=1000,
search_term_by_column=None,
**kw,
):
buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
if selected_columns:
select_string = ','.join(selected_columns)
buffer_url += f'&$select={select_string}'
for column, search_term in (search_term_by_column or {}).items():
buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
print(buffer_url)
tables = []
if endpoint_url.endswith('.json'):
f = pd.read_json
else:
f = pd.read_csv
t = f(buffer_url, **kw)
while len(t):
print(len(tables) * buffer_size + len(t))
tables.append(t)
offset = buffer_size * len(tables)
t = f(buffer_url + f'&$offset={offset}', **kw)
return pd.concat(tables, sort=False)
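<font color=blue>For reference, a hedged sketch of how the optional search_term_by_column filter could be used; the spc_common column and the 'maple' search term are only illustrative and are not part of this analysis.
# Illustrative only: page through rows whose spc_common column contains 'maple'
maple_table = load(
    'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv',
    selected_columns=('tree_id', 'status', 'spc_common'),
    buffer_size=1000,
    search_term_by_column={'spc_common': 'maple'})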
%%time
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'status', 'Latitude', 'Longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
tree_table.head()
tree_table.dropna(subset=['Longitude', 'Latitude'], inplace=True)
# Keep only living trees (filter out stumps and dead trees)
tree_table = tree_table[tree_table['status'] == 'Alive']
tree_table[:3]
# Drop the status column now that the filter has been applied
tree_table = tree_table.drop(['status'], axis=1)
tree_table.head()
<font color=blue>Tree table is clean and ready at this point<br> Next step: import KDTree to get the total tree count around each location in the building table
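<font color=blue>Before applying the KDTree to the full dataset, here is a minimal planar sketch of the radius-count idea using scipy's cKDTree on made-up points; the pysal Arc KDTree used below exposes the same query interface but measures great-circle distance in miles.
import numpy as np
from scipy.spatial import cKDTree
toy_trees = [(0.0, 0.0), (0.1, 0.1), (5.0, 5.0)]
toy_index = cKDTree(toy_trees)
distances, indices = toy_index.query((0.0, 0.0), k=len(toy_trees), distance_upper_bound=1.0)
# Neighbors beyond the radius come back with infinite distance; drop them before counting
print(len(indices[~np.isinf(distances)]))  # 2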
# building_table.loc[building_table['Industry'] == industry_select].loc[building_table['Program'] == program_select]
# building_table = building_table.loc[building_table['Industry'] == "Commercial"].loc[building_table['Program'] == "ICIP"]
# Make kdtree
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES
tree_xys = tree_table[['Longitude', 'Latitude']].values
tree_xys
tree_count = len(tree_xys)
tree_count
from shapely import wkt
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
radius_in_miles = 0.5
# def get_tree_count(r):
# xy = r['Longitude'], r['Latitude']
# distances, indices = bin_tree.query(
# xy, k=tree_count, distance_upper_bound=radius_in_miles)
# indices = indices[~np.isnan(indices)]
# return len(indices)
def get_tree_count(r):
    # Parse the company's WKT point and count the trees within the search radius
    xy = wkt.loads(r['WKT']).coords[0]
    distances, indices = bin_tree.query(
        xy,
        k=tree_count,
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    return len(indices)
building_table['Total Tree Count within 0.5 Mile'] = building_table.apply(get_tree_count, axis=1)
building_table[:3]
# Save your output files in target_folder
target_folder = '/tmp'
target_path = target_folder + '/Table with tree count.csv'
building_table.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
building_table.to_csv('Table with tree count.csv', sep=',')
<font color=blue>Note: Tree count within 0.5 miles is attached as a new column to the building table at this point.<br> Ready table is saved to file 'Table with tree count.csv' for later analysis</font><br><br>
import pandas as pd
import numpy as np
table_tree_count = pd.read_csv('Table with tree count.csv',na_values='n/a')
table_tree_count = table_tree_count.drop(['Unnamed: 0'], axis=1)
table_tree_count[:3]
# Make kdtree
radius_in_miles = 0.5
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES
sav_xys = table_tree_count[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
def get_sav_average(r):
xy = r['Longitude'], r['Latitude']
distances, indices = bin_sav.query(
xy,
k=len(bin_sav.data),
distance_upper_bound=radius_in_miles)
indices = indices[~np.isnan(indices)]
indices = [int(x) for x in indices]
selected_sav_table = table_tree_count.loc[table_tree_count.index[indices]]
return selected_sav_table['Periodic Savings over Months'].mean()
table_tree_count['Periodic Savings within 0.5 Mile'] = table_tree_count.apply(get_sav_average, axis=1)
table_tree_count[:3]
# Save your output files in target_folder
target_path = target_folder + '/Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
table_tree_count.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
table_tree_count.to_csv(
    'Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv',
    sep=',', index=False)
<font color=blue>Note: Tree count and average monthly savings within 0.5 miles are attached to the building table at this point. The ready table is saved to file 'Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv' for later analysis
# Algorithm
# find dtypes
# Begin creating our model
# Rearrange columns
# NOTE: df is assumed to hold the building benefits table with its Postcode column
# (reload from url1 if needed); the columns below are then reordered for modeling.
df = df[['Industry', 'Program', 'Postcode', 'Borough', 'Savings']]
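<font color=blue>Note: Industry, Program, and Borough are text categories, so they would need to be encoded as numbers before the scikit-learn models below can fit them; a minimal one-hot encoding sketch (the exact feature/target split is left as in the cells that follow).
# Hedged sketch: one-hot encode the categorical columns; numeric columns pass through unchanged
encoded_df = pd.get_dummies(df, columns=['Industry', 'Program', 'Borough'])
encoded_df.head()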
#df.columns
# Split our validation dataset
array = df.values
X = array[:, 0:3]
Y = array[:, 3]
validation_size = .20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
X, Y, test_size = validation_size, random_state = seed)
# Spot check algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn with 10-fold cross-validation
scoring = 'accuracy'
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
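<font color=blue>Note: The metric imports at the top (accuracy_score, confusion_matrix, classification_report) are not used above; a hedged sketch of the usual follow-up, scoring one candidate model on the held-out validation split. KNN is chosen here only as an example.
# Fit one candidate on the training split and evaluate it on the validation split
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))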
# Save your output files in target_folder
target_path = target_folder + '/Table with periodic savings and tree count.csv'
building_table.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
# Export model
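<font color=blue>Note: dill was installed at the top of the notebook but never used; a minimal sketch of exporting a fitted model with it. The file name and the choice of the knn model from the sketch above are illustrative.
import dill
# Serialize the fitted model so a tool run can reload it later without retraining
model_path = target_folder + '/model.dill'  # illustrative path
with open(model_path, 'wb') as f:
    dill.dump(knn, f)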
# How is the tool made in cross compute?
# import sys
# 'geopandas' in sys.modules
# Convert WKT coordinates
# from geopandas import GeoDataFrame
# from shapely.geometry import Point
# geometry = [Point(xy) for xy in zip(building_table.Longitude, building_table.Latitude)]
# building_table = building_table.drop(['Longitude', 'Latitude'], axis=1)
# crs = {'init': 'epsg:4326'}
# gdf = GeoDataFrame(building_table, crs=crs, geometry=geometry)
# building_table.head()
import subprocess
subprocess.call('pip install geopandas'.split())
ready_table_path = 'Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
search_radius_in_miles = 0.5
user_address = '28-10 Jackson Ave'  # this address can be located
# user_address = '236-238 25TH STREET'  # this is an address the geocoder cannot locate
target_folder = '/tmp'
import pandas as pd
ready_table = pd.read_csv(ready_table_path,na_values='n/a')
from shapely.geometry import Point
# Combining Latitude and Longitude to create company geo coordinates:
ready_table['Coordinate'] = ready_table[['Longitude', 'Latitude']].values.tolist()
# Change the coordinates to a geoPoint
ready_table['Coordinate'] = ready_table['Coordinate'].apply(Point)
ready_table[:3]
# Save your output files in target_folder
target_path = target_folder + '/Table with geometry points.csv'
ready_table.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
ready_table.to_csv('Table with geometry points.csv', sep=',', index=False)
ready_table[:3]
<font color=blue>Note: geometry is attached to the ready table at this point. <br>Next step: Filter out addresses from the training dataset that can't be converted to coordinates, so the remaining companies can be displayed as reference companies on the map.
<font color=blue>Note: This step is necessary because the program will stop at addresses that can't be converted to geo coordinates, which would break the reference data display on the map.
input_filename = 'Table with geometry points.csv'
import pandas as pd
import requests
import logging
import time
logger = logging.getLogger('root')
logger.setLevel(logging.DEBUG)
# create console handler
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
#------------------ CONFIGURATION -------------------------------
API_KEY = 'YOUR_GOOGLE_MAPS_API_KEY'  # replace with your own Google Maps Geocoding API key
BACKOFF_TIME = 30
output_filename = 'Table with all addresses check.csv'
input_filename = 'Table with geometry points.csv'
address_column_name = 'Address'
RETURN_FULL_RESULTS = False
#------------------ DATA LOADING --------------------------------
data = pd.read_csv(input_filename, encoding='utf8')
if address_column_name not in data.columns:
raise ValueError('Missing Address column in input data')
addresses = data['Address'].tolist()
#------------------ FUNCTION DEFINITIONS ------------------------
def get_google_results(address, api_key=None, return_full_response=False):
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(address)
if api_key is not None:
geocode_url = geocode_url + '&key={}'.format(api_key)
results = requests.get(geocode_url)
results = results.json()
if len(results['results']) == 0:
output = {
'formatted_address' : None,
'latitude': None,
'longitude': None,
}
else:
answer = results['results'][0]
output = {
'formatted_address' : answer.get('formatted_address'),
'latitude': answer.get('geometry').get('location').get('lat'),
'longitude': answer.get('geometry').get('location').get('lng'),
}
# Append some other details:
output['input_string'] = address
output['status'] = results.get('status')
if return_full_response is True:
output['response'] = results
return output
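<font color=blue>For a quick sanity check before the full loop, the geocoder can be called on one address; the address below is the user_address from earlier, and the call needs a valid API_KEY and network access.
# Hedged single-call check of the geocoding helper
sample_result = get_google_results('28-10 Jackson Ave', API_KEY)
print(sample_result['status'], sample_result['latitude'], sample_result['longitude'])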
#------------------ PROCESSING LOOP -----------------------------
# Create a list to hold results
results = []
for address in addresses:
# While the address geocoding is not finished:
geocoded = False
while geocoded is not True:
# Geocode the address with google
        try:
            geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception as e:
            logger.exception(e)
            logger.error('Major error with {}'.format(address))
            logger.error('Skipping!')
            geocoded = True
            continue  # skip this address; geocode_result is undefined when the request itself fails
# If we're over the API limit, backoff for a while and try again later.
if geocode_result['status'] == 'OVER_QUERY_LIMIT':
logger.info('Hit Query Limit! Backing off for a bit.')
time.sleep(BACKOFF_TIME * 60) # sleep for 30 minutes
geocoded = False
else:
# If we're ok with API use, save the results
# Note that the results might be empty / non-ok - log this
if geocode_result['status'] != 'OK':
logger.warning('Error geocoding {}: {}'.format(address, geocode_result['status']))
logger.debug('Geocoded: {}: {}'.format(address, geocode_result['status']))
results.append(geocode_result)
geocoded = True
# Print status every 100 addresses
if len(results) % 100 == 0:
        logger.info('Completed {} of {} addresses'.format(len(results), len(addresses)))
# Every 500 addresses, save progress to file(in case of a failure so you have something!)
if len(results) % 500 == 0:
pd.DataFrame(results).to_csv('{}_bak'.format(output_filename))
logger.info('Finished geocoding all addresses')
pd.DataFrame(results).to_csv(output_filename, encoding='utf8')
<font color=blue>Note: Finished geocoding all addresses at this point.
geocoded_table = pd.read_csv('Table with all addresses check.csv')
#geocoded_table=pd.DataFrame(results)
geocoded_table = geocoded_table.rename(columns={'input_string': 'Address'})
len(geocoded_table)
<font color=blue>Note: <br>1. Check geocoded_table in this code, or the output table "Table with all addresses check.csv". We know 12 addresses can't be converted; they need to be deleted from the building data for the map display to function properly.<br>2. Next step: delete the addresses that can't be converted.
merged_table = pd.merge(ready_table,
geocoded_table[['Address','formatted_address','status']],
on='Address', how='right')
# Keep only rows whose address geocoded successfully
df2 = merged_table[merged_table.status == 'OK']
df2 = df2.drop_duplicates()
df2
# Save your output files in target_folder
target_folder = '/tmp'
target_path = target_folder + '/Ready Table with 490 rows.csv'
df2.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
df2.to_csv('Ready Table with 490 rows.csv', sep=',')