# CrossCompute
user_address = ""
search_radius_in_miles = 0.5
industry_select = """
Manufacturing
Manufacturing
Wholesale/Warehouse/Distribution
Commercial
Landlord
Public Benefit Corp
Other
"""
program_select = """
ICIP
ICIP
ICAP
City/State
IDA
Relocator
Tenant
"""
target_folder = '/tmp'
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
matplotlib.rcParams['figure.figsize'] = (10,10)
import subprocess
subprocess.call('pip install geopandas'.split())
subprocess.call('pip install dill'.split())
<font color=blue>Tasks: <br><br> 1. Feature selection: drop the columns users don't care about. <br> 2. Smooth out the time factor by calculating "Periodic Savings over Months" from the effective date (rounded to the month) through 2017-12-31.</font>
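<font color=blue>For concreteness, here is that savings-per-month calculation on one made-up row (the date and savings figure are hypothetical; the table-wide version follows below).</font>
# Worked example on a hypothetical row (values are made up, not from the dataset)
demo = pd.DataFrame({
    'Effective Date': pd.to_datetime(['06/15/2015'], format='%m/%d/%Y'),
    'Savings': [12000.0],
})
demo['Month Count'] = ((pd.to_datetime('2017-12-31') - demo['Effective Date'])
                       / np.timedelta64(1, 'M')).astype(int)
demo['Periodic Savings over Months'] = (demo['Savings'] / demo['Month Count']).round(2)
demo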
# Prepare building data
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv?accessType=DOWNLOAD'
building_table = pd.read_csv(url1, na_values = 'n/a')
# Rename features for better readability
building_table = building_table.rename(columns={'Company Type': 'Program',
'Savings from begining receiving benefits': 'Savings'})
# Feature selection
# Come back to add features later if necessary
building_table = building_table[['Company Name', 'Address','Industry','Program',
'Effective Date','Savings',
'Borough','Latitude','Longitude']]
building_table[:3]
len(building_table)
# Drop rows where Longitude or Latitude is null
building_table.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
len(building_table)
building_table.dtypes
from datetime import date
building_table['Effective Date'] = pd.to_datetime(building_table['Effective Date'].str.strip(), format='%m/%d/%Y')
building_table['Month Count'] = ((pd.to_datetime('2017-12-31') - building_table['Effective Date']) / np.timedelta64(1, 'M'))
building_table['Month Count'] = building_table['Month Count'].astype(int)
# Periodic Savings over months starting from effective date (round to month) to 2017-12-31
building_table['Periodic Savings over Months'] = (building_table['Savings'] / building_table['Month Count']).apply(lambda x: round(x, 2))
# Drop the raw Savings column (assigned back so the drop persists)
building_table = building_table.drop(['Savings'  # ,'Effective Date','Month Count'
                                      ], axis=1)
building_table.head()
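<font color=blue>One edge case to guard against (our addition, not in the original flow): an Effective Date in December 2017 or later gives a Month Count of zero or less, and the division above then produces inf or a negative value.</font>
# Defensive check (assumption about the data, not part of the original notebook):
# drop rows with a non-positive Month Count to avoid inf or negative savings rates
bad_rows = building_table['Month Count'] <= 0
print(bad_rows.sum(), 'rows with non-positive Month Count')
building_table = building_table[~bad_rows]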
<font color=blue>Tasks: <br><br>1. Feature selection: keep tree_id, status, Latitude, and Longitude so we can later count the trees around each location in the building table.<br> 2. Define a function to speed up loading of this large dataset.</font>
# Load function
def load(
endpoint_url,
selected_columns=None,
buffer_size=1000,
search_term_by_column=None,
**kw,
):
buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
if selected_columns:
select_string = ','.join(selected_columns)
buffer_url += f'&$select={select_string}'
for column, search_term in (search_term_by_column or {}).items():
buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
print(buffer_url)
tables = []
if endpoint_url.endswith('.json'):
f = pd.read_json
else:
f = pd.read_csv
t = f(buffer_url, **kw)
while len(t):
print(len(tables) * buffer_size + len(t))
tables.append(t)
offset = buffer_size * len(tables)
t = f(buffer_url + f'&$offset={offset}', **kw)
return pd.concat(tables, sort=False)
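<font color=blue>The search_term_by_column argument is not exercised below. As a sketch only, a call using it against the same street-tree endpoint might look like this; whether the generated SoQL filter is accepted depends on the endpoint.</font>
# Sketch: search_term_by_column adds a SoQL "like" filter to the request,
# here keeping only rows whose status contains "Alive"
alive_trees = load(
    'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv',
    selected_columns=('tree_id', 'status'),
    buffer_size=50000,
    search_term_by_column={'status': 'Alive'},
)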
%%time
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'status', 'Latitude', 'Longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
tree_table.head()
tree_table.dropna(subset=['Longitude', 'Latitude'], inplace=True)
# Keep only living trees (filter out stumps and dead trees), then drop status
tree_table = tree_table[tree_table['status'] == 'Alive']
tree_table = tree_table.drop(['status'], axis=1)
tree_table.head()
# Make kdtree
from pysal.lib.cg import KDTree, RADIUS_EARTH_MILES
tree_xys = tree_table[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
radius_in_miles = 0.5
def get_tree_count(r):
    # Count all trees within radius_in_miles of this building; KDTree.query
    # pads missing neighbors with index == tree_count, so keep only valid indices
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[indices < tree_count]
    return len(indices)
building_table['Total Tree Count within 0.5 Mile'] = building_table.apply(get_tree_count, axis=1)
building_table[:3]
# Save your output files in target_folder
target_folder = '/tmp'
target_path = target_folder + '/Table with tree count.csv'
building_table.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
building_table.to_csv('Table with tree count.csv', sep=',')
<font color=blue>Note:<br><br>1. Tree count within 0.5 miles is attached as a new column to the building table at this point.<br> 2. Ready table is saved to file 'Table with tree count.csv' for later analysis</font><br><br>
import pandas as pd
import numpy as np
building_table.head()
<font color=blue>We tried filtering by the user-selected industry and program.</font>
# Filter table by user selected program and industry
#building_table = building_table.loc[building_table['Industry'] == industry_select].loc[building_table['Program'] == program_select]
<font color=blue>We decided to drop the industry and program filters because not enough data was left with them on.<br>With the filters applied, a later step raised: ValueError: not enough values to unpack (expected 2, got 1).</font>
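<font color=blue>A quick diagnostic (not part of the original pipeline) makes the sparsity visible: count the rows available for each industry and program combination.</font>
# Diagnostic sketch: row counts per (Industry, Program) pair; small counts
# explain why filtering on both left too little data to work with
building_table.groupby(['Industry', 'Program']).size().sort_values(ascending=False)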
# Make kdtree
radius_in_miles = 0.5
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES
sav_xys = building_table[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
def get_sav_average(r):
    # Average the periodic savings of all buildings (including this one)
    # within radius_in_miles; KDTree.query pads missing neighbors with
    # index == sav_count, so drop those sentinel indices first
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[indices < sav_count]
    indices = [int(x) for x in indices]
    selected_sav_table = building_table.loc[building_table.index[indices]]
    return selected_sav_table['Periodic Savings over Months'].mean()
building_table['Periodic Savings within 0.5 Mile'] = building_table.apply(get_sav_average, axis=1)
building_table[:3]
# Save your output files in target_folder
target_path = target_folder + '/Table with periodic savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
building_table.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
building_table.to_csv('Table with periodic savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv', sep=',')
# Copy the building table, which now carries the tree count and periodic savings columns
ready_table = building_table.copy()
#ready_table.drop(['Unnamed: 0'], axis=1)
ready_geotable = ready_table.copy()  # Prevent SettingWithCopyWarning
# Set radius for each point
# The bigger the circle, the more the company saved
ready_geotable['RadiusInPixelsRange5-50'] = ready_geotable['Periodic Savings over Months']
# Set color for each point using a gradient
# The darker the color, the more trees the company is surrounded by
ready_geotable['FillRedsFromMean'] = ready_geotable['Total Tree Count within 0.5 Mile']
# See what we did
ready_geotable[:3]
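<font color=blue>The styled geotable is only previewed above. Assuming the platform renders path variables with a geotable suffix as maps (mirroring the table-path prints used elsewhere in this notebook), it could be saved and exposed like this; the suffix itself is an assumption.</font>
# Assumed convention, mirroring the *_table_path prints above: save the styled
# table and print a *_geotable_path variable so it can be rendered as a map
target_path = target_folder + '/Map of savings and tree count.csv'
ready_geotable.to_csv(target_path, index=False)
print('savings_geotable_path = %s' % target_path)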
Hypothesis:<br>
Discovery:<br>
#url1 = 'https://raw.githubusercontent.com/wzmemo/NYC_Open_Data_Business_Savings/master/prediction_table.csv'
prediction_table = ready_table.copy()
prediction_table[:3]
prediction_table = prediction_table[['Total Tree Count within 0.5 Mile',
'Periodic Savings within 0.5 Mile', 'Periodic Savings over Months']]
prediction_table.head()
prediction_table.plot(kind='box', subplots=True, layout=(1, 3), sharex=False, sharey=False)
plt.show()  # what's that outlier in our savings?
X1 = prediction_table[['Periodic Savings within 0.5 Mile','Total Tree Count within 0.5 Mile']].values
Y1 = prediction_table['Periodic Savings over Months']
model1 = LinearRegression()
model1.fit(X1, Y1)
cross_val_score(model1, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()
model2 = BayesianRidge()
model2.fit(X1, Y1)
cross_val_score(model2, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()
model3 = SVR()
model3.fit(X1, Y1)
cross_val_score(model3, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()
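<font color=blue>For a side-by-side view, the same scoring can be run over all three models in one loop (a convenience sketch; cross_val_score refits clones internally, so it should reproduce the individual scores above).</font>
# Convenience sketch: compare all three models with the same cross-validation
for name, m in [('LinearRegression', model1),
                ('BayesianRidge', model2),
                ('SVR', model3)]:
    score = cross_val_score(m, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()
    print(f'{name}: mean MAE = {-score:.2f}')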
import pickle
pickle.dump(model2, open('/tmp/model.pkl', 'wb'))
# Load model
from pickle import load
model = load(open('/tmp/model.pkl', 'rb'))  # load the trained BayesianRidge model back from disk
model
url2 = 'https://raw.githubusercontent.com/wzmemo/NYC_Open_Data_Business_Savings/master/Testing_Data.csv'
testing_table = pd.read_csv(url2)
testing_table = testing_table.drop(['Unnamed: 0'], axis = 1)
# Use the same feature order as in training (savings first, then tree count)
X = testing_table[['Periodic Savings within 0.5 Mile', 'Total Tree Count within 0.5 Mile']].values
y = model.predict(X)
y
testing_table['Periodic Savings over Months'] = y
testing_table
# Save file to target folder to include it in the result download
target_path = target_folder + '/a.csv'
prediction_table.to_csv(target_path, index=False)
print(f'a_table_path = {target_path}') # Print table_path to render table
# A step further:
prediction_table.plot(kind='box', subplots=True, layout=(1, 3), sharex=False, sharey=False)
plt.show()  # what are the outliers in our savings?