In [2]:

# CrossCompute
ready_table_path = 'Table with periodic savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
#user_address = ""
target_folder = '/tmp'

In [3]:

import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
matplotlib.rcParams['figure.figsize'] = (10,10)

import subprocess
subprocess.call('pip install geopandas'.split())
subprocess.call('pip install dill'.split())

Out[3]:

In [4]:

subprocess.call('pip install geopy'.split())
subprocess.call('pip install folium'.split())

from geopy import GoogleV3
import folium

Render Map¶

In [5]:

ready_table = pd.read_csv(ready_table_path)  # Prevent SettingwithCopyWarning
ready_table[:3]

Out[5]:

	Unnamed: 0	Company Name	Address	Industry	Program	Effective Date	Savings	Borough	Latitude	Longitude	Month Count	Periodic Savings over Months	Total Tree Count within 0.5 Mile	Periodic Savings within 0.5 Mile
0	0	139 ACA Realty, Inc.	43-23 35th Street	Commercial	ICIP	2008-04-07	123975.24	QUEENS	40.745706	-73.929565	116	1068.75	1331	1419.727333
1	1	141 Lake Avenue Realty c/o JR Produce, Inc.	141 Lake Avenue	Wholesale/Warehouse/Distribution	ICIP	2009-12-08	47512.89	STATEN IS	40.633153	-74.150999	96	494.93	1390	336.525000
2	2	14-10 123rd Street LLC	14-10 123rd Street	Commercial	ICIP	2011-03-04	21322.89	QUEENS	40.785144	-73.844833	81	263.25	2195	1079.380000

In [6]:

ready_table=ready_table.drop(ready_table.columns[0], axis=1) 
ready_table[:3]

Out[6]:

	Company Name	Address	Industry	Program	Effective Date	Savings	Borough	Latitude	Longitude	Month Count	Periodic Savings over Months	Total Tree Count within 0.5 Mile	Periodic Savings within 0.5 Mile
0	139 ACA Realty, Inc.	43-23 35th Street	Commercial	ICIP	2008-04-07	123975.24	QUEENS	40.745706	-73.929565	116	1068.75	1331	1419.727333
1	141 Lake Avenue Realty c/o JR Produce, Inc.	141 Lake Avenue	Wholesale/Warehouse/Distribution	ICIP	2009-12-08	47512.89	STATEN IS	40.633153	-74.150999	96	494.93	1390	336.525000
2	14-10 123rd Street LLC	14-10 123rd Street	Commercial	ICIP	2011-03-04	21322.89	QUEENS	40.785144	-73.844833	81	263.25	2195	1079.380000

In [7]:

ready_geotable = ready_table.copy()
ready_geotable[:3]

Out[7]:

	Company Name	Address	Industry	Program	Effective Date	Savings	Borough	Latitude	Longitude	Month Count	Periodic Savings over Months	Total Tree Count within 0.5 Mile	Periodic Savings within 0.5 Mile
0	139 ACA Realty, Inc.	43-23 35th Street	Commercial	ICIP	2008-04-07	123975.24	QUEENS	40.745706	-73.929565	116	1068.75	1331	1419.727333
1	141 Lake Avenue Realty c/o JR Produce, Inc.	141 Lake Avenue	Wholesale/Warehouse/Distribution	ICIP	2009-12-08	47512.89	STATEN IS	40.633153	-74.150999	96	494.93	1390	336.525000
2	14-10 123rd Street LLC	14-10 123rd Street	Commercial	ICIP	2011-03-04	21322.89	QUEENS	40.785144	-73.844833	81	263.25	2195	1079.380000

We tried color coded industry and program on map but it didn't work¶

In [8]:

# ind_progm_table=pd.get_dummies(ready_geotable,columns=['Industry'])
# ind_progm_table=pd.get_dummies(ind_progm_table,columns=['Program'])
# ind_progm_table
ready_geotable['RadiusInPixelsRange5-50'] = ready_geotable['Industry_Commercial'	,
# 'Industry_Landlord'	,
# 'Industry_Manufacturing',	
# 'Industry_Other',	
# 'Industry_Public Benefit Corp',	
# 'Industry_Wholesale/Warehouse/Distribution']

  File "<ipython-input-8-2223b67b816c>", line 9
    # 'Industry_Wholesale/Warehouse/Distribution']
                                                  ^
SyntaxError: unexpected EOF while parsing

In [ ]:

ind_progm_table[:5]

In [ ]:

# Set radius for each point
# The bigger the circle, the more the company saved
ready_geotable['RadiusInPixelsRange5-50'] = ready_geotable['Periodic Savings over Months']

In [ ]:

# Set color for each point using a gradient
# The darker the color, the more trees the company is surrounded by
ready_geotable['FillBluesFromMean'] = ready_geotable['Total Tree Count within 0.5 Mile']

In [ ]:

# See what we did
ready_geotable[:3]

Hypothesis:<br>

Bigger circle tend to be darker<br>
Also, companies who saved the more money tend to be surrounded by more trees.<br>

Discovery:<br>

Biggest circles are found in Bronx and STATEN IS<br>
Bigger circles in Bronx are indeed surrounded by more trees<br> <font color=red>(ex.Albert Einstein College of Medicine of Yeshiva Uni, saved 102919.54 dollars, surrounded by 419 trees; Montefiore Medical Center, saved 11400.63 in ICAP and 69506.82 in ICIP, surrounded by 303 trees) <br> <font color=black>- While the biggest circle of all is found in Stanley IS but the color is near transparent(means very few trees around) <br> <font color=red>(ex.VISY PAPER RECYCLING saved 274038.51, 41 trees)

In [ ]:

# Save file to target folder to include it in the result download
target_path = target_folder + '/b.csv'
ready_geotable.to_csv(target_path, index=False)
print(f'b_geotable_path = {target_path}')  # Print geotable_path to render map

Render Plot¶

In [ ]:

#import data analysis package

import seaborn
import scipy
import matplotlib.pyplot as plt

ready_table.drop(ready_table[ready_table['Periodic Savings over Months'] > 20000].index, inplace=True)
    
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x:'%f'%x)

#plot alcohol consumption and suicide rate as a scatterplot
scat1 = seaborn.regplot(x='Total Tree Count within 0.5 Mile', y='Periodic Savings over Months', data=ready_table)
plt.xlabel('Total Tree Count within 0.5 Mile')
plt.ylabel('Periodic Savings over Months')
plt.title('Scatterplot for the Association Between Tree Count and Periodic Savings')

In [ ]:

#get the correlation coefficient
print('Association Between Tree Count and Periodic Savings')
print(scipy.stats.pearsonr(ready_table['Total Tree Count within 0.5 Mile'], ready_table['Periodic Savings over Months']))

In [ ]:

# Save file to target folder to include it in the result download
target_path = target_folder + '/c.png'
figure = scat1.get_figure()
figure.savefig(target_path)
print(f'c_image_path = {target_path}')
print('Association Between Tree Count and Periodic Savings')
print(scipy.stats.pearsonr(ready_table['Total Tree Count within 0.5 Mile'], 
                           ready_table['Periodic Savings over Months']))

Training Model¶

In [1]:

prediction_table = ready_table.copy()
prediction_table[:3]

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-6e7f80355573> in <module>
----> 1 prediction_table = ready_table.copy()
      2 prediction_table[:3]

NameError: name 'ready_table' is not defined

In [15]:

prediction_table = prediction_table[['Total Tree Count within 0.5 Mile',
                                     'Periodic Savings within 0.5 Mile', 'Periodic Savings over Months']]
prediction_table.head()

Out[15]:

	Total Tree Count within 0.5 Mile	Periodic Savings within 0.5 Mile	Periodic Savings over Months
0	1331	1419.727333	1068.750000
1	1390	336.525000	494.930000
2	2195	1079.380000	263.250000
3	1254	2846.165714	4200.660000
4	1136	1517.730217	2016.420000

In [16]:

prediction_table.plot(kind = 'box', subplots = True, layout = (1,3), sharex = False, sharey = False)
plt.show() #what's that outlier in our savings?

In [17]:

X1 = prediction_table[['Periodic Savings within 0.5 Mile','Total Tree Count within 0.5 Mile']].values
Y1 = prediction_table['Periodic Savings over Months']

In [18]:

model1 = LinearRegression()
model1.fit(X1, Y1)
cross_val_score(model1, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()

Out[18]:

-1680.9851844200355

In [19]:

model2 = BayesianRidge()
model2.fit(X1, Y1)
cross_val_score(model2, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()

Out[19]:

-1679.2847550478734

In [20]:

model3 = SVR()
model3.fit(X1, Y1)
cross_val_score(model3, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()

/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)

Out[20]:

-1494.6005428121223

In [21]:

import pickle
pickle.dump(model2, open('/tmp/model.pkl', 'wb'))

In [22]:

# Load model
from pickle import load
model = load(open('/tmp/model.pkl', 'rb'))  # !!! Replace dummy model with your model
model

Out[22]:

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)

In [23]:

url2 = 'https://raw.githubusercontent.com/wzmemo/NYC_Open_Data_Business_Savings/master/Testing_Data.csv'
testing_table = pd.read_csv(url2)

In [24]:

testing_table = testing_table.drop(['Unnamed: 0'], axis = 1)

In [25]:

X = testing_table[['Total Tree Count within 0.5 Mile', 'Periodic Savings within 0.5 Mile']].values
y = model.predict(X)
y

Out[25]:

array([1373.78583903, 1329.47751343, 1352.70225448])

In [26]:

testing_table['Periodic Savings over Months'] = y
testing_table

Out[26]:

	Address	Total Tree Count within 0.5 Mile	Periodic Savings within 0.5 Mile	Periodic Savings over Months
0	43-23 35th Street	683	1423.931818	1373.785839
1	141 Lake Avenue	21	336.525000	1329.477513
2	14-10 123rd Street	447	1079.380000	1352.702254

Render Table¶

In [27]:

# Save file to target folder to include it in the result download
target_path = target_folder + '/a.csv'
testing_table.to_csv(target_path, index=False)
print(f'a_table_path = {target_path}')  # Print table_path to render table

a_table_path = /tmp/a.csv

In [28]:

# A step further: 
prediction_table.plot(kind = 'box', subplots = True, layout = (1,3), sharex = False, sharey = False)
plt.show() #what's are outliers in our savings?

Check statistics from Participants in ECSP¶

YOUR INTERPRETATION OF THE RESULTS

{Participants statistics : Participants statistics ? YOUR TABLE DESCRIPTION}

{Participants on map : Participants on map ? YOUR MAP DESCRIPTION}

{Association Between Tree Count and Periodic Savings : Association Between Tree Count and Periodic Savings ? YOUR PLOT DESCRIPTION}

In [ ]:

Pay Notebook Creator: Haige Cui	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0