# CrossCompute
# CSV that is read into ready_table further down.
ready_table_path = (
    'Table with periodic savings(within 0.5 Mile) '
    'and tree count(within 0.5 Mile).csv'
)
# user_address = ""
# Folder that the result files written by this script are placed in.
target_folder = '/tmp'
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
matplotlib.rcParams['figure.figsize'] = (10,10)
import subprocess
import sys

# Install the extra packages with the interpreter that is running this script
# (`python -m pip`), so they land in the environment the imports below use.
# A bare `pip` on PATH may belong to a different Python installation.
# Each package is installed separately so one failure does not block the rest.
for _package in ('geopandas', 'dill', 'geopy', 'folium'):
    _status = subprocess.call([sys.executable, '-m', 'pip', 'install', _package])
    if _status != 0:
        # Stay best-effort (as before) but make the failure visible; stderr is
        # used so the message does not mix with the result lines on stdout.
        print(f'warning: pip install {_package} exited with status {_status}',
              file=sys.stderr)
from geopy import GoogleV3
import folium
# Read the prepared table from the CSV named by ready_table_path
ready_table = pd.read_csv(ready_table_path)
ready_table.head(3)
# Discard the first column, looked up by position (presumably a saved index
# column -- confirm against the CSV)
_first_column = ready_table.columns[0]
ready_table = ready_table.drop(columns=_first_column)
ready_table.head(3)
# Separate copy for the map columns added later; per the original note this
# is meant to prevent SettingWithCopyWarning
ready_geotable = ready_table.copy()
ready_geotable.head(3)
# NOTE: an earlier experiment one-hot encoded the 'Industry' and 'Program'
# columns into `ind_progm_table` (via pd.get_dummies). That experiment is
# disabled, so `ind_progm_table` does not exist. The half-written assignment
# that indexed 'Industry_Commercial' and the `ind_progm_table[:5]` preview
# were removed: the unclosed bracket made the whole file a SyntaxError, and
# the radius column is assigned below anyway.

# Set radius for each point
# The bigger the circle, the more the company saved
ready_geotable['RadiusInPixelsRange5-50'] = ready_geotable['Periodic Savings over Months']
# Set color for each point using a gradient
# The darker the color, the more trees the company is surrounded by
ready_geotable['FillBluesFromMean'] = ready_geotable['Total Tree Count within 0.5 Mile']
# See what we did
ready_geotable[:3]
# Hypothesis:
# Discovery:
# Write the geotable into the target folder so it is part of the result download
target_path = f'{target_folder}/b.csv'
ready_geotable.to_csv(target_path, index=False)
# Printing the "b_geotable_path = ..." line is what gets the map rendered
print('b_geotable_path = ' + target_path)
#import data analysis package
import seaborn
import scipy
import matplotlib.pyplot as plt
# scipy.stats is a submodule: import it explicitly instead of relying on
# `import scipy` (or another package) having loaded it as a side effect.
import scipy.stats

# Remove, in place, every row whose periodic savings exceed 20000
# (treated as outliers for this analysis).
ready_table.drop(ready_table[ready_table['Periodic Savings over Months'] > 20000].index, inplace=True)
# Display floats in fixed-point notation (original note: avoids run time errors)
pd.set_option('display.float_format', lambda x:'%f'%x)
# Plot tree count against periodic savings as a scatterplot with a fitted
# regression line
scat1 = seaborn.regplot(x='Total Tree Count within 0.5 Mile', y='Periodic Savings over Months', data=ready_table)
plt.xlabel('Total Tree Count within 0.5 Mile')
plt.ylabel('Periodic Savings over Months')
plt.title('Scatterplot for the Association Between Tree Count and Periodic Savings')
# Get the correlation coefficient
print('Association Between Tree Count and Periodic Savings')
print(scipy.stats.pearsonr(ready_table['Total Tree Count within 0.5 Mile'], ready_table['Periodic Savings over Months']))
# Save file to target folder to include it in the result download
target_path = target_folder + '/c.png'
figure = scat1.get_figure()
figure.savefig(target_path)
print(f'c_image_path = {target_path}')
# Print the heading, then whatever scipy.stats.pearsonr reports for the
# tree-count and savings columns of ready_table
_tree_counts = ready_table['Total Tree Count within 0.5 Mile']
_savings = ready_table['Periodic Savings over Months']
print('Association Between Tree Count and Periodic Savings')
print(scipy.stats.pearsonr(_tree_counts, _savings))
# Work on a separate copy of ready_table for the modelling steps
prediction_table = ready_table.copy()
prediction_table.head(3)
# Keep only the three columns used for modelling
_model_columns = [
    'Total Tree Count within 0.5 Mile',
    'Periodic Savings within 0.5 Mile',
    'Periodic Savings over Months',
]
prediction_table = prediction_table[_model_columns]
prediction_table.head()
# One box plot per column, side by side in a single row, with unshared axes
prediction_table.plot(kind='box', subplots=True, layout=(1, 3), sharex=False, sharey=False)
plt.show()  # what's that outlier in our savings?
# Feature matrix (savings within 0.5 mile, then tree count) and target vector
_feature_columns = ['Periodic Savings within 0.5 Mile', 'Total Tree Count within 0.5 Mile']
X1 = prediction_table[_feature_columns].values
Y1 = prediction_table['Periodic Savings over Months']
# Three candidate regressors, each fitted on the full data and then scored
# with 3-fold cross-validation on negative mean absolute error
model1, model2, model3 = LinearRegression(), BayesianRidge(), SVR()
for _estimator in (model1, model2, model3):
    _estimator.fit(X1, Y1)
    # The mean score is computed but not stored or printed
    cross_val_score(_estimator, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()
import pickle
# Persist model2. The `with` block closes (and therefore flushes) the file
# before it is read back below, instead of leaving an unclosed handle.
with open('/tmp/model.pkl', 'wb') as _model_file:
    pickle.dump(model2, _model_file)
# Load model
from pickle import load
# NOTE: only unpickle files this script wrote itself; unpickling untrusted
# data can execute arbitrary code.
with open('/tmp/model.pkl', 'rb') as _model_file:
    model = load(_model_file) # !!! Replace dummy model with your model
model
# Download the testing data and drop its 'Unnamed: 0' column
url2 = 'https://raw.githubusercontent.com/wzmemo/NYC_Open_Data_Business_Savings/master/Testing_Data.csv'
testing_table = pd.read_csv(url2)
testing_table = testing_table.drop(['Unnamed: 0'], axis = 1)
# The feature columns must be in the same order the model was fitted with
# (X1: savings within 0.5 mile first, then tree count). `.values` strips the
# column names, so a swapped order would not raise an error -- it would just
# feed each coefficient the wrong feature.
X = testing_table[['Periodic Savings within 0.5 Mile', 'Total Tree Count within 0.5 Mile']].values
y = model.predict(X)
y
# Store the predictions as the savings column of the testing table
testing_table['Periodic Savings over Months'] = y
testing_table
# Write the predictions into the target folder so they are part of the result download
target_path = f'{target_folder}/a.csv'
testing_table.to_csv(target_path, index=False)
# Printing the "a_table_path = ..." line is what gets the table rendered
print('a_table_path = ' + target_path)
# A step further:
# Draw the per-column box plots of prediction_table again
_box_plot_options = dict(kind='box', subplots=True, layout=(1, 3), sharex=False, sharey=False)
prediction_table.plot(**_box_plot_options)
plt.show()  # which points are the outliers in our savings?
# YOUR INTERPRETATION OF THE RESULTS
# {Participants statistics : Participants statistics ? YOUR TABLE DESCRIPTION}
# {Participants on map : Participants on map ? YOUR MAP DESCRIPTION}
# {Association Between Tree Count and Periodic Savings : Association Between Tree Count and Periodic Savings ? YOUR PLOT DESCRIPTION}