ECSP




Pay Notebook Creator: Haige Cui0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0

Atlas ESCP

In [1]:
# CrossCompute
user_address = ""
search_radius_in_miles = 0.5
industry_select = """
    Manufacturing

    Manufacturing
    Wholesale/Warehouse/Distribution
    Commercial
    Landlord
    Public Benefit Corp
    Other
"""
program_select = """
    ICIP
    
    ICIP
    ICAP
    City/State
    IDA
    Relocator
    Tenant
"""
target_folder = '/tmp'
In [2]:
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
matplotlib.rcParams['figure.figsize'] = (10,10)

import subprocess
subprocess.call('pip install geopandas'.split())
subprocess.call('pip install dill'.split())
Out[2]:
0

Prepare building dataset

<font color=blue>Tasks: <br><br> 1. Feature selection: Drop information users don't care about. <br> 2. Smooth out time factor by calculating "Periodic Savings over months" starting from effective date (round to month) to 2017-12-31

In [3]:
# Prepare builing data
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv?accessType=DOWNLOAD'
building_table = pd.read_csv(url1, na_values = 'n/a')
# Rename features for better readability
building_table = building_table.rename(columns={'Company Type': 'Program', 
                                                'Savings from begining receiving benefits': 'Savings'})
# Feature selection
# Come back to add features later if necessary
building_table = building_table[['Company Name', 'Address','Industry','Program', 
                                 'Effective Date','Savings',
                                 'Borough','Latitude','Longitude']]
building_table[:3]
Out[3]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Address Industry Program Effective Date Savings Borough Latitude Longitude
0 139 ACA Realty, Inc. 43-23 35th Street Commercial ICIP 04/07/2008 123975.24 QUEENS 40.745706 -73.929565
1 141 Lake Avenue Realty c/o JR Produce, Inc. 141 Lake Avenue Wholesale/Warehouse/Distribution ICIP 12/08/2009 47512.89 STATEN IS 40.633153 -74.150999
2 14-10 123rd Street LLC 14-10 123rd Street Commercial ICIP 03/04/2011 21322.89 QUEENS 40.785144 -73.844833
In [4]:
len(building_table)
Out[4]:
568
In [5]:
# Drop rows when lonlat is NULL
building_table.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
len(building_table)
Out[5]:
516
In [6]:
building_table.dtypes
Out[6]:
Company Name       object
Address            object
Industry           object
Program            object
Effective Date     object
Savings           float64
Borough            object
Latitude          float64
Longitude         float64
dtype: object
In [7]:
from datetime import date
building_table['Effective Date'] = pd.to_datetime(building_table['Effective Date'].str.strip(), format='%m/%d/%Y')
building_table['Month Count'] = ((pd.to_datetime('2017-12-31') - building_table['Effective Date']) / np.timedelta64(1, 'M'))
building_table['Month Count'] = building_table['Month Count'].astype(int)  

# Periodic Savings over months starting from effective date (round to month) to 2017-12-31
building_table['Periodic Savings over Months'] = (building_table['Savings'] / building_table['Month Count']).apply(lambda x: round(x, 2))

building_table.drop(['Savings' #,'Effective Date','Month Count' 
                         ], axis=1) 
building_table.head()
Out[7]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Address Industry Program Effective Date Savings Borough Latitude Longitude Month Count Periodic Savings over Months
0 139 ACA Realty, Inc. 43-23 35th Street Commercial ICIP 2008-04-07 123975.24 QUEENS 40.745706 -73.929565 116 1068.75
1 141 Lake Avenue Realty c/o JR Produce, Inc. 141 Lake Avenue Wholesale/Warehouse/Distribution ICIP 2009-12-08 47512.89 STATEN IS 40.633153 -74.150999 96 494.93
2 14-10 123rd Street LLC 14-10 123rd Street Commercial ICIP 2011-03-04 21322.89 QUEENS 40.785144 -73.844833 81 263.25
3 183 Lorriane Street LLC 183 Lorraine Street Wholesale/Warehouse/Distribution ICIP 2015-11-06 105016.49 BROOKLYN 40.673106 -74.002300 25 4200.66
4 21st Century Optics, Inc. 47-00 33rd Street Manufacturing Tenant 2009-01-07 215757.20 QUEENS 40.742386 -73.932148 107 2016.42

Now load the tree dataset

<font color=blue>Tasks: <br><br>1. Feature selection: Longitude, Latitude, status, tree_id for later tree count around each location from building table<br> 2. Define a function to expedite data processing for large dataset

In [8]:
# Load function
def load(
    endpoint_url,
    selected_columns=None,
    buffer_size=1000,
    search_term_by_column=None,
    **kw,
):
    buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    for column, search_term in (search_term_by_column or {}).items():
        buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
    print(buffer_url)
    tables = []
    
    if endpoint_url.endswith('.json'):
        f = pd.read_json
    else:
        f = pd.read_csv

    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, sort=False)
In [9]:
%%time
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'status', 'Latitude', 'Longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
https://data.cityofnewyork.us/resource/nwxe-4ae8.csv?$limit=100000&$select=tree_id,status,Latitude,Longitude
100000
200000
300000
400000
500000
600000
683788
CPU times: user 549 ms, sys: 94 ms, total: 643 ms
Wall time: 1.44 s
In [10]:
tree_table.head()
Out[10]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Latitude Longitude status tree_id
0 40.723092 -73.844215 Alive 180683
1 40.794111 -73.818679 Alive 200540
2 40.717581 -73.936608 Alive 204026
3 40.713537 -73.934456 Alive 204337
4 40.666778 -73.975979 Alive 189565
In [11]:
tree_table.dropna(subset=['Longitude', 'Latitude'], inplace=True)
tree_table[tree_table['status'] == 'Alive'].iloc[:3,] #filter out stump and dead trees
tree_table.drop(['status'], axis = 1).head()
Out[11]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Latitude Longitude tree_id
0 40.723092 -73.844215 180683
1 40.794111 -73.818679 200540
2 40.717581 -73.936608 204026
3 40.713537 -73.934456 204337
4 40.666778 -73.975979 189565

Get total tree count within 0.5 mile around each participant

In [12]:
# Make kdtree
from pysal.lib.cg import KDTree,RADIUS_EARTH_MILES

tree_xys = tree_table[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
radius_in_miles = 0.5


def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[indices < tree_count]
    return len(indices)
In [13]:
building_table['Total Tree Count within 0.5 Mile'] = building_table.apply(get_tree_count, axis=1)
building_table[:3]
Out[13]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Address Industry Program Effective Date Savings Borough Latitude Longitude Month Count Periodic Savings over Months Total Tree Count within 0.5 Mile
0 139 ACA Realty, Inc. 43-23 35th Street Commercial ICIP 2008-04-07 123975.24 QUEENS 40.745706 -73.929565 116 1068.75 1331
1 141 Lake Avenue Realty c/o JR Produce, Inc. 141 Lake Avenue Wholesale/Warehouse/Distribution ICIP 2009-12-08 47512.89 STATEN IS 40.633153 -74.150999 96 494.93 1390
2 14-10 123rd Street LLC 14-10 123rd Street Commercial ICIP 2011-03-04 21322.89 QUEENS 40.785144 -73.844833 81 263.25 2195
In [14]:
# Save your output files in target_folder
target_folder = '/tmp'
target_path = target_folder + '/Table with tree count.csv'
building_table.to_csv(target_path, index=False)

# Render the file as a table
print('final_table_path = %s' % target_path)
final_table_path = /tmp/Table with tree count.csv
In [15]:
building_table.to_csv('Table with tree count.csv', sep=',')

<font color=blue>Note:<br><br>1. Tree count within 0.5 miles is attached as a new column to the building table at this point.<br> 2. Ready table is saved to file 'Table with tree count.csv' for later analysis</font><br><br>

In [16]:
import pandas as pd
import numpy as np
building_table.head()
Out[16]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Address Industry Program Effective Date Savings Borough Latitude Longitude Month Count Periodic Savings over Months Total Tree Count within 0.5 Mile
0 139 ACA Realty, Inc. 43-23 35th Street Commercial ICIP 2008-04-07 123975.24 QUEENS 40.745706 -73.929565 116 1068.75 1331
1 141 Lake Avenue Realty c/o JR Produce, Inc. 141 Lake Avenue Wholesale/Warehouse/Distribution ICIP 2009-12-08 47512.89 STATEN IS 40.633153 -74.150999 96 494.93 1390
2 14-10 123rd Street LLC 14-10 123rd Street Commercial ICIP 2011-03-04 21322.89 QUEENS 40.785144 -73.844833 81 263.25 2195
3 183 Lorriane Street LLC 183 Lorraine Street Wholesale/Warehouse/Distribution ICIP 2015-11-06 105016.49 BROOKLYN 40.673106 -74.002300 25 4200.66 1254
4 21st Century Optics, Inc. 47-00 33rd Street Manufacturing Tenant 2009-01-07 215757.20 QUEENS 40.742386 -73.932148 107 2016.42 1136

Get average periodic savings of all participants, within 0.5 mile

<font color=blue>We tried count in user selected industry and program

In [17]:
# Filter table by user selected program and industry
#building_table = building_table.loc[building_table['Industry'] == industry_select].loc[building_table['Program'] == program_select]

<font color=blue>We decided to drop industry and program becuase not enough data was left with those filters on.<br>ValueError: not enough values to unpack (expected 2, got 1).

In [18]:
# Make kdtree
radius_in_miles = 0.5

from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES
sav_xys = building_table[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
In [19]:
def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[indices < sav_count]
    indices = [int(x) for x in indices]
    selected_sav_table = building_table.loc[building_table.index[indices]]
    return selected_sav_table['Periodic Savings over Months'].mean()
In [20]:
building_table['Periodic Savings within 0.5 Mile'] = building_table.apply(get_sav_average, axis=1)
building_table[:3]
Out[20]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Address Industry Program Effective Date Savings Borough Latitude Longitude Month Count Periodic Savings over Months Total Tree Count within 0.5 Mile Periodic Savings within 0.5 Mile
0 139 ACA Realty, Inc. 43-23 35th Street Commercial ICIP 2008-04-07 123975.24 QUEENS 40.745706 -73.929565 116 1068.75 1331 1419.727333
1 141 Lake Avenue Realty c/o JR Produce, Inc. 141 Lake Avenue Wholesale/Warehouse/Distribution ICIP 2009-12-08 47512.89 STATEN IS 40.633153 -74.150999 96 494.93 1390 336.525000
2 14-10 123rd Street LLC 14-10 123rd Street Commercial ICIP 2011-03-04 21322.89 QUEENS 40.785144 -73.844833 81 263.25 2195 1079.380000
In [22]:
# Save your output files in target_folder
target_path = target_folder + '/Table with periodic savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
building_table.to_csv(target_path, index=False)

# Render the file as a table
print('final_table_path = %s' % target_path)
final_table_path = /tmp/Table with periodic savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv
In [23]:
building_table.to_csv('Table with periodic savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv', sep=',')

Render Map

In [26]:
# rename building table with tree count and periodic savings around
ready_table = building_table.copy()
#ready_table.drop(['Unnamed: 0'], axis=1)
In [27]:
ready_geotable = ready_table.copy()  # Prevent SettingwithCopyWarning
In [28]:
# Set radius for each point
# The bigger the circle, the more the company saved
ready_geotable['RadiusInPixelsRange5-50'] = ready_geotable['Periodic Savings over Months']
In [29]:
# Set color for each point using a gradient
# The darker the color, the more trees the company is surrounded by
ready_geotable['FillRedsFromMean'] = ready_geotable['Total Tree Count within 0.5 Mile']
In [30]:
# See what we did
ready_geotable[:3]
Out[30]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Address Industry Program Effective Date Savings Borough Latitude Longitude Month Count Periodic Savings over Months Total Tree Count within 0.5 Mile Periodic Savings within 0.5 Mile RadiusInPixelsRange5-50 FillRedsFromMean
0 139 ACA Realty, Inc. 43-23 35th Street Commercial ICIP 2008-04-07 123975.24 QUEENS 40.745706 -73.929565 116 1068.75 1331 1419.727333 1068.75 1331
1 141 Lake Avenue Realty c/o JR Produce, Inc. 141 Lake Avenue Wholesale/Warehouse/Distribution ICIP 2009-12-08 47512.89 STATEN IS 40.633153 -74.150999 96 494.93 1390 336.525000 494.93 1390
2 14-10 123rd Street LLC 14-10 123rd Street Commercial ICIP 2011-03-04 21322.89 QUEENS 40.785144 -73.844833 81 263.25 2195 1079.380000 263.25 2195

Hypothesis:<br>

  • Bigger circle tend to be darker<br>
  • Also, companies who saved the more money tend to be surrounded by more trees.<br>

Discovery:<br>

  • Biggest circles are found in Bronx and STATEN IS<br>
  • Bigger circles in Bronx are indeed surrounded by more trees<br> <font color=red>(ex.Albert Einstein College of Medicine of Yeshiva Uni, saved 102919.54 dollars, surrounded by 419 trees; Montefiore Medical Center, saved 11400.63 in ICAP and 69506.82 in ICIP, surrounded by 303 trees) <br> <font color=black>- While the biggest circle of all is found in Stanley IS but the color is near transparent(means very few trees around) <br> <font color=red>(ex.VISY PAPER RECYCLING saved 274038.51, 41 trees)

Training Model

In [34]:
#url1 = 'https://raw.githubusercontent.com/wzmemo/NYC_Open_Data_Business_Savings/master/prediction_table.csv'
prediction_table = ready_table.copy()
prediction_table[:3]
Out[34]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Address Industry Program Effective Date Savings Borough Latitude Longitude Month Count Periodic Savings over Months Total Tree Count within 0.5 Mile Periodic Savings within 0.5 Mile
0 139 ACA Realty, Inc. 43-23 35th Street Commercial ICIP 2008-04-07 123975.24 QUEENS 40.745706 -73.929565 116 1068.75 1331 1419.727333
1 141 Lake Avenue Realty c/o JR Produce, Inc. 141 Lake Avenue Wholesale/Warehouse/Distribution ICIP 2009-12-08 47512.89 STATEN IS 40.633153 -74.150999 96 494.93 1390 336.525000
2 14-10 123rd Street LLC 14-10 123rd Street Commercial ICIP 2011-03-04 21322.89 QUEENS 40.785144 -73.844833 81 263.25 2195 1079.380000
In [35]:
prediction_table = prediction_table[['Total Tree Count within 0.5 Mile',
                                     'Periodic Savings within 0.5 Mile', 'Periodic Savings over Months']]
prediction_table.head()
Out[35]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Total Tree Count within 0.5 Mile Periodic Savings within 0.5 Mile Periodic Savings over Months
0 1331 1419.727333 1068.75
1 1390 336.525000 494.93
2 2195 1079.380000 263.25
3 1254 2846.165714 4200.66
4 1136 1517.730217 2016.42
In [36]:
prediction_table.plot(kind = 'box', subplots = True, layout = (1,3), sharex = False, sharey = False)
plt.show() #what's that outlier in our savings? 
In [37]:
X1 = prediction_table[['Periodic Savings within 0.5 Mile','Total Tree Count within 0.5 Mile']].values
Y1 = prediction_table['Periodic Savings over Months']
In [38]:
model1 = LinearRegression()
model1.fit(X1, Y1)
cross_val_score(model1, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()
Out[38]:
-2416.049517035988
In [39]:
model2 = BayesianRidge()
model2.fit(X1, Y1)
cross_val_score(model2, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()
Out[39]:
-2410.992224823291
In [40]:
model3 = SVR()
model3.fit(X1, Y1)
cross_val_score(model3, X1, Y1, cv=3, scoring='neg_mean_absolute_error').mean()
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
Out[40]:
-2836.098137886751
In [41]:
import pickle
pickle.dump(model2, open('/tmp/model.pkl', 'wb'))
In [42]:
# Load model
from pickle import load
model = load(open('/tmp/model.pkl', 'rb'))  # !!! Replace dummy model with your model
model
Out[42]:
BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)
In [43]:
url2 = 'https://raw.githubusercontent.com/wzmemo/NYC_Open_Data_Business_Savings/master/Testing_Data.csv'
testing_table = pd.read_csv(url2)
In [44]:
testing_table = testing_table.drop(['Unnamed: 0'], axis = 1)
In [45]:
X = testing_table[['Total Tree Count within 0.5 Mile', 'Periodic Savings within 0.5 Mile']].values
y = model.predict(X)
y
Out[45]:
array([695.90254332, -30.41407771, 439.4542031 ])
In [46]:
testing_table['Periodic Savings over Months'] = y
testing_table
Out[46]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Address Total Tree Count within 0.5 Mile Periodic Savings within 0.5 Mile Periodic Savings over Months
0 43-23 35th Street 683 1423.931818 695.902543
1 141 Lake Avenue 21 336.525000 -30.414078
2 14-10 123rd Street 447 1079.380000 439.454203

Render Table

In [47]:
# Save file to target folder to include it in the result download
target_path = target_folder + '/a.csv'
prediction_table.to_csv(target_path, index=False)
print(f'a_table_path = {target_path}')  # Print table_path to render table
a_table_path = /tmp/a.csv
In [49]:
# A step further: 
prediction_table.plot(kind = 'box', subplots = True, layout = (1,3), sharex = False, sharey = False)
plt.show() #what's are outliers in our savings? 

This is the end of the workbook :)