import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
matplotlib.rcParams['figure.figsize'] = (10,10)
# Prepare building data
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv?accessType=DOWNLOAD'
building_table = pd.read_csv(url1, na_values = 'n/a')
building_table = building_table[['Company Name', 'Industry', 'Company Type', 'Address',
                                 'Borough', 'Latitude', 'Longitude',
                                 'Savings from begining receiving benefits']]
building_table = building_table.rename(columns={
    'Company Type': 'Program',
    # the source dataset misspells this column name, so match it verbatim
    'Savings from begining receiving benefits': 'Savings'})
building_table.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
building_table.shape
building_table = building_table[['Company Name','Industry','Program',
'Borough','Latitude','Longitude',
'Savings', 'Address']]
building_table.head()
building_table.iloc[0]
# Now load the tree dataset
# The tree dataset is much larger, so we page through the API with a helper function
def load(
        endpoint_url,
        selected_columns=None,
        buffer_size=1000,
        search_term_by_column=None,
        **kw,
):
    buffer_url = f'{endpoint_url}?$limit={buffer_size}'
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    # Socrata accepts only one $where parameter, so combine the filters with AND
    where_clauses = [
        f'{column}+like+"%25{search_term}%25"'
        for column, search_term in (search_term_by_column or {}).items()]
    if where_clauses:
        buffer_url += '&$where=' + '+AND+'.join(where_clauses)
    print(buffer_url)
    tables = []
    f = pd.read_json if endpoint_url.endswith('.json') else pd.read_csv
    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, sort=False)
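# A minimal sketch of the URLs the loader generates, using a hypothetical
# endpoint (demo_url and demo_buffer_size are illustrative names, not part of
# the pipeline above): each page keeps the same $limit and advances $offset
# until an empty page comes back, then the pages are concatenated.
demo_url = 'https://example.com/resource.csv'
demo_buffer_size = 1000
first_page = f'{demo_url}?$limit={demo_buffer_size}'
second_page = first_page + f'&$offset={demo_buffer_size}'
print(first_page)   # https://example.com/resource.csv?$limit=1000
print(second_page)  # https://example.com/resource.csv?$limit=1000&$offset=1000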
%%time
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'status', 'Latitude', 'Longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
tree_table.head()
tree_table.dropna(subset=['Longitude', 'Latitude'], inplace=True)
tree_table = tree_table[tree_table['status'] == 'Alive']  # keep only living trees (drop stumps and dead trees)
tree_table = tree_table.drop(['status'], axis=1)
tree_table.head()
# Make kdtree
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES
tree_xys = tree_table[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
radius_in_miles = 0.5
def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]  # drop placeholder entries for neighbors beyond the radius
    return len(indices)
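# A quick usage sketch (hedged): query the five nearest trees to a single
# hypothetical point (example_xy is illustrative, not from the dataset).
# Because the KDTree was built with radius=RADIUS_EARTH_MILES, the returned
# arc distances are in miles.
example_xy = (-73.94, 40.75)  # (longitude, latitude) near Long Island City
example_distances, example_indices = bin_tree.query(example_xy, k=5)
print(example_distances)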
building_table['Total Tree Count within 0.5 Mile'] = building_table.apply(get_tree_count, axis=1)
building_table[:3]
# Save your output files in target_folder
target_folder = '/tmp'
target_path = target_folder + '/Table with periodic savings and tree count.csv'
building_table.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
building_table.to_csv('Table with periodic savings and tree count.csv', index=False)
table_tree_count = pd.read_csv('Table with periodic savings and tree count.csv',na_values='n/a')
table_tree_count[:3]
sav_xys = table_tree_count[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_sav_table = table_tree_count.loc[table_tree_count.index[indices]]
    # 'Periodic Savings' comes from the 'Effective Date' preprocessing that is
    # commented out near the end of this notebook; run that step first
    return selected_sav_table['Periodic Savings'].mean()
table_tree_count['Periodic Savings within 0.5 Mile'] = table_tree_count.apply(get_sav_average, axis=1)
table_tree_count[:3]
# Save your output files in target_folder
target_path = target_folder + '/Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
table_tree_count.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
table_tree_count.to_csv('Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv', index=False)
# Algorithm
# Begin creating our model (check df.dtypes first to confirm the column types)
# Rearrange columns; df is assumed to be the building table after the
# 'Effective Date' preprocessing commented out near the end of this notebook
df = df[['Industry', 'Program', 'Postcode', 'Borough', 'Savings']]
#df.columns
# Split out a validation dataset
# Note: 'Savings' is continuous; for these classifiers it would need to be
# discretized (binned) first, or the models swapped for regressors
array = df.values
X = array[:, 0:4]  # Industry, Program, Postcode, Borough
Y = array[:, 4]    # Savings
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
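# The four feature columns are categorical strings, which scikit-learn
# estimators cannot consume directly. A minimal encoding sketch (an assumption
# about the intended preprocessing, not part of the original pipeline); fitting
# on all of X keeps validation-only categories from raising, at the cost of
# mild leakage:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder().fit(X)
X_train = encoder.transform(X_train)
X_validation = encoder.transform(X_validation)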
# Spot check algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# Evaluate each model in turn with 10-fold cross-validation
scoring = 'accuracy'
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
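# The metrics imported at the top (accuracy_score, confusion_matrix,
# classification_report) are never used above; a minimal sketch that closes
# the loop on the held-out split, assuming KNN is the model carried forward
# (the choice of KNN is an assumption, not fixed by the comparison above):
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))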
# Export model
# How is the tool made in CrossCompute?
# Paloma's Geopy and GoogleV3 code
from geopy import GoogleV3
geocode = GoogleV3('YOUR_GOOGLE_API_KEY').geocode  # supply your own key; never commit a live API key
def get_location(user_input):
    info = {}
    location = geocode(user_input)
    info['Longitude'] = location.longitude
    info['Latitude'] = location.latitude
    return info
print(get_location("28-10 jackson ave"))
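# geopy returns None when an address cannot be resolved, so a defensive
# variant (a sketch using the same geocoder as above; get_location_safe is a
# hypothetical helper, not part of the original code) avoids an
# AttributeError on location.longitude:
def get_location_safe(user_input):
    location = geocode(user_input)
    if location is None:
        return {}
    return {'Longitude': location.longitude, 'Latitude': location.latitude}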
# import sys
# 'geopandas' in sys.modules
# Convert WKT coordinates
# from geopandas import GeoDataFrame
# from shapely.geometry import Point
# geometry = [Point(xy) for xy in zip(building_table.Longitude, building_table.Latitude)]
# building_table = building_table.drop(['Longitude', 'Latitude'], axis=1)
# crs = {'init': 'epsg:4326'}
# gdf = GeoDataFrame(building_table, crs=crs, geometry=geometry)
# building_table.head()
# NOTE for Roy: CrossCompute doesn't have GeoPandas installed.
# from datetime import date
# df['Effective Date'] = pd.to_datetime(df['Effective Date'].str.strip(), format='%m/%d/%Y')
# df['Month Count'] = ((pd.to_datetime('2017-12-31') - df['Effective Date']) / np.timedelta64(1, 'M'))
# df['Month Count'] = df['Month Count'].astype(int)
# df['Periodic Savings'] = (df['Savings'] / df['Month Count']).apply(lambda x: round(x, 2))
# #monthly savings over period starting from effective date (round to month) to 2017-12-31
# df = df.drop(['Savings',
# #'Effective Date','Month Count'
# # user might care about
# ], axis=1)
df2 = tree_table[['tree_id', 'status',
                  #'address','zipcode','boroname',
                  'Latitude', 'longitude']]
df2 = df2.rename(columns={'longitude': 'Longitude'})
df2.dropna(subset=['Longitude', 'Latitude'], inplace=True)
df2 = df2[df2['status'] == 'Alive']  # keep only living trees (drop "stump" and "dead")
df2.shape
df2[:3]
# Make kdtree
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES
tree_xys = df2[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
radius_in_miles = 0.5
def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]  # drop placeholder entries for neighbors beyond the radius
    return len(indices)
df['Total Tree Count within 0.5 Mile'] = df.apply(get_tree_count, axis=1)
df[:3]
df.shape
# Save your output files in target_folder
target_path = target_folder + '/Table with periodic savings and tree count.csv'
df.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
df.to_csv('Table with periodic savings and tree count.csv', index=False)
table_tree_count = pd.read_csv('Table with periodic savings and tree count.csv',na_values='n/a')
table_tree_count[:3]
# table_tree_count.dtypes
sav_xys = table_tree_count[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_sav_table = table_tree_count.loc[table_tree_count.index[indices]]
    return selected_sav_table['Periodic Savings'].mean()
table_tree_count['Periodic Savings within 0.5 Mile'] = table_tree_count.apply(get_sav_average, axis=1)
table_tree_count[:3]
# Save your output files in target_folder
target_path = target_folder + '/Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
table_tree_count.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
table_tree_count.to_csv('Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv', index=False)