ECSP




Pay Notebook Creator: Haige Cui0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [1]:
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

matplotlib.rcParams['figure.figsize'] = (10,10)
In [3]:
# Prepare builing data
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv?accessType=DOWNLOAD'
building_table = pd.read_csv(url1, na_values = 'n/a')
building_table = building_table[['Company Name','Industry','Company Type', 'Address',
                     'Borough','Latitude','Longitude',
                     'Savings from begining receiving benefits']]

building_table = building_table.rename(columns={'Invenstment': 'Investment', 
                        'Company Type': 'Program', 
                        'Savings from begining receiving benefits': 'Savings'})

building_table.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
building_table.shape


building_table = building_table[['Company Name','Industry','Program',
                     'Borough','Latitude','Longitude',
                     'Savings', 'Address']]
building_table.head()
Out[3]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Industry Program Borough Latitude Longitude Savings Address
0 139 ACA Realty, Inc. Commercial ICIP QUEENS 40.745706 -73.929565 123975.24 43-23 35th Street
1 141 Lake Avenue Realty c/o JR Produce, Inc. Wholesale/Warehouse/Distribution ICIP STATEN IS 40.633153 -74.150999 47512.89 141 Lake Avenue
2 14-10 123rd Street LLC Commercial ICIP QUEENS 40.785144 -73.844833 21322.89 14-10 123rd Street
3 183 Lorriane Street LLC Wholesale/Warehouse/Distribution ICIP BROOKLYN 40.673106 -74.002300 105016.49 183 Lorraine Street
4 21st Century Optics, Inc. Manufacturing Tenant QUEENS 40.742386 -73.932148 215757.20 47-00 33rd Street
In [35]:
building_table.iloc[0]
Out[35]:
Company Name    139 ACA Realty, Inc.
Industry                  Commercial
Program                         ICIP
Borough                    QUEENS   
Latitude                     40.7457
Longitude                   -73.9296
Savings                       123975
Address            43-23 35th Street
Name: 0, dtype: object
In [4]:
# Now load the tree dataset
# The tree dataset is much larger so we will need to use a formula

# Load function
def load(
    endpoint_url,
    selected_columns=None,
    buffer_size=1000,
    search_term_by_column=None,
    **kw,
):
    buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    for column, search_term in (search_term_by_column or {}).items():
        buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
    print(buffer_url)
    tables = []
    
    if endpoint_url.endswith('.json'):
        f = pd.read_json
    else:
        f = pd.read_csv

    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, sort=False)
In [5]:
%%time
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'status', 'Latitude', 'Longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
https://data.cityofnewyork.us/resource/nwxe-4ae8.csv?$limit=100000&$select=tree_id,status,Latitude,Longitude
100000
200000
300000
400000
500000
600000
683788
CPU times: user 1.02 s, sys: 265 ms, total: 1.28 s
Wall time: 8.08 s
In [6]:
tree_table.head()
Out[6]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Latitude Longitude status tree_id
0 40.723092 -73.844215 Alive 180683
1 40.794111 -73.818679 Alive 200540
2 40.717581 -73.936608 Alive 204026
3 40.713537 -73.934456 Alive 204337
4 40.666778 -73.975979 Alive 189565
In [7]:
tree_table.dropna(subset=['Longitude', 'Latitude'], inplace=True)
tree_table[tree_table['status'] == 'Alive'].iloc[:3,] #filter out stump and dead trees
tree_table.drop(['status'], axis = 1).head()
Out[7]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Latitude Longitude tree_id
0 40.723092 -73.844215 180683
1 40.794111 -73.818679 200540
2 40.717581 -73.936608 204026
3 40.713537 -73.934456 204337
4 40.666778 -73.975979 189565
In [10]:
# Make kdtree
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES

tree_xys = tree_table[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
In [11]:
radius_in_miles = 0.5

#def get_buildings(latlon):
def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    return len(indices)
In [12]:
building_table['Total Tree Count within 0.5 Mile'] = building_table.apply(get_tree_count, axis=1)
building_table[:3]
Out[12]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Industry Program Borough Latitude Longitude Savings Address Total Tree Count within 0.5 Mile
0 139 ACA Realty, Inc. Commercial ICIP QUEENS 40.745706 -73.929565 123975.24 43-23 35th Street 683788
1 141 Lake Avenue Realty c/o JR Produce, Inc. Wholesale/Warehouse/Distribution ICIP STATEN IS 40.633153 -74.150999 47512.89 141 Lake Avenue 683788
2 14-10 123rd Street LLC Commercial ICIP QUEENS 40.785144 -73.844833 21322.89 14-10 123rd Street 683788
In [17]:
# Save your output files in target_folder
target_folder = '/tmp'
target_path = target_folder + '/Table with periodic savings and tree count.csv'
building_table.to_csv(target_path, index=False)

# Render the file as a table
print('final_table_path = %s' % target_path)
final_table_path = /tmp/Table with periodic savings and tree count.csv
In [18]:
building_table.to_csv('Table with periodic savings and tree count.csv', sep=',')
In [19]:
table_tree_count = pd.read_csv('Table with periodic savings and tree count.csv',na_values='n/a')
table_tree_count[:3]
Out[19]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Unnamed: 0 Company Name Industry Program Borough Latitude Longitude Savings Address Total Tree Count within 0.5 Mile
0 0 139 ACA Realty, Inc. Commercial ICIP QUEENS 40.745706 -73.929565 123975.24 43-23 35th Street 683788
1 1 141 Lake Avenue Realty c/o JR Produce, Inc. Wholesale/Warehouse/Distribution ICIP STATEN IS 40.633153 -74.150999 47512.89 141 Lake Avenue 683788
2 2 14-10 123rd Street LLC Commercial ICIP QUEENS 40.785144 -73.844833 21322.89 14-10 123rd Street 683788
In [20]:
sav_xys = table_tree_count[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
In [21]:
def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_sav_table = table_tree_count.loc[table_tree_count.index[indices]]
    return selected_sav_table['Periodic Savings'].mean()
In [22]:
table_tree_count['Periodic Savings within 0.5 Mile'] = table_tree_count.apply(get_sav_average, axis=1)
table_tree_count[:3]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-22-5cdfcf0c3501> in <module>()
----> 1 table_tree_count['Periodic Savings within 0.5 Mile'] = table_tree_count.apply(get_sav_average, axis=1)
      2 table_tree_count[:3]

/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
   6012                          args=args,
   6013                          kwds=kwds)
-> 6014         return op.get_result()
   6015 
   6016     def applymap(self, func):

/anaconda3/lib/python3.7/site-packages/pandas/core/apply.py in get_result(self)
    140             return self.apply_raw()
    141 
--> 142         return self.apply_standard()
    143 
    144     def apply_empty_result(self):

/anaconda3/lib/python3.7/site-packages/pandas/core/apply.py in apply_standard(self)
    246 
    247         # compute the result using the series generator
--> 248         self.apply_series_generator()
    249 
    250         # wrap results

/anaconda3/lib/python3.7/site-packages/pandas/core/apply.py in apply_series_generator(self)
    275             try:
    276                 for i, v in enumerate(series_gen):
--> 277                     results[i] = self.f(v)
    278                     keys.append(v.name)
    279             except Exception as e:

<ipython-input-21-9a1abb1bf400> in get_sav_average(r)
      7     indices = indices[~np.isnan(indices)]
      8     indices = [int(x) for x in indices]
----> 9     selected_sav_table = table_tree_count.loc[table_tree_count.index[indices]]
     10     return selected_sav_table['Periodic Savings'].mean()

/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/range.py in __getitem__(self, key)
    554 
    555         # fall back to Int64Index
--> 556         return super_getitem(key)
    557 
    558     def __floordiv__(self, other):

/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in __getitem__(self, key)
   2093 
   2094         key = com._values_from_object(key)
-> 2095         result = getitem(key)
   2096         if not is_scalar(result):
   2097             return promote(result)

IndexError: ('index 516 is out of bounds for axis 1 with size 516', 'occurred at index 0')
In [ ]:
# Save your output files in target_folder
target_path = target_folder + '/Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
table_tree_count.to_csv(target_path, index=False)

# Render the file as a table
print('final_table_path = %s' % target_path)
In [ ]:
table_tree_count.to_csv('Table with average monthly savings(within 0.5 Mile) and tree count \
                        (within 0.5 Mile).csv', sep=',')
In [ ]:
# Algorithm

# find dtypes
In [ ]:
# Begin creating our model

# Rearrange col
df = df[['Industry', 'Program', 'Postcode', 'Borough', 'Savings']]
#df.columns

# Split our validation dataset
array = df.values
X = array[:, 0:3]
Y = array[:, 3]
validation_size = .20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size = validation_size, random_state = seed)
In [ ]:
# Spot check algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
In [ ]:
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
In [ ]:
# Save your output files in target_folder
target_path = target_folder + '/Table with periodic savings and tree count.csv'
building_table.to_csv(target_path, index=False)

# Render the file as a table
print('final_table_path = %s' % target_path)
In [ ]:
# Export model
In [ ]:
# How is the tool made in cross compute?
In [ ]:
# Paloma's Geopy and GoogleV3 code
from geopy import GoogleV3
geocode = GoogleV3('AIzaSyDNqc0tWzXHx_wIp1w75-XTcCk4BSphB5w').geocode 
In [ ]:
def get_location(user_input):
    info = {}
    location = geocode(user_input)
    info['Longitude'] = location.longitude
    info['Latitude'] = location.latitude
    return info
In [ ]:
print(get_location("28-10 jackson ave"))
In [38]:
# import sys
# 'geopandas' in sys.modules
Out[38]:
False
In [36]:
# Convert WKT coordinates
# from geopandas import GeoDataFrame
# from shapely.geometry import Point

# geometry = [Point(xy) for xy in zip(building_table.Longitude, building_table.Latitude)]
# building_table = building_table.drop(['Longitude', 'Latitude'], axis=1)
# crs = {'init': 'epsg:4326'}
# gdf = GeoDataFrame(building_table, crs=crs, geometry=geometry)

# building_table.head()

!!! Important Roy => cross compute doesnt have Geopandas. !!!
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-36-1ace19eefed6> in <module>
      1 # Convert WKT coordinates
----> 2 from geopandas import GeoDataFrame
      3 from shapely.geometry import Point
      4 
      5 geometry = [Point(xy) for xy in zip(building_table.Longitude, building_table.Latitude)]

ModuleNotFoundError: No module named 'geopandas'
In [11]:
# from datetime import date
# df['Effective Date'] = pd.to_datetime(df['Effective Date'].str.strip(), format='%m/%d/%Y')
# df['Month Count'] = ((pd.to_datetime('2017-12-31') - df['Effective Date']) / np.timedelta64(1, 'M'))
# df['Month Count'] = df['Month Count'].astype(int)  
# df['Periodic Savings'] = (df['Savings'] / df['Month Count']).apply(lambda x: round(x, 2))

# #monthly savings over period starting from effective date (round to month) to 2017-12-31

# df = df.drop(['Savings',
#             #'Effective Date','Month Count'  
#             # user might care about 
#              ], axis=1) 
In [18]:
df2 = tree_table[['tree_id','status',
                  #'address','zipcode','boroname', 
                  'Latitude','longitude']]
df2 = df2.rename(columns={'longitude': 'Longitude'})
df2.dropna(subset=['Longitude', 'Latitude'], inplace=True)
df2[df2['status'] == 'Alive'].iloc[:3,]          ## filter out "stump" and "dead"
df2.shape
Out[18]:
(157921, 4)
In [19]:
df2[:3]
Out[19]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
tree_id status Latitude Longitude
0 180683 Alive 40.723092 -73.844215
1 200540 Alive 40.794111 -73.818679
2 204026 Alive 40.717581 -73.936608
In [20]:
# Make kdtree
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES

tree_xys = df2[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/__init__.py:65: VisibleDeprecationWarning: PySAL's API will be changed on 2018-12-31. The last release made with this API is version 1.14.4. A preview of the next API version is provided in the `pysal` 2.0 prelease candidate. The API changes and a guide on how to change imports is provided at https://pysal.org/about
  ), VisibleDeprecationWarning)
In [21]:
radius_in_miles = 0.5

#def get_buildings(latlon):
def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    return len(indices)

df['Total Tree Count within 0.5 Mile'] = df.apply(get_tree_count, axis=1)
df[:3]
Out[21]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name BIN Industry Business Program Effective Date Address Postcode Borough Latitude Longitude Month Count Periodic Savings Total Tree Count within 0.5 Mile
0 139 ACA Realty, Inc. 4003160 Commercial Limousine Service ICIP 2008-04-07 43-23 35th Street 11101 QUEENS 40.745706 -73.929565 116 1068.75 683
1 141 Lake Avenue Realty c/o JR Produce, Inc. 5146740 Wholesale/Warehouse/Distribution Dist. of prepacked salads ICIP 2009-12-08 141 Lake Avenue 10303 STATEN IS 40.633153 -74.150999 96 494.93 21
2 14-10 123rd Street LLC 4098344 Commercial Electrical Parts Mfg. ICIP 2011-03-04 14-10 123rd Street 11356 QUEENS 40.785144 -73.844833 81 263.25 447
In [22]:
df.shape
Out[22]:
(502, 14)
In [23]:
# Save your output files in target_folder
target_path = target_folder + '/Table with periodic savings and tree count.csv'
df.to_csv(target_path, index=False)

# Render the file as a table
print('final_table_path = %s' % target_path)
final_table_path = /tmp/Table with periodic savings and tree count.csv
In [24]:
df.to_csv('Table with periodic savings and tree count.csv', sep=',')
In [25]:
table_tree_count = pd.read_csv('Table with periodic savings and tree count.csv',na_values='n/a')
table_tree_count[:3]
Out[25]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Unnamed: 0 Company Name BIN Industry Business Program Effective Date Address Postcode Borough Latitude Longitude Month Count Periodic Savings Total Tree Count within 0.5 Mile
0 0 139 ACA Realty, Inc. 4003160 Commercial Limousine Service ICIP 2008-04-07 43-23 35th Street 11101 QUEENS 40.745706 -73.929565 116 1068.75 683
1 1 141 Lake Avenue Realty c/o JR Produce, Inc. 5146740 Wholesale/Warehouse/Distribution Dist. of prepacked salads ICIP 2009-12-08 141 Lake Avenue 10303 STATEN IS 40.633153 -74.150999 96 494.93 21
2 2 14-10 123rd Street LLC 4098344 Commercial Electrical Parts Mfg. ICIP 2011-03-04 14-10 123rd Street 11356 QUEENS 40.785144 -73.844833 81 263.25 447
In [26]:
# table_tree_count.dtypes
In [27]:
sav_xys = table_tree_count[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
In [28]:
def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_sav_table = table_tree_count.loc[table_tree_count.index[indices]]
    return
    return selected_sav_table['Periodic Savings'].mean()
In [29]:
table_tree_count['Periodic Savings within 0.5 Mile'] = table_tree_count.apply(get_sav_average, axis=1)
table_tree_count[:3]
Out[29]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Unnamed: 0 Company Name BIN Industry Business Program Effective Date Address Postcode Borough Latitude Longitude Month Count Periodic Savings Total Tree Count within 0.5 Mile Periodic Savings within 0.5 Mile
0 0 139 ACA Realty, Inc. 4003160 Commercial Limousine Service ICIP 2008-04-07 43-23 35th Street 11101 QUEENS 40.745706 -73.929565 116 1068.75 683 1423.931818
1 1 141 Lake Avenue Realty c/o JR Produce, Inc. 5146740 Wholesale/Warehouse/Distribution Dist. of prepacked salads ICIP 2009-12-08 141 Lake Avenue 10303 STATEN IS 40.633153 -74.150999 96 494.93 21 336.525000
2 2 14-10 123rd Street LLC 4098344 Commercial Electrical Parts Mfg. ICIP 2011-03-04 14-10 123rd Street 11356 QUEENS 40.785144 -73.844833 81 263.25 447 1079.380000
In [30]:
# Save your output files in target_folder
target_path = target_folder + '/Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
table_tree_count.to_csv(target_path, index=False)

# Render the file as a table
print('final_table_path = %s' % target_path)
final_table_path = /tmp/Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv
In [31]:
table_tree_count.to_csv('Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv', sep=',')
In [ ]: