import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
matplotlib.rcParams['figure.figsize'] = (10,10)
# Prepare building data
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv?accessType=DOWNLOAD'
building_table = pd.read_csv(url1, na_values = 'n/a')
building_table = building_table[['Company Name', 'Industry', 'Company Type', 'Address',
                                 'Borough', 'Latitude', 'Longitude',
                                 'Savings from begining receiving benefits']]
building_table = building_table.rename(columns={
    'Company Type': 'Program',
    # the source dataset misspells this column name, so match it verbatim
    'Savings from begining receiving benefits': 'Savings'})
building_table.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
building_table.shape
building_table = building_table[['Company Name','Industry','Program',
'Borough','Latitude','Longitude',
'Savings', 'Address']]
building_table.head()
building_table.iloc[0]
# Now load the tree dataset
# The tree dataset is much larger, so we page through the API with a helper function
def load(
        endpoint_url,
        selected_columns=None,
        buffer_size=1000,
        search_term_by_column=None,
        **kw,
):
    buffer_url = f'{endpoint_url}?$limit={buffer_size}'
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    # Socrata accepts only one $where parameter, so combine the filters with AND
    where_clauses = [
        f'{column}+like+"%25{search_term}%25"'
        for column, search_term in (search_term_by_column or {}).items()]
    if where_clauses:
        buffer_url += '&$where=' + '+AND+'.join(where_clauses)
    print(buffer_url)
    tables = []
    f = pd.read_json if endpoint_url.endswith('.json') else pd.read_csv
    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, sort=False)
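# A minimal sketch of the URLs the loader generates, using a hypothetical
# endpoint (demo_url and demo_buffer_size are illustrative names, not part of
# the pipeline above): each page keeps the same $limit and advances $offset
# until an empty page comes back, then the pages are concatenated.
demo_url = 'https://example.com/resource.csv'
demo_buffer_size = 1000
first_page = f'{demo_url}?$limit={demo_buffer_size}'
second_page = first_page + f'&$offset={demo_buffer_size}'
print(first_page)   # https://example.com/resource.csv?$limit=1000
print(second_page)  # https://example.com/resource.csv?$limit=1000&$offset=1000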
%%time
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'status', 'Latitude', 'Longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
tree_table.head()
tree_table.dropna(subset=['Longitude', 'Latitude'], inplace=True)
tree_table = tree_table[tree_table['status'] == 'Alive']  # keep only living trees (drop stumps and dead trees)
tree_table = tree_table.drop(['status'], axis=1)
tree_table.head()
# Make kdtree
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES
tree_xys = tree_table[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
radius_in_miles = 0.5
def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]  # drop placeholder entries for neighbors beyond the radius
    return len(indices)
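# A quick usage sketch (hedged): query the five nearest trees to a single
# hypothetical point (example_xy is illustrative, not from the dataset).
# Because the KDTree was built with radius=RADIUS_EARTH_MILES, the returned
# arc distances are in miles.
example_xy = (-73.94, 40.75)  # (longitude, latitude) near Long Island City
example_distances, example_indices = bin_tree.query(example_xy, k=5)
print(example_distances)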
building_table['Total Tree Count within 0.5 Mile'] = building_table.apply(get_tree_count, axis=1)
building_table[:3]
# Save your output files in target_folder
target_folder = '/tmp'
target_path = target_folder + '/Table with periodic savings and tree count.csv'
building_table.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
building_table.to_csv('Table with periodic savings and tree count.csv', index=False)
table_tree_count = pd.read_csv('Table with periodic savings and tree count.csv',na_values='n/a')
table_tree_count[:3]
sav_xys = table_tree_count[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_sav_table = table_tree_count.loc[table_tree_count.index[indices]]
    # 'Periodic Savings' comes from the 'Effective Date' preprocessing that is
    # commented out near the end of this notebook; run that step first
    return selected_sav_table['Periodic Savings'].mean()
table_tree_count['Periodic Savings within 0.5 Mile'] = table_tree_count.apply(get_sav_average, axis=1)
table_tree_count[:3]
# Save your output files in target_folder
target_path = target_folder + '/Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
table_tree_count.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
table_tree_count.to_csv('Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv', index=False)
# Algorithm
# Begin creating our model (check df.dtypes first to confirm the column types)
# Rearrange columns; df is assumed to be the building table after the
# 'Effective Date' preprocessing commented out near the end of this notebook
df = df[['Industry', 'Program', 'Postcode', 'Borough', 'Savings']]
#df.columns
# Split out a validation dataset
# Note: 'Savings' is continuous; for these classifiers it would need to be
# discretized (binned) first, or the models swapped for regressors
array = df.values
X = array[:, 0:4]  # Industry, Program, Postcode, Borough
Y = array[:, 4]    # Savings
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
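# The four feature columns are categorical strings, which scikit-learn
# estimators cannot consume directly. A minimal encoding sketch (an assumption
# about the intended preprocessing, not part of the original pipeline); fitting
# on all of X keeps validation-only categories from raising, at the cost of
# mild leakage:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder().fit(X)
X_train = encoder.transform(X_train)
X_validation = encoder.transform(X_validation)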
# Spot check algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# Evaluate each model in turn with 10-fold cross-validation
scoring = 'accuracy'
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
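# The metrics imported at the top (accuracy_score, confusion_matrix,
# classification_report) are never used above; a minimal sketch that closes
# the loop on the held-out split, assuming KNN is the model carried forward
# (the choice of KNN is an assumption, not fixed by the comparison above):
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))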
# Export model
# How is the tool made in CrossCompute?
# Paloma's Geopy and GoogleV3 code
from geopy import GoogleV3
geocode = GoogleV3('YOUR_GOOGLE_API_KEY').geocode  # supply your own key; never commit a live API key
def get_location(user_input):
    info = {}
    location = geocode(user_input)
    info['Longitude'] = location.longitude
    info['Latitude'] = location.latitude
    return info
print(get_location("28-10 jackson ave"))
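# geopy returns None when an address cannot be resolved, so a defensive
# variant (a sketch using the same geocoder as above; get_location_safe is a
# hypothetical helper, not part of the original code) avoids an
# AttributeError on location.longitude:
def get_location_safe(user_input):
    location = geocode(user_input)
    if location is None:
        return {}
    return {'Longitude': location.longitude, 'Latitude': location.latitude}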
# import sys
# 'geopandas' in sys.modules
# Convert WKT coordinates
# from geopandas import GeoDataFrame
# from shapely.geometry import Point
# geometry = [Point(xy) for xy in zip(building_table.Longitude, building_table.Latitude)]
# building_table = building_table.drop(['Longitude', 'Latitude'], axis=1)
# crs = {'init': 'epsg:4326'}
# gdf = GeoDataFrame(building_table, crs=crs, geometry=geometry)
# building_table.head()
# NOTE for Roy: CrossCompute doesn't have GeoPandas installed.
# from datetime import date
# df['Effective Date'] = pd.to_datetime(df['Effective Date'].str.strip(), format='%m/%d/%Y')
# df['Month Count'] = ((pd.to_datetime('2017-12-31') - df['Effective Date']) / np.timedelta64(1, 'M'))
# df['Month Count'] = df['Month Count'].astype(int)
# df['Periodic Savings'] = (df['Savings'] / df['Month Count']).apply(lambda x: round(x, 2))
# #monthly savings over period starting from effective date (round to month) to 2017-12-31
# df = df.drop(['Savings',
# #'Effective Date','Month Count'
# # user might care about
# ], axis=1)
df2 = tree_table[['tree_id', 'status',
                  #'address','zipcode','boroname',
                  'Latitude', 'longitude']]
df2 = df2.rename(columns={'longitude': 'Longitude'})
df2.dropna(subset=['Longitude', 'Latitude'], inplace=True)
df2 = df2[df2['status'] == 'Alive']  # keep only living trees (drop "stump" and "dead")
df2.shape
df2[:3]
# Make kdtree
from pysal.lib.cg.kdtree import KDTree
from pysal.lib.cg import RADIUS_EARTH_MILES
tree_xys = df2[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
radius_in_miles = 0.5
def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]  # drop placeholder entries for neighbors beyond the radius
    return len(indices)
df['Total Tree Count within 0.5 Mile'] = df.apply(get_tree_count, axis=1)
df[:3]
df.shape
# Save your output files in target_folder
target_path = target_folder + '/Table with periodic savings and tree count.csv'
df.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
df.to_csv('Table with periodic savings and tree count.csv', index=False)
table_tree_count = pd.read_csv('Table with periodic savings and tree count.csv',na_values='n/a')
table_tree_count[:3]
# table_tree_count.dtypes
sav_xys = table_tree_count[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_sav_table = table_tree_count.loc[table_tree_count.index[indices]]
    return selected_sav_table['Periodic Savings'].mean()
table_tree_count['Periodic Savings within 0.5 Mile'] = table_tree_count.apply(get_sav_average, axis=1)
table_tree_count[:3]
# Save your output files in target_folder
target_path = target_folder + '/Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv'
table_tree_count.to_csv(target_path, index=False)
# Render the file as a table
print('final_table_path = %s' % target_path)
table_tree_count.to_csv('Table with average monthly savings(within 0.5 Mile) and tree count(within 0.5 Mile).csv', index=False)