# CrossCompute
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import date
from geopy import GoogleV3
import pysal.lib
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Default figure size for every plot produced below.
matplotlib.rcParams['figure.figsize'] = (10, 10)
import sys
# Report the interpreter version and remember the working directory.
print('Python: %s' % sys.version)
cwd = os.getcwd()
cwd
"""
Pseudo code
What I want:
Input address
input zip code
spits out latitude and longitude
goes into my machine learning algorithm
see if I can predict savings given postcode, industry, program, borough + we can add tree data later to complicate
"""
# Load the energy cost savings table; the literal text 'n/a' is read as missing.
table = pd.read_csv('Downloads/Energy_Cost_Savings_for_Businesses.csv', na_values='n/a')

# Columns kept for the analysis, under their original CSV headers.
source_columns = [
    'Company Name', 'BIN', 'Industry', 'Industry Description', 'Company Type',
    'Address', 'Effective Date', 'Savings from begining receiving benefits',
    'Postcode', 'Borough', 'Latitude', 'Longitude',
]
# Shorter working names. 'Invenstment' is not among the selected columns, so
# that entry renames nothing here.
short_names = {
    'Invenstment': 'Investment',
    'Industry Description': 'Business',
    'Company Type': 'Program',
    'Address': 'address',
    'Savings from begining receiving benefits': 'Savings',
}
df = table[source_columns].rename(columns=short_names)

# Discard rows that lack a BIN or either coordinate.
df.dropna(axis=0, subset=['BIN', 'Longitude', 'Latitude'], inplace=True)
df.head()
# Keep only the generic descriptive columns plus the savings amount.
df = df[['Industry', 'Program', 'Savings', 'Postcode', 'Borough']]
df.head()

# Print the number of rows per value for each grouping column, separated by a
# blank line, followed by the column dtypes.
for position, column in enumerate(('Industry', 'Program', 'Postcode', 'Borough')):
    if position:
        print('\n')
    print(df.groupby(column).size())
print(df.dtypes)
# Box plots of the numeric columns, one subplot each.
df.plot(kind='box', subplots=True, layout=(1, 2), sharex=False, sharey=False)
plt.show()  # who's that outlier in our savings? ignore postcode
df.hist()
plt.show()  # This might be tough, savings is not normally distributed

# Show the row(s) holding the largest savings value. Series.max() skips
# missing values, which the builtin max() does not do reliably.
max_sav = df['Savings'].max()
print(df.loc[df['Savings'] == max_sav])

# Remove the outlier by looking up its index label rather than hard-coding it,
# so the step keeps working if the input rows change.
outlier_label = df['Savings'].idxmax()
df = df.drop(index=outlier_label)
df.plot(kind='box', subplots=True, layout=(1, 2), sharex=False, sharey=False)
plt.show()  # much better, again ignore zip code
# Begin creating our model.
# Fix the column order so the positional slices below are well defined.
df = df[['Industry', 'Program', 'Postcode', 'Borough', 'Savings']]

# Hold out 20% of the rows as a validation set.
array = df.values
# NOTE(review): the features are the first three columns (Industry, Program,
# Postcode) and the target is the fourth (Borough); the 'Savings' column is not
# used. The pseudo code at the top of the file talks about predicting savings,
# so confirm which target is intended.
X, Y = array[:, :3], array[:, 3]
validation_size = .20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
print(array)
# Now we have training and validation data.
# Accuracy is estimated with 10-fold cross validation: the training data is
# split into 10 parts, and each model is trained on 9 of them and scored on
# the remaining one, rotating through all 10.
seed = 7
scoring = 'accuracy'

# Candidate classifiers to spot check, as (short name, estimator) pairs.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise ValueError if it is set on an unshuffled KFold. With a fixed
# integer seed the same folds are produced for every model.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# Evaluate each model in turn and print "name: mean (std)" of its fold scores.
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare algorithms: one box per model showing the spread of its fold scores.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Fit k-nearest-neighbours on the training split and score it on the held-out
# validation split: accuracy, confusion matrix, then the per-class report.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(Y_validation, predictions))
from geopy import GoogleV3
# The Google API key is a credential and must not live in source control.
# It is read from the GOOGLE_API_KEY environment variable instead; a missing
# variable fails immediately with KeyError rather than at the first lookup.
geocode = GoogleV3(os.environ['GOOGLE_API_KEY']).geocode
def get_location(user_input):
    """Geocode an address and return its coordinates.

    Parameters
    ----------
    user_input
        Address text, passed unchanged to the module-level ``geocode`` callable.

    Returns
    -------
    dict
        ``{'Longitude': ..., 'Latitude': ...}`` read from the geocoder result.

    Raises
    ------
    ValueError
        If the geocoder finds no match for ``user_input``.
    """
    location = geocode(user_input)
    if location is None:
        # geopy geocoders return None when nothing matches; report that
        # explicitly instead of failing on attribute access below.
        raise ValueError('No geocoding result for address: %r' % (user_input,))
    return {'Longitude': location.longitude, 'Latitude': location.latitude}
# Try the geocoder on two sample street addresses and print the coordinates.
for sample_address in ("28-10 jackson ave", "55 lexington ave"):
    print(get_location(sample_address))
# Reload the savings table, this time keeping the coordinates for the
# geographic join further down.
# NOTE(review): input_table_path is not assigned anywhere in the visible part
# of this file; presumably it is supplied by the CrossCompute tool setup. Confirm.
table_a = pd.read_csv(input_table_path, na_values='n/a')

source_columns = [
    'Company Name', 'BIN', 'Industry', 'Industry Description', 'Company Type',
    'Address', 'Effective Date', 'Savings from begining receiving benefits',
    'Postcode', 'Borough', 'Latitude', 'Longitude',
]
# 'Invenstment' is not among the selected columns, so that entry renames nothing.
short_names = {
    'Invenstment': 'Investment',
    'Industry Description': 'Business',
    'Company Type': 'Program',
    'Address': 'address',
    'Savings from begining receiving benefits': 'Savings',
}
df = table_a[source_columns].rename(columns=short_names)

# Discard rows that lack a BIN or either coordinate.
df.dropna(axis=0, subset=['BIN', 'Longitude', 'Latitude'], inplace=True)
df.head()

# Round both coordinates to 6 decimal places.
for axis_name in ('Longitude', 'Latitude'):
    df[axis_name] = np.round(np.array(df[axis_name]), 6)
#df['Coordinate']=df['Longitude'].astype(str)+','+df['Latitude'].astype(str)
from datetime import date
# Parse the effective date (month/day/year text, surrounding spaces removed).
df['Effective Date'] = pd.to_datetime(df['Effective Date'].str.strip(), format='%m/%d/%Y')

# Whole months elapsed between the effective date and 2017-12-31.
# np.timedelta64(1, 'M') is numpy's average month (30.436875 days, which is
# 2,629,746 seconds), but pandas 2.x refuses to divide by month/year units.
# An explicit Timedelta of the same length gives the same quotient on every
# pandas version.
average_month = pd.Timedelta(seconds=2629746)
elapsed = pd.to_datetime('2017-12-31') - df['Effective Date']
df['Month Count'] = (elapsed / average_month).astype(int)

# Average savings per elapsed month, rounded to 2 decimals.
# NOTE(review): a row with a Month Count of 0 produces a non-finite value here.
df['Savings Over Months'] = (df['Savings'] / df['Month Count']).round(2)
df.iloc[0]
df.head()
df.shape
# Load the street tree census and keep the identifying and location columns.
input_table_path2 = '2015StreetTreesCensus_TREES.csv'
table_b = pd.read_csv(input_table_path2, na_values='n/a')
tree_columns = ['tree_id', 'status', 'address', 'zipcode', 'boroname', 'Latitude', 'longitude']
# Capitalise 'longitude' so both coordinate columns follow the same naming.
df2 = table_b[tree_columns].rename(columns={'longitude': 'Longitude'})
df2.dropna(subset=['Longitude', 'Latitude'], inplace=True)

# NOTE(review): this only previews the first three 'Alive' rows; the result is
# discarded, so df2 still contains rows with any other status. If stumps and
# dead trees were meant to be filtered out, the selection must be assigned back.
df2[df2['status'] == 'Alive'].head(3)
df2.head()

# Round both coordinates to 6 decimal places.
for axis_name in ('Longitude', 'Latitude'):
    df2[axis_name] = np.round(np.array(df2[axis_name]), 6)
#df2['Coordinate']=df2['Longitude'].astype(str)+','+df2['Latitude'].astype(str)
df2.head()
# Selection parameters. None of these names is read again in the visible part
# of this file.
# NOTE(review): the *_select names with newline-separated text look like tool
# input declarations (presumably CrossCompute select widgets); confirm before
# changing the literal layout.

# Minimum / maximum investment choices; each currently holds only a newline.
min_investment_select = """
"""
max_investment_select = """
"""
# BIN value; empty by default.
BIN = ''
# Industry choices, one per line. 'Manufacturing' is listed twice;
# presumably the first line is the selected value. TODO confirm.
industry_select = """
Manufacturing
Manufacturing
Commercial
Wholesale/Warehouse/Distribution
"""
# Build a KD-tree over the tree coordinates for radius queries.
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES

# One (Longitude, Latitude) pair per row of df2.
tree_xys = df2[['Longitude', 'Latitude']].values
tree_count = len(tree_xys)

# Arc metric with the Earth's radius in miles, so query distances are in miles.
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)

# Search radius used by get_tree_count.
radius_in_miles = 0.5
def get_tree_count(r):
    """Count the trees within ``radius_in_miles`` of one row's coordinates.

    Parameters
    ----------
    r
        Row-like object providing ``r['Longitude']`` and ``r['Latitude']``.

    Returns
    -------
    int
        Number of points in the module-level ``bin_tree`` found within the
        module-level ``radius_in_miles`` of the row's location.
    """
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    # The query always returns k slots. Slots with no neighbour inside the
    # radius carry an infinite distance and an integer placeholder index,
    # never NaN, so filtering indices with isnan kept every slot and the
    # count was always k. Counting finite distances gives the real number
    # of neighbours.
    return int(np.count_nonzero(np.isfinite(distances)))
# Attach the number of nearby trees to every row.
df['nearby_tree_count'] = df.apply(get_tree_count, axis=1)
df.shape

# Sum of nearby-tree counts per (Industry, Program, BIN) combination.
df_gb = (
    df.groupby(['Industry', 'Program', 'BIN'])['nearby_tree_count']
    .sum()
    .reset_index()
)

# Inspect the range of the monthly savings (values are not stored).
min(df['Savings Over Months'])
max(df['Savings Over Months'])

# Lowercase copies of the coordinate columns.
for axis_name in ('Longitude', 'Latitude'):
    df[axis_name.lower()] = df[axis_name]
# Left-closed bin edges and their "low-high" labels, derived from the edges so
# the two lists cannot drift apart.
bins = [0, 500, 5000, 10000, 50000, 100000, 200000, 500000]
labels = ['%d-%d' % (low, high) for low, high in zip(bins, bins[1:])]
# pandas is imported as `pd` in this file; the bare name `pandas` is not bound.
# NOTE(review): no visible line creates a 'cost' column on df, and `aa` is not
# used afterwards. Confirm which column was meant to be binned.
aa = pd.cut(df.cost, bins, right=False, labels=labels)

# Write the enriched table out and print its path.
# `join` was never imported, so os.path.join is used.
# NOTE(review): target_folder is not assigned anywhere in the visible part of
# this file; presumably it is supplied by the CrossCompute tool setup. Confirm.
target_path = os.path.join(target_folder, 'locations.csv')
df.to_csv(target_path, index=False)
print('x_geotable_path = %s' % target_path)