ECSP

In [2]:
# CrossCompute
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
import os
from datetime import date

from geopy import GoogleV3
# pysal >= 2.0 moved its computational geometry into the libpysal package;
# the old pysal.cg import paths raise ModuleNotFoundError
from libpysal.cg import KDTree, RADIUS_EARTH_MILES

from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

matplotlib.rcParams['figure.figsize'] = (10,10)
In [ ]:
import sys
print('Python: {}'.format(sys.version))
In [ ]:
cwd = os.getcwd()
cwd
In [ ]:
"""
Pseudo code

What I want:

Input address

input zip code

spits out latitude and logitude

goes into my machine learning algorithm

see if I can predict savings given postcode, industry, program, borough + we can add tree data later to complicate

"""
In [ ]:
table = pd.read_csv('Downloads/Energy_Cost_Savings_for_Businesses.csv', na_values='n/a') 
In [ ]:
df = table[['Company Name','BIN','Industry','Industry Description','Company Type', 'Address',
            'Effective Date','Savings from begining receiving benefits',
            'Postcode','Borough','Latitude','Longitude']]

# Note: 'Savings from begining receiving benefits' is misspelled in the source
# CSV header, so we select it as-is and rename it here
df = df.rename(columns={'Industry Description': 'Business',
                        'Company Type': 'Program',
                        'Address': 'address',
                        'Savings from begining receiving benefits': 'Savings'})

df.dropna(axis=0, subset=['BIN', 'Longitude', 'Latitude'], inplace=True)
df.head()
In [ ]:
# Select certain columns to generalize the data
df = df[['Industry', 'Program', 'Savings', 'Postcode', 'Borough']]
df.head()
In [ ]:
print(df.groupby('Industry').size())
print('\n')
print(df.groupby('Program').size())
print('\n')
print(df.groupby('Postcode').size())
print('\n')
print(df.groupby('Borough').size())
In [ ]:
print(df.dtypes)
In [ ]:
df.plot(kind = 'box', subplots = True, layout = (1,2), sharex = False, sharey = False)
plt.show() #who's that outlier in our savings? ignore postcode
In [ ]:
df.hist()
plt.show() #This might be tough, savings is not normally distributed
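In [ ]:
# Optional sketch (not part of the original run): Savings is heavily
# right-skewed, so a log scale makes the histogram easier to read.
np.log1p(df['Savings']).hist()
plt.show()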
In [ ]:
# Find the row with the maximum savings (the outlier in the box plot above)
max_sav = max(df['Savings'])
print(df.loc[df['Savings'] == max_sav])
In [ ]:
# Remove the annoying outlier (use idxmax instead of hard-coding row label 544)
df = df.drop(df['Savings'].idxmax(), axis=0)
In [ ]:
df.plot(kind = 'box', subplots = True, layout = (1,2), sharex = False, sharey = False)
plt.show() # much better, again ignore zip code
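In [ ]:
# A sketch of a step the model needs (an assumption, not in the original
# notebook): sklearn estimators require numeric inputs, but Industry, Program,
# and Borough are strings. Integer-code each categorical column in place so
# the split below yields numeric features. Note that the classifiers further
# down will treat each distinct Savings value as its own class label; binning
# Savings (as pd.cut does near the end) is one way to get coarser classes.
for column in ['Industry', 'Program', 'Borough']:
    df[column], _ = pd.factorize(df[column])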
In [3]:
# Begin creating our model

# Rearrange columns so the target (Savings) comes last
df = df[['Industry', 'Program', 'Postcode', 'Borough', 'Savings']]

# Split off a validation set: the first four columns are the features,
# the last column (Savings) is the target
array = df.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
In [ ]:
print(array)
In [ ]:
# Now we have training and validation data.
# We will use 10-fold cross-validation to estimate accuracy: split the
# training set into 10 parts, train on 9, and test on the remaining 1.
In [ ]:
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
In [ ]:
# Spot check algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
In [ ]:
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
In [ ]:
# Make predictions on validation dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
In [ ]:
from geopy import GoogleV3
# Read the Google Maps API key from the environment instead of hard-coding a
# secret into the notebook ('GOOGLE_KEY' is a placeholder variable name)
geocode = GoogleV3(api_key=os.environ['GOOGLE_KEY']).geocode

def get_location(user_input):
    info = {}
    location = geocode(user_input)
    info['Longitude'] = location.longitude
    info['Latitude'] = location.latitude
    return info

print(get_location("28-10 jackson ave"))

print(get_location("55 lexington ave"))

input_table_path = 'Downloads/Energy_Cost_Savings_for_Businesses.csv'  # same file as above
table_a = pd.read_csv(input_table_path, na_values='n/a')
df = table_a[['Company Name','BIN','Industry','Industry Description','Company Type', 'Address',
                     'Effective Date','Savings from begining receiving benefits',
                     'Postcode','Borough','Latitude','Longitude']]

df = df.rename(columns={'Industry Description': 'Business',
                        'Company Type': 'Program',
                        'Address': 'address',
                        'Savings from begining receiving benefits': 'Savings'})

df.dropna(axis=0, subset=['BIN','Longitude','Latitude'], inplace = True)
df.head()



df['Longitude'] = np.round(np.array(df['Longitude']),6)
df['Latitude'] = np.round(np.array(df['Latitude']),6)
#df['Coordinate']=df['Longitude'].astype(str)+','+df['Latitude'].astype(str)

# Months between each effective date and the end of 2017, then the average
# monthly savings over that span
df['Effective Date'] = pd.to_datetime(df['Effective Date'].str.strip(), format='%m/%d/%Y')
df['Month Count'] = ((pd.to_datetime('2017-12-31') - df['Effective Date']) / np.timedelta64(1, 'M'))
df['Month Count'] = df['Month Count'].astype(int)
df['Savings Over Months'] = (df['Savings'] / df['Month Count']).apply(lambda x: round(x, 2))
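# For example (illustration only): a benefit effective 2015-01-01 spans about
# 36 months to 2017-12-31, so $36,000 in cumulative savings works out to
# roughly $1,000 per month.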

df.iloc[0]
df.head()

df.shape

input_table_path2 = '2015StreetTreesCensus_TREES.csv'
table_b = pd.read_csv(input_table_path2, na_values='n/a') 
df2 = table_b[['tree_id','status','address','zipcode','boroname', 'Latitude','longitude']]
df2 = df2.rename(columns={'longitude': 'Longitude'})
df2.dropna(subset=['Longitude', 'Latitude'], inplace=True)
df2 = df2[df2['status'] == 'Alive']          ## keep living trees; filter out "stump" and "dead"
df2.head()

df2['Longitude'] = np.round(np.array(df2['Longitude']),6)
df2['Latitude'] = np.round(np.array(df2['Latitude']),6)
#df2['Coordinate']=df2['Longitude'].astype(str)+','+df2['Latitude'].astype(str)
df2.head()

min_investment_select = """

"""
max_investment_select = """

"""
BIN = ''
industry_select = """
    Manufacturing

    Manufacturing
    Commercial
    Wholesale/Warehouse/Distribution
"""

# Make a KD-tree over the tree coordinates (libpysal is the pysal >= 2.0 home
# of these utilities)
from libpysal.cg import KDTree, RADIUS_EARTH_MILES

tree_xys = df2[['Longitude', 'Latitude']].values
tree_xys

tree_count = len(tree_xys)
tree_count

bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
bin_tree
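# Quick sanity check (illustrative, not in the original): distances in miles
# and indices of the three trees nearest the first business location
sample_xy = df.iloc[0]['Longitude'], df.iloc[0]['Latitude']
print(bin_tree.query(sample_xy, k=3))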

radius_in_miles = 0.5

#def get_buildings(latlon):
    

def get_tree_count(r):
    # Count trees within radius_in_miles of this business. Neighbors beyond
    # the distance bound come back with index == tree_count (scipy's
    # convention for misses), so keep only the real matches.
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    return int((indices < tree_count).sum())

df['nearby_tree_count'] = df.apply(get_tree_count, axis=1)

df.shape

df_gb = df.groupby(['Industry','Program','BIN'])['nearby_tree_count'].sum().reset_index() 
df_gb
# Inspect the range of monthly savings
print(min(df['Savings Over Months']), max(df['Savings Over Months']))

# Duplicate the coordinates under lowercase names for the geotable output
df['longitude'] = df['Longitude']
df['latitude'] = df['Latitude']

bins = [0, 500, 5000, 10000, 50000, 100000, 200000, 500000]
labels = ['0-500', '500-5000', '5000-10000', '10000-50000', '50000-100000', '100000-200000', '200000-500000']
# The original called pandas.cut (pandas is imported as pd) on a nonexistent
# df.cost column; binning Savings appears to be the intent
savings_bins = pd.cut(df['Savings'], bins, right=False, labels=labels)
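# Peek at how the businesses distribute across the savings bins (illustrative)
print(savings_bins.value_counts())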

from os.path import join
# target_folder is supplied by the CrossCompute runtime when this notebook is
# run as a tool; default to the working directory for manual runs
target_folder = '.'
target_path = join(target_folder, 'locations.csv')
df.to_csv(target_path, index=False)
print('x_geotable_path = %s' % target_path)
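In [ ]:
# A minimal sketch (illustration only, not from the original notebook) closing
# the loop described in the pseudocode: encode the inputs the same way as the
# training data, then ask the fitted knn model for a savings prediction.
# Feature order matches X above: Industry, Program, Postcode, Borough.
def predict_savings(industry_code, program_code, postcode, borough_code):
    return knn.predict([[industry_code, program_code, postcode, borough_code]])[0]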