''' Build Training Dataset
'''
# CrossCompute
target_folder = '/tmp'
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv'
#input_table_path = 'Value_of_Energy_Cost_Savings_Program_Savings_for_Businesses.csv'
#building_table = pd.read_csv(input_table_path,na_values='n/a')
import pandas as pd
import numpy as np
building_table = pd.read_csv(url1, na_values = 'n/a')
df = building_table[['Company Name','Industry','Industry Description','Company Type', 'Address',
'Effective Date','Savings from begining receiving benefits',
'Postcode','Borough',
'Latitude','Longitude']]
df = df.rename(columns={'Invenstment': 'Investment',
'Industry Description': 'Business',
'Company Type': 'Program',
'Savings from begining receiving benefits': 'Savings'})
df.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
df[:3]
print(df.groupby('Industry').size())
print('\n')
print(df.groupby('Program').size())
print('\n')
#print(df.groupby('Postcode').size())
#print('\n')
print(df.groupby('Borough').size())
df.dtypes
from datetime import date
df['Effective Date'] = pd.to_datetime(df['Effective Date'].str.strip(), format='%m/%d/%Y')
df['Month Count'] = ((pd.to_datetime('2017-12-31') - df['Effective Date']) / np.timedelta64(1, 'M'))
df['Month Count'] = df['Month Count'].astype(int)
df['Average Monthly Savings'] = (df['Savings'] / df['Month Count']).apply(lambda x: round(x, 2))
df = df.drop(['Company Name',
'Postcode','Borough','Address',
'Latitude','Longitude',
'Effective Date','Month Count','Savings'], axis=1)
df.head()
# below code didnt work for me - Paloma - caused kernel dead repeatedly
url2 = 'https://data.cityofnewyork.us/api/views/5rq2-4hqu/rows.csv'
tree_table = pd.read_csv(url2, na_values='n/a')
# output: (157921, 4)
#input_table_path2 = '2015StreetTreesCensus_TREES.csv'
#table_b = pd.read_csv(input_table_path2, na_values='n/a')
df2 = tree_table[['tree_id','status',
#'address','zipcode','boroname',
'Latitude','longitude']]
df2 = df2.rename(columns={'longitude': 'Longitude'})
df2.dropna(subset=['Longitude', 'Latitude'], inplace=True)
df2[df2['status'] == 'Alive'].iloc[:3,] ## filter out "stump" and "dead"
df2.shape
df2 = df2.drop(['status'], axis=1)
df2['Longitude'] = np.round(np.array(df2['Longitude']),6)
df2['Latitude'] = np.round(np.array(df2['Latitude']),6)
#df2['Coordinate']=df2['Longitude'].astype(str)+','+df2['Latitude'].astype(str)
df2.head()
df2.dtypes
# Make kdtree
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES
tree_xys = df2[['Longitude', 'Latitude']].values
tree_xys
tree_count = len(tree_xys)
tree_count
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
bin_tree
# Get buildings within X radius of input address - we might wanna abandon this idea
# def get_buildings(latlon)
radius_in_miles = 0.5
#def get_buildings(latlon):
def get_tree_count(r):
xy = r['Longitude'], r['Latitude']
distances, indices = bin_tree.query(
xy, k=tree_count, distance_upper_bound=radius_in_miles)
indices = indices[~np.isnan(indices)]
return len(indices)
df['Nearby Tree Count'] = df.apply(get_tree_count, axis=1)
df.head()
df[:3]
sav_xys = df[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
def get_sav_average(r):
xy = r['Longitude'], r['Latitude']
distances, indices = bin_sav.query(
xy,
k=len(bin_sav.data),
distance_upper_bound=radius_in_miles)
indices = indices[~np.isnan(indices)]
indices = [int(x) for x in indices]
selected_sav_table = df.loc[df.index[indices]]
return selected_sav_table['Savings'].mean()
# partial_dataset_table = df[:5].copy()
# partial_dataset_table[
# f'Average Savings of Nearby Buildings Within {radius_in_miles} Miles'
# ] = partial_dataset_table.apply(
# get_sav_average, axis=1)
# partial_dataset_table[:5]
dataset_table = df.copy()
dataset_table[
f'Average Monthly Savings of Nearby Buildings Within {radius_in_miles} Miles'
] = dataset_table.apply(
get_sav_average, axis=1)
dataset_table[:5]
target_path = target_folder + '/savings.csv'
dataset_table.to_csv(target_path, index=False)
print(f'prediction_table_path = {target_path}')