''' Build Training Dataset
'''
# CSV download URL for view ukdt-xm28 on data.cityofnewyork.us; it is read
# into building_table further down in this script.
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv?accessType=DOWNLOAD'
import pandas as pd
import numpy as np
def load(
    endpoint_url,
    selected_columns=None,
    buffer_size=1000,
    search_term_by_column=None,
    **kw,
):
    """Download all rows from a paged endpoint and return them as one table.

    The endpoint is queried with ``$limit``/``$offset`` parameters, one page
    of ``buffer_size`` rows at a time, until a page comes back empty.  The
    base query URL and a running row count are printed as progress output.

    Parameters
    ----------
    endpoint_url : str
        Resource URL.  A URL ending in ``.json`` is read with
        ``pandas.read_json``; anything else with ``pandas.read_csv``.
    selected_columns : iterable of str, optional
        Column names sent as a comma-separated ``$select`` parameter.
    buffer_size : int
        Number of rows requested per page.
    search_term_by_column : dict, optional
        Maps a column name to a substring; each pair becomes a
        ``column like "%term%"`` condition.  All conditions are joined with
        ``AND`` inside a single ``$where`` parameter.
    **kw
        Passed through unchanged to the pandas reader.

    Returns
    -------
    pandas.DataFrame
        All pages concatenated in download order.  The per-page index is
        kept (it is not reset).  If the very first page is empty, that
        empty table is returned.
    """
    buffer_url = f'{endpoint_url}?$limit={buffer_size}'
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    # Conditions are combined with AND in one $where clause instead of
    # repeating the $where parameter once per column.
    where_clauses = [
        f'{column}+like+"%25{search_term}%25"'
        for column, search_term in (search_term_by_column or {}).items()
    ]
    if where_clauses:
        buffer_url += '&$where=' + '+AND+'.join(where_clauses)
    print(buffer_url)
    read_table = pd.read_json if endpoint_url.endswith('.json') else pd.read_csv
    # NOTE(review): no $order is sent, so stable paging relies on the
    # server returning rows in a consistent order -- confirm for the API used.
    tables = []
    t = read_table(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = read_table(buffer_url + f'&$offset={offset}', **kw)
    if not tables:
        # pd.concat() raises ValueError on an empty list; hand back the
        # empty first page so callers still receive a DataFrame.
        return t
    return pd.concat(tables, sort=False)
# Building records come from the CSV export at url1; the literal 'n/a' is
# parsed as a missing value.
building_table = pd.read_csv(url1, na_values='n/a')

# Tree records are paged through the resource endpoint with load().
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
# 'tree_dbh' is requested too because get_tree_average() averages that
# column; without it the lookup raises KeyError.
selected_columns = 'tree_id', 'tree_dbh', 'Latitude', 'Longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)

# Keep only the building columns used downstream, shorten the long source
# headers, and drop rows that lack a BIN or a coordinate.
df_buildings = (
    building_table[[
        'Company Name', 'BIN', 'Industry', 'Industry Description',
        'Company Type', 'Address', 'Effective Date',
        'Savings from begining receiving benefits',
        'Postcode', 'Borough', 'Latitude', 'Longitude',
    ]]
    .rename(columns={
        'Industry Description': 'Business',
        'Company Type': 'Program',
        'Address': 'address',
        # The misspelling matches the header in the source data.
        'Savings from begining receiving benefits': 'Savings',
    })
    .dropna(axis=0, subset=['BIN', 'Longitude', 'Latitude'])
)
# Build a KD-tree over the tree coordinates so the trees near a given point
# can be looked up.
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES
# Coordinates are stored as (longitude, latitude) pairs, one row per tree.
tree_xys = tree_table[['Longitude', 'Latitude']].values
# Notebook-style display of the array; has no effect when run as a script.
tree_xys
# NOTE(review): despite its name, bin_tree indexes tree locations.
# The tree is built with the 'Arc' metric and RADIUS_EARTH_MILES, so query
# distances are presumably arc distances in miles -- confirm against the
# pysal documentation.
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
tree_count = len(tree_xys)
# Disabled draft: count the trees within a fixed radius of each building.
# radius_in_miles = 0.5
# def get_tree_count(r):
#     xy = r['Longitude'], r['Latitude']
#     distances, indices = bin_tree.query(
#         xy, k=tree_count, distance_upper_bound=radius_in_miles)
#     indices = indices[~np.isnan(indices)]
#     return len(indices)
# df_buildings['Nearby Tree Count'] = df_buildings.apply(get_tree_count, axis=1)
# Notebook-style preview; has no effect when run as a script.
df_buildings.head()
'''
'''
METERS_PER_MILE = 1609.344
# NOTE(review): no search radius was defined anywhere before its first use.
# 800 m approximates the 0.5 mile radius of the disabled tree-count draft;
# confirm the intended value.
radius_in_meters = 800


def get_tree_average(r):
    """Return the mean 'tree_dbh' of the trees near one building row.

    Parameters
    ----------
    r : mapping or pandas.Series
        A building row providing 'Longitude' and 'Latitude'.

    Returns
    -------
    float
        Mean of 'tree_dbh' over the rows of ``tree_table`` whose location
        lies within ``radius_in_meters`` of the building; NaN when no tree
        is in range.
    """
    xy = r['Longitude'], r['Latitude']
    # bin_tree was built with RADIUS_EARTH_MILES, so its distance bound is
    # expressed in miles; convert the meter radius before querying.
    distances, indices = bin_tree.query(
        xy,
        k=len(bin_tree.data),
        distance_upper_bound=radius_in_meters / METERS_PER_MILE)
    # Neighbor slots beyond the radius come back with an infinite distance
    # and an out-of-range index, so filter on the distances.  The boolean
    # mask also flattens a (1, k) result to one dimension.
    in_range = np.isfinite(np.asarray(distances, dtype=float))
    positions = np.asarray(indices)[in_range].astype(int)
    # The tree was built from tree_table rows in order, so the results are
    # row positions rather than index labels.
    selected_tree_table = tree_table.iloc[positions]
    return selected_tree_table['tree_dbh'].mean()


# Try the computation on the first five buildings only.
partial_dataset_table = df_buildings[:5].copy()
# NOTE(review): the column label mentions building savings, but the value
# stored is the mean tree_dbh of nearby trees -- confirm the intended label.
partial_dataset_table[
    f'Average Savings of Nearby Buildings Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_tree_average, axis=1)