ECSP




Pay Notebook Creator: Haige Cui0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0

''' Build Training Dataset

  1. Load and import data
  2. Explore dataset
  3. Prepare training data
  4. Test

'''

In [1]:
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv?accessType=DOWNLOAD'
In [2]:
import pandas as pd
import numpy as np
def load(
    endpoint_url,
    selected_columns=None,
    buffer_size=1000,
    search_term_by_column=None,
    **kw,
):
    buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    for column, search_term in (search_term_by_column or {}).items():
        buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
    print(buffer_url)
    tables = []
    
    if endpoint_url.endswith('.json'):
        f = pd.read_json
    else:
        f = pd.read_csv

    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, sort=False)
In [3]:
building_table = pd.read_csv(url1, na_values = 'n/a')
#df = building_table[[]]
In [4]:
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'Latitude', 'Longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
https://data.cityofnewyork.us/resource/nwxe-4ae8.csv?$limit=100000&$select=tree_id,Latitude,Longitude
100000
200000
300000
400000
500000
600000
683788
In [5]:
tree_table.head()
Out[5]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Latitude Longitude tree_id
0 40.723092 -73.844215 180683
1 40.794111 -73.818679 200540
2 40.717581 -73.936608 204026
3 40.713537 -73.934456 204337
4 40.666778 -73.975979 189565
In [6]:
building_table.head()
Out[6]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company ID Company Name company contact company email company phone Address City State Postcode Industry ... Savings from begining receiving benefits Borough Latitude Longitude Community Board Council District Census Tract BIN BBL NTA
0 3410 139 ACA Realty, Inc. Eitan Chandally barbara@dial7.com 7.187076e+09 43-23 35th Street Long Island City NY 11101 Commercial ... 123975.24 QUEENS 40.745706 -73.929565 2.0 26.0 179.0 4003160.0 4.002220e+09 Hunters Point-Sunnyside-West Maspeth ...
1 3531 141 Lake Avenue Realty c/o JR Produce, Inc. Josef Raz jrproduce@gmail.com 7.183709e+09 141 Lake Avenue Staten Island NY 10303 Wholesale/Warehouse/Distribution ... 47512.89 STATEN IS 40.633153 -74.150999 1.0 49.0 239.0 5146740.0 5.011610e+09 Mariner's Harbor-Arlington-Port Ivory-Granitev...
2 3465 14-10 123rd Street LLC Danica/Ivan Drazic ddrazic@atjelectrical.com 7.183210e+09 14-10 123rd Street College Point NY 11356 Commercial ... 21322.89 QUEENS 40.785144 -73.844833 7.0 19.0 929.0 4098344.0 4.040850e+09 College Point ...
3 3800 183 Lorriane Street LLC Tom Sapienza tsapienza@KLCNY.com 2.128406e+09 183 Lorraine Street Brooklyn NY 11231 Wholesale/Warehouse/Distribution ... 105016.49 BROOKLYN 40.673106 -74.002300 6.0 38.0 53.0 3336622.0 3.005720e+09 Carroll Gardens-Columbia Street-Red Hook ...
4 3482 21st Century Optics, Inc. Ralph Woythaler rwoythaler@21st centuryoptics.com 7.183922e+09 47-00 33rd Street Lond Island City NY 11101 Manufacturing ... 215757.20 QUEENS 40.742386 -73.932148 2.0 26.0 199.0 4003447.0 4.002520e+09 Hunters Point-Sunnyside-West Maspeth ...
<p>5 rows × 27 columns</p>
In [7]:
df_buildings = building_table[['Company Name','BIN','Industry','Industry Description','Company Type', 'Address',
                     'Effective Date','Savings from begining receiving benefits',
                     'Postcode','Borough','Latitude','Longitude']]

df_buildings = df_buildings.rename(columns={'Invenstment': 'Investment', 
                        'Industry Description': 'Business', 
                        'Company Type': 'Program', 
                        'Address': 'address',
                        'Savings from begining receiving benefits': 'Savings'})

df_buildings.dropna(axis=0, subset=['BIN','Longitude','Latitude'], inplace = True)
df_buildings.head()
Out[7]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name BIN Industry Business Program address Effective Date Savings Postcode Borough Latitude Longitude
0 139 ACA Realty, Inc. 4003160.0 Commercial Limousine Service ICIP 43-23 35th Street 04/07/2008 123975.24 11101 QUEENS 40.745706 -73.929565
1 141 Lake Avenue Realty c/o JR Produce, Inc. 5146740.0 Wholesale/Warehouse/Distribution Dist. of prepacked salads ICIP 141 Lake Avenue 12/08/2009 47512.89 10303 STATEN IS 40.633153 -74.150999
2 14-10 123rd Street LLC 4098344.0 Commercial Electrical Parts Mfg. ICIP 14-10 123rd Street 03/04/2011 21322.89 11356 QUEENS 40.785144 -73.844833
3 183 Lorriane Street LLC 3336622.0 Wholesale/Warehouse/Distribution Commercial Storage facility ICIP 183 Lorraine Street 11/06/2015 105016.49 11231 BROOKLYN 40.673106 -74.002300
4 21st Century Optics, Inc. 4003447.0 Manufacturing Eye glasses Tenant 47-00 33rd Street 01/07/2009 215757.20 11101 QUEENS 40.742386 -73.932148
In [8]:
# Make kdtree
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES

tree_xys = tree_table[['Longitude', 'Latitude']].values
tree_xys
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/__init__.py:65: VisibleDeprecationWarning: PySAL's API will be changed on 2018-12-31. The last release made with this API is version 1.14.4. A preview of the next API version is provided in the `pysal` 2.0 prelease candidate. The API changes and a guide on how to change imports is provided at https://pysal.org/about
  ), VisibleDeprecationWarning)
Out[8]:
array([[-73.84421522,  40.72309177],
       [-73.81867946,  40.79411067],
       [-73.9366077 ,  40.71758074],
       ...,
       [-74.13651724,  40.62076153],
       [-73.90311472,  40.85082819],
       [-73.78752646,  40.73216525]])
In [9]:
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
tree_count = len(tree_xys)
In [15]:
radius_in_miles = 0.5

# #def get_buildings(latlon):
def get_tree_count(r):
     xy = r['Longitude'], r['Latitude']
     distances, indices = bin_tree.query(
         xy, k=tree_count, distance_upper_bound=radius_in_miles)
     indices = indices[~np.isnan(indices)]
     return len(indices)
df_buildings['Total Tree Count within 0.5 Miles'] = df_buildings.apply(get_tree_count, axis=1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-3a8acb7af80b> in <module>
      8      indices = indices[~np.isnan(indices)]
      9      return len(indices)
---> 10 df_buildings['Total Tree Count within 0.5 Miles'] = df_buildings.apply(get_tree_count, axis=1)

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
   6012                          args=args,
   6013                          kwds=kwds)
-> 6014         return op.get_result()
   6015 
   6016     def applymap(self, func):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in get_result(self)
    140             return self.apply_raw()
    141 
--> 142         return self.apply_standard()
    143 
    144     def apply_empty_result(self):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in apply_standard(self)
    246 
    247         # compute the result using the series generator
--> 248         self.apply_series_generator()
    249 
    250         # wrap results

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in apply_series_generator(self)
    275             try:
    276                 for i, v in enumerate(series_gen):
--> 277                     results[i] = self.f(v)
    278                     keys.append(v.name)
    279             except Exception as e:

<ipython-input-15-3a8acb7af80b> in get_tree_count(r)
      5      xy = r['Longitude'], r['Latitude']
      6      distances, indices = bin_tree.query(
----> 7          xy, k=tree_count, distance_upper_bound=radius_in_miles)
      8      indices = indices[~np.isnan(indices)]
      9      return len(indices)

NameError: ("name 'radius_in_miles' is not defined", 'occurred at index 0')
In [11]:
df_buildings.head()
Out[11]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name BIN Industry Business Program address Effective Date Savings Postcode Borough Latitude Longitude
0 139 ACA Realty, Inc. 4003160.0 Commercial Limousine Service ICIP 43-23 35th Street 04/07/2008 123975.24 11101 QUEENS 40.745706 -73.929565
1 141 Lake Avenue Realty c/o JR Produce, Inc. 5146740.0 Wholesale/Warehouse/Distribution Dist. of prepacked salads ICIP 141 Lake Avenue 12/08/2009 47512.89 10303 STATEN IS 40.633153 -74.150999
2 14-10 123rd Street LLC 4098344.0 Commercial Electrical Parts Mfg. ICIP 14-10 123rd Street 03/04/2011 21322.89 11356 QUEENS 40.785144 -73.844833
3 183 Lorriane Street LLC 3336622.0 Wholesale/Warehouse/Distribution Commercial Storage facility ICIP 183 Lorraine Street 11/06/2015 105016.49 11231 BROOKLYN 40.673106 -74.002300
4 21st Century Optics, Inc. 4003447.0 Manufacturing Eye glasses Tenant 47-00 33rd Street 01/07/2009 215757.20 11101 QUEENS 40.742386 -73.932148
In [12]:
'''


'''
Out[12]:
'\n\n\n'
In [13]:
def get_tree_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy,
        k=len(bin_tree.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_tree_table = tree_table.loc[tree_index[indices]]
    return selected_tree_table['tree_dbh'].mean()
In [14]:
partial_dataset_table = df_buildings[:5].copy()
partial_dataset_table[
    f'Average Savings of Nearby Buildings Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_tree_average, axis=1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-c5e40567b37e> in <module>
      3     f'Average Savings of Nearby Buildings Within {radius_in_meters} Meters'
      4 ] = partial_dataset_table.apply(
----> 5     get_tree_average, axis=1)

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
   6012                          args=args,
   6013                          kwds=kwds)
-> 6014         return op.get_result()
   6015 
   6016     def applymap(self, func):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in get_result(self)
    140             return self.apply_raw()
    141 
--> 142         return self.apply_standard()
    143 
    144     def apply_empty_result(self):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in apply_standard(self)
    246 
    247         # compute the result using the series generator
--> 248         self.apply_series_generator()
    249 
    250         # wrap results

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in apply_series_generator(self)
    275             try:
    276                 for i, v in enumerate(series_gen):
--> 277                     results[i] = self.f(v)
    278                     keys.append(v.name)
    279             except Exception as e:

<ipython-input-13-ecab1e0b5024> in get_tree_average(r)
      1 def get_tree_average(r):
      2     xy = r['Longitude'], r['Latitude']
----> 3     distances, indices = tree_tree.query(
      4         xy,
      5         k=len(tree_tree.data),

NameError: ("name 'tree_tree' is not defined", 'occurred at index 0')
In [ ]: