ECSP




Pay Notebook Creator: Haige Cui0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0

''' Build Training Dataset

  1. Load and import data
  2. Explore dataset
  3. Prepare training data
  4. Test

'''

In [1]:
# CrossCompute
target_folder = '/tmp'
In [2]:
url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv'
In [3]:
#input_table_path = 'Value_of_Energy_Cost_Savings_Program_Savings_for_Businesses.csv'
In [4]:
#building_table = pd.read_csv(input_table_path,na_values='n/a')
In [5]:
import pandas as pd
import numpy as np
In [6]:
building_table = pd.read_csv(url1, na_values = 'n/a')
In [7]:
df = building_table[['Company Name','Industry','Industry Description','Company Type', 'Address',
                     'Effective Date','Savings from begining receiving benefits',
                     'Postcode','Borough',
                      'Latitude','Longitude']]

df = df.rename(columns={'Invenstment': 'Investment', 
                        'Industry Description': 'Business', 
                        'Company Type': 'Program', 
                        'Savings from begining receiving benefits': 'Savings'})

df.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
df[:3]
Out[7]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Company Name Industry Business Program Address Effective Date Savings Postcode Borough Latitude Longitude
0 139 ACA Realty, Inc. Commercial Limousine Service ICIP 43-23 35th Street 04/07/2008 123975.24 11101 QUEENS 40.745706 -73.929565
1 141 Lake Avenue Realty c/o JR Produce, Inc. Wholesale/Warehouse/Distribution Dist. of prepacked salads ICIP 141 Lake Avenue 12/08/2009 47512.89 10303 STATEN IS 40.633153 -74.150999
2 14-10 123rd Street LLC Commercial Electrical Parts Mfg. ICIP 14-10 123rd Street 03/04/2011 21322.89 11356 QUEENS 40.785144 -73.844833
In [8]:
print(df.groupby('Industry').size())
print('\n')
print(df.groupby('Program').size())
print('\n')
#print(df.groupby('Postcode').size())
#print('\n')
print(df.groupby('Borough').size())
Industry
Commercial                           92
Landlord                              3
Manufacturing                       294
Other                                13
Public Benefit Corp                   1
Wholesale/Warehouse/Distribution    113
dtype: int64


Program
City/State     10
ICAP           27
ICIP          183
IDA            93
Relocator      96
Tenant        107
dtype: int64


Borough
BRONX         79
BROOKLYN     186
MANHATTAN     32
QUEENS       211
STATEN IS      8
dtype: int64
In [9]:
df.dtypes
Out[9]:
Company Name       object
Industry           object
Business           object
Program            object
Address            object
Effective Date     object
Savings           float64
Postcode            int64
Borough            object
Latitude          float64
Longitude         float64
dtype: object
In [10]:
from datetime import date
df['Effective Date'] = pd.to_datetime(df['Effective Date'].str.strip(), format='%m/%d/%Y')
df['Month Count'] = ((pd.to_datetime('2017-12-31') - df['Effective Date']) / np.timedelta64(1, 'M'))
df['Month Count'] = df['Month Count'].astype(int)  
df['Average Monthly Savings'] = (df['Savings'] / df['Month Count']).apply(lambda x: round(x, 2))
In [11]:
df = df.drop(['Company Name',
         'Postcode','Borough','Address',
         'Latitude','Longitude',
         'Effective Date','Month Count','Savings'], axis=1)
In [12]:
df.head()
Out[12]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Industry Business Program Average Monthly Savings
0 Commercial Limousine Service ICIP 1068.75
1 Wholesale/Warehouse/Distribution Dist. of prepacked salads ICIP 494.93
2 Commercial Electrical Parts Mfg. ICIP 263.25
3 Wholesale/Warehouse/Distribution Commercial Storage facility ICIP 4200.66
4 Manufacturing Eye glasses Tenant 2016.42
In [13]:
# below code didnt work for me - Paloma - caused kernel dead repeatedly
url2 = 'https://data.cityofnewyork.us/api/views/5rq2-4hqu/rows.csv'
tree_table = pd.read_csv(url2, na_values='n/a') 
# below codes didn't work for me - Paloma # Load trees # selected_columns = 'tree_id', 'Latitude', 'Longitude' # buffer_size = 1000 # tree_table = load(url2, selected_columns, buffer_size) # tree_table[:3]
In [14]:
# output: (157921, 4)
#input_table_path2 = '2015StreetTreesCensus_TREES.csv'
#table_b = pd.read_csv(input_table_path2, na_values='n/a') 
df2 = tree_table[['tree_id','status',
               #'address','zipcode','boroname', 
               'Latitude','longitude']]
df2 = df2.rename(columns={'longitude': 'Longitude'})
df2.dropna(subset=['Longitude', 'Latitude'], inplace=True)
df2[df2['status'] == 'Alive'].iloc[:3,]          ## filter out "stump" and "dead"
df2.shape
Out[14]:
(225985, 4)
In [15]:
df2 = df2.drop(['status'], axis=1)
In [16]:
df2['Longitude'] = np.round(np.array(df2['Longitude']),6)
df2['Latitude'] = np.round(np.array(df2['Latitude']),6)
#df2['Coordinate']=df2['Longitude'].astype(str)+','+df2['Latitude'].astype(str)
df2.head()
Out[16]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
tree_id Latitude Longitude
0 180683.0 40.723092 -73.844215
1 200540.0 40.794111 -73.818679
2 204026.0 40.717581 -73.936608
3 204337.0 40.713537 -73.934456
4 189565.0 40.666778 -73.975979
In [17]:
df2.dtypes
Out[17]:
tree_id      float64
Latitude     float64
Longitude    float64
dtype: object
In [18]:
# Make kdtree
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES

tree_xys = df2[['Longitude', 'Latitude']].values
tree_xys
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/__init__.py:65: VisibleDeprecationWarning: PySAL's API will be changed on 2018-12-31. The last release made with this API is version 1.14.4. A preview of the next API version is provided in the `pysal` 2.0 prelease candidate. The API changes and a guide on how to change imports is provided at https://pysal.org/about
  ), VisibleDeprecationWarning)
Out[18]:
array([[-73.844215,  40.723092],
       [-73.818679,  40.794111],
       [-73.936608,  40.717581],
       ...,
       [-73.959231,  40.599107],
       [-73.888072,  40.845787],
       [-73.803615,  40.779056]])
In [19]:
tree_count = len(tree_xys)
tree_count
Out[19]:
225985
In [20]:
bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
bin_tree
Out[20]:
<pysal.cg.kdtree.Arc_KDTree at 0x7f4ca607b2b0>
In [21]:
# Get buildings within X radius of input address - we might wanna abandon this idea
# def get_buildings(latlon)
In [22]:
radius_in_miles = 0.5

#def get_buildings(latlon):
def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    return len(indices)

df['Nearby Tree Count'] = df.apply(get_tree_count, axis=1)
df.head()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   3123             try:
-> 3124                 return libindex.get_value_box(s, key)
   3125             except IndexError:

pandas/_libs/index.pyx in pandas._libs.index.get_value_box()

pandas/_libs/index.pyx in pandas._libs.index.get_value_box()

TypeError: 'str' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-22-2acbf0236b02> in <module>
      9     return len(indices)
     10 
---> 11 df['Nearby Tree Count'] = df.apply(get_tree_count, axis=1)
     12 df.head()

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
   6012                          args=args,
   6013                          kwds=kwds)
-> 6014         return op.get_result()
   6015 
   6016     def applymap(self, func):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in get_result(self)
    140             return self.apply_raw()
    141 
--> 142         return self.apply_standard()
    143 
    144     def apply_empty_result(self):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in apply_standard(self)
    246 
    247         # compute the result using the series generator
--> 248         self.apply_series_generator()
    249 
    250         # wrap results

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in apply_series_generator(self)
    275             try:
    276                 for i, v in enumerate(series_gen):
--> 277                     results[i] = self.f(v)
    278                     keys.append(v.name)
    279             except Exception as e:

<ipython-input-22-2acbf0236b02> in get_tree_count(r)
      3 #def get_buildings(latlon):
      4 def get_tree_count(r):
----> 5     xy = r['Longitude'], r['Latitude']
      6     distances, indices = bin_tree.query(
      7         xy, k=tree_count, distance_upper_bound=radius_in_miles)

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
    765         key = com._apply_if_callable(key, self)
    766         try:
--> 767             result = self.index.get_value(self, key)
    768 
    769             if not is_scalar(result):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   3130                     raise InvalidIndexError(key)
   3131                 else:
-> 3132                     raise e1
   3133             except Exception:  # pragma: no cover
   3134                 raise e1

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   3116         try:
   3117             return self._engine.get_value(s, k,
-> 3118                                           tz=getattr(series.dtype, 'tz', None))
   3119         except KeyError as e1:
   3120             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: ('Longitude', 'occurred at index 0')
In [ ]:
 
In [ ]:
df[:3]
In [ ]:
sav_xys = df[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)
In [ ]:
def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_sav_table = df.loc[df.index[indices]]
    return selected_sav_table['Savings'].mean()
In [ ]:
# partial_dataset_table = df[:5].copy()
# partial_dataset_table[
#     f'Average Savings of Nearby Buildings Within {radius_in_miles} Miles'
# ] = partial_dataset_table.apply(
#     get_sav_average, axis=1)
# partial_dataset_table[:5]
In [ ]:
dataset_table = df.copy()
dataset_table[
    f'Average Monthly Savings of Nearby Buildings Within {radius_in_miles} Miles'
] = dataset_table.apply(
    get_sav_average, axis=1)
dataset_table[:5]
In [ ]:
target_path = target_folder + '/savings.csv'
dataset_table.to_csv(target_path, index=False)
print(f'prediction_table_path = {target_path}')
In [ ]: