In [1]:

# CrossCompute
target_folder = '/tmp'

In [2]:

url1 = 'https://data.cityofnewyork.us/api/views/ukdt-xm28/rows.csv'

In [3]:

#input_table_path = 'Value_of_Energy_Cost_Savings_Program_Savings_for_Businesses.csv'

In [4]:

#building_table = pd.read_csv(input_table_path,na_values='n/a')

In [5]:

import pandas as pd
import numpy as np

In [6]:

building_table = pd.read_csv(url1, na_values = 'n/a')

In [7]:

df = building_table[['Company Name','Industry','Industry Description','Company Type', 'Address',
                     'Effective Date','Savings from begining receiving benefits',
                     'Postcode','Borough',
                      'Latitude','Longitude']]

df = df.rename(columns={'Invenstment': 'Investment', 
                        'Industry Description': 'Business', 
                        'Company Type': 'Program', 
                        'Savings from begining receiving benefits': 'Savings'})

df.dropna(axis=0, subset=['Longitude','Latitude'], inplace = True)
df[:3]

Out[7]:

	Company Name	Industry	Business	Program	Address	Effective Date	Savings	Postcode	Borough	Latitude	Longitude
0	139 ACA Realty, Inc.	Commercial	Limousine Service	ICIP	43-23 35th Street	04/07/2008	123975.24	11101	QUEENS	40.745706	-73.929565
1	141 Lake Avenue Realty c/o JR Produce, Inc.	Wholesale/Warehouse/Distribution	Dist. of prepacked salads	ICIP	141 Lake Avenue	12/08/2009	47512.89	10303	STATEN IS	40.633153	-74.150999
2	14-10 123rd Street LLC	Commercial	Electrical Parts Mfg.	ICIP	14-10 123rd Street	03/04/2011	21322.89	11356	QUEENS	40.785144	-73.844833

In [8]:

print(df.groupby('Industry').size())
print('\n')
print(df.groupby('Program').size())
print('\n')
#print(df.groupby('Postcode').size())
#print('\n')
print(df.groupby('Borough').size())

Industry
Commercial                           92
Landlord                              3
Manufacturing                       294
Other                                13
Public Benefit Corp                   1
Wholesale/Warehouse/Distribution    113
dtype: int64


Program
City/State     10
ICAP           27
ICIP          183
IDA            93
Relocator      96
Tenant        107
dtype: int64


Borough
BRONX         79
BROOKLYN     186
MANHATTAN     32
QUEENS       211
STATEN IS      8
dtype: int64

In [9]:

df.dtypes

Out[9]:

Company Name       object
Industry           object
Business           object
Program            object
Address            object
Effective Date     object
Savings           float64
Postcode            int64
Borough            object
Latitude          float64
Longitude         float64
dtype: object

In [10]:

from datetime import date
df['Effective Date'] = pd.to_datetime(df['Effective Date'].str.strip(), format='%m/%d/%Y')
df['Month Count'] = ((pd.to_datetime('2017-12-31') - df['Effective Date']) / np.timedelta64(1, 'M'))
df['Month Count'] = df['Month Count'].astype(int)  
df['Average Monthly Savings'] = (df['Savings'] / df['Month Count']).apply(lambda x: round(x, 2))

In [11]:

df = df.drop(['Company Name',
         'Postcode','Borough','Address',
         'Latitude','Longitude',
         'Effective Date','Month Count','Savings'], axis=1)

In [12]:

df.head()

Out[12]:

	Industry	Business	Program	Average Monthly Savings
0	Commercial	Limousine Service	ICIP	1068.75
1	Wholesale/Warehouse/Distribution	Dist. of prepacked salads	ICIP	494.93
2	Commercial	Electrical Parts Mfg.	ICIP	263.25
3	Wholesale/Warehouse/Distribution	Commercial Storage facility	ICIP	4200.66
4	Manufacturing	Eye glasses	Tenant	2016.42

In [13]:

# below code didnt work for me - Paloma - caused kernel dead repeatedly
url2 = 'https://data.cityofnewyork.us/api/views/5rq2-4hqu/rows.csv'
tree_table = pd.read_csv(url2, na_values='n/a')

In [14]:

# output: (157921, 4)
#input_table_path2 = '2015StreetTreesCensus_TREES.csv'
#table_b = pd.read_csv(input_table_path2, na_values='n/a') 
df2 = tree_table[['tree_id','status',
               #'address','zipcode','boroname', 
               'Latitude','longitude']]
df2 = df2.rename(columns={'longitude': 'Longitude'})
df2.dropna(subset=['Longitude', 'Latitude'], inplace=True)
df2[df2['status'] == 'Alive'].iloc[:3,]          ## filter out "stump" and "dead"
df2.shape

Out[14]:

(225985, 4)

In [15]:

df2 = df2.drop(['status'], axis=1)

In [16]:

df2['Longitude'] = np.round(np.array(df2['Longitude']),6)
df2['Latitude'] = np.round(np.array(df2['Latitude']),6)
#df2['Coordinate']=df2['Longitude'].astype(str)+','+df2['Latitude'].astype(str)
df2.head()

Out[16]:

	tree_id	Latitude	Longitude
0	180683.0	40.723092	-73.844215
1	200540.0	40.794111	-73.818679
2	204026.0	40.717581	-73.936608
3	204337.0	40.713537	-73.934456
4	189565.0	40.666778	-73.975979

In [17]:

df2.dtypes

Out[17]:

tree_id      float64
Latitude     float64
Longitude    float64
dtype: object

In [18]:

# Make kdtree
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES

tree_xys = df2[['Longitude', 'Latitude']].values
tree_xys

/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/__init__.py:65: VisibleDeprecationWarning: PySAL's API will be changed on 2018-12-31. The last release made with this API is version 1.14.4. A preview of the next API version is provided in the `pysal` 2.0 prelease candidate. The API changes and a guide on how to change imports is provided at https://pysal.org/about
  ), VisibleDeprecationWarning)

Out[18]:

array([[-73.844215,  40.723092],
       [-73.818679,  40.794111],
       [-73.936608,  40.717581],
       ...,
       [-73.959231,  40.599107],
       [-73.888072,  40.845787],
       [-73.803615,  40.779056]])

In [19]:

tree_count = len(tree_xys)
tree_count

Out[19]:

In [20]:

bin_tree = KDTree(tree_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
bin_tree

Out[20]:

<pysal.cg.kdtree.Arc_KDTree at 0x7f4ca607b2b0>

In [21]:

# Get buildings within X radius of input address - we might wanna abandon this idea
# def get_buildings(latlon)

In [22]:

radius_in_miles = 0.5

#def get_buildings(latlon)：
def get_tree_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_tree.query(
        xy, k=tree_count, distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    return len(indices)

df['Nearby Tree Count'] = df.apply(get_tree_count, axis=1)
df.head()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   3123             try:
-> 3124                 return libindex.get_value_box(s, key)
   3125             except IndexError:

pandas/_libs/index.pyx in pandas._libs.index.get_value_box()

pandas/_libs/index.pyx in pandas._libs.index.get_value_box()

TypeError: 'str' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-22-2acbf0236b02> in <module>
      9     return len(indices)
     10 
---> 11 df['Nearby Tree Count'] = df.apply(get_tree_count, axis=1)
     12 df.head()

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
   6012                          args=args,
   6013                          kwds=kwds)
-> 6014         return op.get_result()
   6015 
   6016     def applymap(self, func):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in get_result(self)
    140             return self.apply_raw()
    141 
--> 142         return self.apply_standard()
    143 
    144     def apply_empty_result(self):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in apply_standard(self)
    246 
    247         # compute the result using the series generator
--> 248         self.apply_series_generator()
    249 
    250         # wrap results

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/apply.py in apply_series_generator(self)
    275             try:
    276                 for i, v in enumerate(series_gen):
--> 277                     results[i] = self.f(v)
    278                     keys.append(v.name)
    279             except Exception as e:

<ipython-input-22-2acbf0236b02> in get_tree_count(r)
      3 #def get_buildings(latlon)：
      4 def get_tree_count(r):
----> 5     xy = r['Longitude'], r['Latitude']
      6     distances, indices = bin_tree.query(
      7         xy, k=tree_count, distance_upper_bound=radius_in_miles)

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
    765         key = com._apply_if_callable(key, self)
    766         try:
--> 767             result = self.index.get_value(self, key)
    768 
    769             if not is_scalar(result):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   3130                     raise InvalidIndexError(key)
   3131                 else:
-> 3132                     raise e1
   3133             except Exception:  # pragma: no cover
   3134                 raise e1

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   3116         try:
   3117             return self._engine.get_value(s, k,
-> 3118                                           tz=getattr(series.dtype, 'tz', None))
   3119         except KeyError as e1:
   3120             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: ('Longitude', 'occurred at index 0')

In [ ]:

df[:3]

In [ ]:

sav_xys = df[['Longitude', 'Latitude']].values
bin_sav = KDTree(sav_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
sav_count = len(sav_xys)

In [ ]:

def get_sav_average(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = bin_sav.query(
        xy,
        k=len(bin_sav.data),
        distance_upper_bound=radius_in_miles)
    indices = indices[~np.isnan(indices)]
    indices = [int(x) for x in indices]
    selected_sav_table = df.loc[df.index[indices]]
    return selected_sav_table['Savings'].mean()

In [ ]:

# partial_dataset_table = df[:5].copy()
# partial_dataset_table[
#     f'Average Savings of Nearby Buildings Within {radius_in_miles} Miles'
# ] = partial_dataset_table.apply(
#     get_sav_average, axis=1)
# partial_dataset_table[:5]

In [ ]:

dataset_table = df.copy()
dataset_table[
    f'Average Monthly Savings of Nearby Buildings Within {radius_in_miles} Miles'
] = dataset_table.apply(
    get_sav_average, axis=1)
dataset_table[:5]

In [ ]:

target_path = target_folder + '/savings.csv'
dataset_table.to_csv(target_path, index=False)
print(f'prediction_table_path = {target_path}')

In [ ]:

Pay Notebook Creator: Haige Cui	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0

ECSP