ECSP




Pay Notebook Creator: Haige Cui0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [1]:
# CrossCompute
a_table_path = 'Value_of_Energy_Cost_Savings_Program_Savings_for_Businesses.csv'
b_table_path = 'Jobs_Created_or_Retained_by_Energy_Cost_Savings_Program.csv'


target_folder = '/tmp'
In [2]:
import pandas as pd
a_table = pd.read_csv(a_table_path)
trimmed_a = a_table[['Company Name','BIN','Industry','Industry Description','Company Type',
                     'Effective Date','Savings from begining receiving benefits',
                     'Current fulltime','Job create','Job retain','Postcode','Borough','Latitude','Longitude']]
In [3]:
trimmed_a.iloc[0]
Out[3]:
Company Name                                139 ACA Realty, Inc.
BIN                                                  4.00316e+06
Industry                                              Commercial
Industry Description                           Limousine Service
Company Type                                                ICIP
Effective Date                                          4/7/2008
Savings from begining receiving benefits                  123975
Current fulltime                                             125
Job create                                                   NaN
Job retain                                                   NaN
Postcode                                                   11101
Borough                                                QUEENS   
Latitude                                                 40.7457
Longitude                                               -73.9296
Name: 0, dtype: object
In [16]:
b_table = pd.read_csv(b_table_path,na_values='n/a')

#df.loc[df['Investment'] == 'n/a' ] = np.nan
trimmed_b = b_table[['BIN','Invenstment']]

trimmed_b.iloc[-1]
Out[16]:
BIN                2.00415e+06
Invenstment    $25,769,954.00 
Name: 58, dtype: object
In [15]:
key_column_name = 'BIN'
df = pd.merge(trimmed_a, trimmed_b, on=key_column_name)
df.reindex(columns=['Invenstment','Company Name','BIN','Industry','Industry Description','Company Type','Effective Date','Savings from begining receiving benefits','Current fulltime','Job create','Job retain','Postcode','Borough','Latitude','Longitude'])

df = df.rename(columns={'Invenstment': 'Investment', 
                        'Industry Description': 'Business', 
                        'Company Type': 'Program', 
                        'Savings from begining receiving benefits': 'Accumulative Savings'})

from os.path import join
target_path = join(target_folder, 'table.csv')
df.to_csv(target_path, index=False)
print('m_table_path = ' + target_path)


import numpy as np

df.dropna(axis=0, subset=['BIN'], inplace = True)
df['Investment'] = df['Investment'].str.strip()                 # remove spaces 
df['Investment'] = df['Investment'].replace("$","", regex=True) # remove dollar sign
df['Investment'] = df['Investment'].replace(",","")             # remove thousands

df['Investment'] = pd.to_numeric(df['Investment'])

#df.loc[df['Investment'] == 'n/a' ] = np.nan

df.dropna(axis=0, subset=['Investment'], inplace = True)
df.shape
df.head()
m_table_path = /tmp/table.csv
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
pandas/_libs/src/inference.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "$91,940.00"

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-15-cef7f1de4737> in <module>
     21 df['Investment'] = df['Investment'].replace(",","")             # remove thousands
     22 
---> 23 df['Investment'] = pd.to_numeric(df['Investment'])
     24 
     25 #df.loc[df['Investment'] == 'n/a' ] = np.nan

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    131             coerce_numeric = False if errors in ('ignore', 'raise') else True
    132             values = lib.maybe_convert_numeric(values, set(),
--> 133                                                coerce_numeric=coerce_numeric)
    134 
    135     except Exception:

pandas/_libs/src/inference.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "$91,940.00" at position 12
In [ ]:
 

Question : how to re-adjust column order, e.g set 1st column 'Investment'?

Question : why result show 3 times the same company information

Question : why the wrong investment values were assigned to companies that don't own them?

In [4]:
program_select = """
    Tenant

    Relocator
    ICAP
    ICIP
    Tenant
    IDA
"""
borough = ''
BIN = ''
industry_select = """
    Manufacturing

    Manufacturing
    Commercial
    Wholesale/Warehouse/Distribution
"""
limit = 50
In [13]:
df_gb = df.groupby(['Program','Industry'])['Investment'].sum().reset_index() 
#Question: why values were concatenated instead of summed up?
df_gb['Investment']
df_gb
Out[13]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Program Industry Investment
0 ICAP Commercial $278,070.00
1 ICAP Manufacturing $736,000.00
2 ICIP Commercial $18,000,000.00
3 ICIP Manufacturing $13,132,488.00$41,017,840.00$50,000,000.00$500...
4 ICIP Wholesale/Warehouse/Distribution $13,132,488.00$41,017,840.00$50,000,000.00$2,2...
5 IDA Commercial $2,215,900.00
6 IDA Manufacturing $91,940.00$500,000.00$438,900.00$313,580.00
7 IDA Wholesale/Warehouse/Distribution $13,132,488.00$41,017,840.00$50,000,000.00
8 Relocator Commercial $13,132,488.00$41,017,840.00$50,000,000.00
9 Tenant Manufacturing $13,132,488.00$41,017,840.00$50,000,000.00$1,6...
In [9]:
df.dtypes
Out[9]:
Company Name             object
BIN                     float64
Industry                 object
Business                 object
Program                  object
Effective Date           object
Accumulative Savings    float64
Current fulltime        float64
Job create              float64
Job retain              float64
Postcode                float64
Borough                  object
Latitude                float64
Longitude               float64
Investment               object
dtype: object
In [11]:
len(df['Investment'])
Out[11]:
29
In [ ]: