In [1]:

# CrossCompute
a_table_path = 'Value_of_Energy_Cost_Savings_Program_Savings_for_Businesses.csv'
b_table_path = 'Jobs_Created_or_Retained_by_Energy_Cost_Savings_Program.csv'


target_folder = '/tmp'

In [2]:

import pandas as pd
a_table = pd.read_csv(a_table_path)
trimmed_a = a_table[['Company Name','BIN','Industry','Industry Description','Company Type',
                     'Effective Date','Savings from begining receiving benefits',
                     'Current fulltime','Job create','Job retain','Postcode','Borough','Latitude','Longitude']]

In [3]:

trimmed_a.iloc[0]

Out[3]:

Company Name                                139 ACA Realty, Inc.
BIN                                                  4.00316e+06
Industry                                              Commercial
Industry Description                           Limousine Service
Company Type                                                ICIP
Effective Date                                          4/7/2008
Savings from begining receiving benefits                  123975
Current fulltime                                             125
Job create                                                   NaN
Job retain                                                   NaN
Postcode                                                   11101
Borough                                                QUEENS   
Latitude                                                 40.7457
Longitude                                               -73.9296
Name: 0, dtype: object

In [16]:

b_table = pd.read_csv(b_table_path,na_values='n/a')

#df.loc[df['Investment'] == 'n/a' ] = np.nan
trimmed_b = b_table[['BIN','Invenstment']]

trimmed_b.iloc[-1]

Out[16]:

BIN                2.00415e+06
Invenstment    $25,769,954.00 
Name: 58, dtype: object

In [15]:

key_column_name = 'BIN'
df = pd.merge(trimmed_a, trimmed_b, on=key_column_name)
df.reindex(columns=['Invenstment','Company Name','BIN','Industry','Industry Description','Company Type','Effective Date','Savings from begining receiving benefits','Current fulltime','Job create','Job retain','Postcode','Borough','Latitude','Longitude'])

df = df.rename(columns={'Invenstment': 'Investment', 
                        'Industry Description': 'Business', 
                        'Company Type': 'Program', 
                        'Savings from begining receiving benefits': 'Accumulative Savings'})

from os.path import join
target_path = join(target_folder, 'table.csv')
df.to_csv(target_path, index=False)
print('m_table_path = ' + target_path)


import numpy as np

df.dropna(axis=0, subset=['BIN'], inplace = True)
df['Investment'] = df['Investment'].str.strip()                 # remove spaces 
df['Investment'] = df['Investment'].replace("$","", regex=True) # remove dollar sign
df['Investment'] = df['Investment'].replace(",","")             # remove thousands

df['Investment'] = pd.to_numeric(df['Investment'])

#df.loc[df['Investment'] == 'n/a' ] = np.nan

df.dropna(axis=0, subset=['Investment'], inplace = True)
df.shape
df.head()

m_table_path = /tmp/table.csv

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
pandas/_libs/src/inference.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "$91,940.00"

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-15-cef7f1de4737> in <module>
     21 df['Investment'] = df['Investment'].replace(",","")             # remove thousands
     22 
---> 23 df['Investment'] = pd.to_numeric(df['Investment'])
     24 
     25 #df.loc[df['Investment'] == 'n/a' ] = np.nan

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    131             coerce_numeric = False if errors in ('ignore', 'raise') else True
    132             values = lib.maybe_convert_numeric(values, set(),
--> 133                                                coerce_numeric=coerce_numeric)
    134 
    135     except Exception:

pandas/_libs/src/inference.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "$91,940.00" at position 12

In [ ]:

In [4]:

program_select = """
    Tenant

    Relocator
    ICAP
    ICIP
    Tenant
    IDA
"""
borough = ''
BIN = ''
industry_select = """
    Manufacturing

    Manufacturing
    Commercial
    Wholesale/Warehouse/Distribution
"""
limit = 50

In [13]:

df_gb = df.groupby(['Program','Industry'])['Investment'].sum().reset_index() 
#Question: why values were concatenated instead of summed up?
df_gb['Investment']
df_gb

Out[13]:

	Program	Industry	Investment
0	ICAP	Commercial	$278,070.00
1	ICAP	Manufacturing	$736,000.00
2	ICIP	Commercial	$18,000,000.00
3	ICIP	Manufacturing	$13,132,488.00$41,017,840.00$50,000,000.00$500...
4	ICIP	Wholesale/Warehouse/Distribution	$13,132,488.00$41,017,840.00$50,000,000.00$2,2...
5	IDA	Commercial	$2,215,900.00
6	IDA	Manufacturing	$91,940.00$500,000.00$438,900.00$313,580.00
7	IDA	Wholesale/Warehouse/Distribution	$13,132,488.00$41,017,840.00$50,000,000.00
8	Relocator	Commercial	$13,132,488.00$41,017,840.00$50,000,000.00
9	Tenant	Manufacturing	$13,132,488.00$41,017,840.00$50,000,000.00$1,6...

In [9]:

df.dtypes

Out[9]:

Company Name             object
BIN                     float64
Industry                 object
Business                 object
Program                  object
Effective Date           object
Accumulative Savings    float64
Current fulltime        float64
Job create              float64
Job retain              float64
Postcode                float64
Borough                  object
Latitude                float64
Longitude               float64
Investment               object
dtype: object

In [11]:

len(df['Investment'])

Out[11]:

In [ ]:

ECSP

Question : how to re-adjust column order, e.g set 1st column 'Investment'?¶

Question : why result show 3 times the same company information¶

Pay Notebook Creator: Haige Cui	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0