# CrossCompute
a_table_path = 'Value_of_Energy_Cost_Savings_Program_Savings_for_Businesses.csv'
b_table_path = 'Jobs_Created_or_Retained_by_Energy_Cost_Savings_Program.csv'
target_folder = '/tmp'
import pandas as pd
a_table = pd.read_csv(a_table_path)
trimmed_a = a_table[['Company Name','BIN','Industry','Industry Description','Company Type',
'Effective Date','Savings from begining receiving benefits',
'Current fulltime','Job create','Job retain','Postcode','Borough','Latitude','Longitude']]
trimmed_a.iloc[0]
b_table = pd.read_csv(b_table_path,na_values='n/a')
#df.loc[df['Investment'] == 'n/a' ] = np.nan
trimmed_b = b_table[['BIN','Invenstment']]
trimmed_b.iloc[-1]
key_column_name = 'BIN'
df = pd.merge(trimmed_a, trimmed_b, on=key_column_name)
df.reindex(columns=['Invenstment','Company Name','BIN','Industry','Industry Description','Company Type','Effective Date','Savings from begining receiving benefits','Current fulltime','Job create','Job retain','Postcode','Borough','Latitude','Longitude'])
df = df.rename(columns={'Invenstment': 'Investment',
'Industry Description': 'Business',
'Company Type': 'Program',
'Savings from begining receiving benefits': 'Accumulative Savings'})
from os.path import join
target_path = join(target_folder, 'table.csv')
df.to_csv(target_path, index=False)
print('m_table_path = ' + target_path)
import numpy as np
df.dropna(axis=0, subset=['BIN'], inplace = True)
df['Investment'] = df['Investment'].str.strip() # remove spaces
df['Investment'] = df['Investment'].replace("$","", regex=True) # remove dollar sign
df['Investment'] = df['Investment'].replace(",","") # remove thousands
df['Investment'] = pd.to_numeric(df['Investment'])
#df.loc[df['Investment'] == 'n/a' ] = np.nan
df.dropna(axis=0, subset=['Investment'], inplace = True)
df.shape
df.head()
Question : why the wrong investment values were assigned to companies that don't own them?
program_select = """
Tenant
Relocator
ICAP
ICIP
Tenant
IDA
"""
borough = ''
BIN = ''
industry_select = """
Manufacturing
Manufacturing
Commercial
Wholesale/Warehouse/Distribution
"""
limit = 50
df_gb = df.groupby(['Program','Industry'])['Investment'].sum().reset_index()
#Question: why values were concatenated instead of summed up?
df_gb['Investment']
df_gb
df.dtypes
len(df['Investment'])