# CrossCompute
# target_folder is normally supplied by the CrossCompute framework when this
# script runs as a tool; /tmp is a sensible default for local runs.
target_folder = '/tmp'
import geotable

# Load the TLC taxi zone polygons and the Neighborhood Tabulation Area (NTA)
# polygons directly from their published zip archives.
url1 = "https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip"
taxi = geotable.load(url1)
url2 = "https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nynta_18d.zip"
nta = geotable.load(url2)
nta1 = nta.copy()  # keep a copy for building the final map later
# Sanity checks: visualize the layers and confirm both share a projection,
# since the overlap areas computed below are only meaningful in a common CRS.
# nta.draw()
# nta.iloc[0].geometry_proj4 == taxi.iloc[0].geometry_proj4
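# A lightweight runtime guard along the same lines (hedged sketch; assumes
# the per-row geometry_proj4 attribute behaves as in the check above):
# assert taxi.iloc[0].geometry_proj4 == nta.iloc[0].geometry_proj4, \
#     'taxi zones and NTAs must share a projection before intersecting'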
import pandas as pd
import numpy as np

# For each taxi zone, find the NTA whose polygon overlaps it the most.
approx_nta = pd.Series(dtype=object)
n = nta.geometry_object
for i in range(len(taxi)):
    polygon = taxi.iloc[i].geometry_object
    a = [x.intersection(polygon).area for x in n]
    approx_nta.loc[i] = nta.iloc[np.argmax(a)].NTACode
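# The O(zones x NTAs) loop above is fine at this scale; a vectorized
# alternative is sketched below using geopandas, which is NOT part of this
# pipeline. The 'zip+' URL paths are an assumption that may need adjusting
# for your geopandas/fiona versions.
# import geopandas as gpd
# taxi_gdf = gpd.read_file('zip+' + url1)
# nta_gdf = gpd.read_file('zip+' + url2).to_crs(taxi_gdf.crs)
# pieces = gpd.overlay(taxi_gdf, nta_gdf, how='intersection')
# pieces['overlap_area'] = pieces.geometry.area
# best = pieces.loc[pieces.groupby('LocationID')['overlap_area'].idxmax()]
# approx_nta_alt = best.set_index('LocationID')['NTACode']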
approx_nta[:2]
taxi.iloc[5]
# Pair each taxi zone's best-overlap NTA with its LocationID.
region = approx_nta.to_frame(name='NTA')
# region['Taxi Zone'] = range(1, 264)
region['Taxi Zone'] = taxi.LocationID
df = region.copy()
df[:3]
zone_id = 1
url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv?$select=count(*)&$where=PUlocationID='

def get_ride_count(zone_id):
    # Ask the Socrata API for the number of trips picked up in this zone.
    return pd.read_csv(url + str(zone_id))['count'][0]
get_ride_count(zone_id)
# One HTTP request per taxi zone: simple but slow (263 requests).
df['Taxi Trips'] = df['Taxi Zone'].apply(get_ride_count)
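# A single grouped SODA query could fetch every count in one request
# (hedged sketch, not used here; assumes the API accepts the same
# PUlocationID field name in $select/$group that the URL above uses):
# grouped_url = ('https://data.cityofnewyork.us/resource/ifj4-ept5.csv'
#                '?$select=PUlocationID,count(*)&$group=PUlocationID'
#                '&$limit=50000')
# counts = pd.read_csv(grouped_url).set_index('PUlocationID')['count']
# df['Taxi Trips'] = df['Taxi Zone'].map(counts)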
# A generic paginated loader for Socrata endpoints, kept for reference:
# def load(
#     endpoint_url,
#     selected_columns=None,
#     buffer_size=1000,
#     search_term_by_column=None,
#     **kw,
# ):
#     buffer_url = f'{endpoint_url}?$limit={buffer_size}'
#     if selected_columns:
#         select_string = ','.join(selected_columns)
#         buffer_url += f'&$select={select_string}'
#     for column, search_term in (search_term_by_column or {}).items():
#         # %25 is the URL-encoded % wildcard for the SoQL LIKE clause
#         buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
#     print(buffer_url)
#     tables = []
#     if endpoint_url.endswith('.json'):
#         f = pd.read_json
#     else:
#         f = pd.read_csv
#     t = f(buffer_url, **kw)
#     while len(t):
#         print(len(tables) * buffer_size + len(t))  # running row count
#         tables.append(t)
#         offset = buffer_size * len(tables)
#         t = f(buffer_url + f'&$offset={offset}', **kw)
#     return pd.concat(tables, sort=False)
# url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv'
# t = load(url, buffer_size = 100000, selected_columns=['PUlocationID'])
# t[:3]
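# Note: when paging with $offset as load() does, Socrata recommends adding an
# explicit $order clause; without one, row order between requests is not
# guaranteed to be stable, so pages can overlap or skip rows.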
df[-10:]
# Earlier per-zone version of the count query, kept for reference:
# url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv?$select=count(*)&$where=PUlocationID='
# # for i in range(1, 264):
# for i in range(1, 3):
#     count = pd.read_csv(url + str(i))
#     df['Taxi Trips'][i-1] = count.iloc[0]
# df.describe()
# Import the ACS dataset on median income by NTA.
acs = pd.read_excel("https://www1.nyc.gov/assets/planning/download/office/data-maps/nyc-population/acs/econ_2016acs5yr_nta.xlsx?r=1")
acs = acs.set_index("GeoID")
acs = acs.sort_index()
incomes = acs['MdFamIncE']  # median family income estimate per NTA
incomes
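# Quick consistency check (hedged sketch, not part of the pipeline): confirm
# the ACS GeoIDs cover the NTA codes assigned above before relying on index
# alignment in the join below.
# missing = set(df['NTA']) - set(incomes.index)
# print(len(missing), 'assigned NTA codes missing from the ACS table')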
df1 = df.copy()
df1 = df1.set_index("NTA")
df1[:3]
# Index alignment joins incomes on NTA code (GeoID == NTACode).
df1['Median Income'] = incomes
df1[:3]
# df1.to_csv('trip.csv', index=False)
# nta1['Median Income'] = incomes
nta1 = nta1.set_index('NTACode')
nta1[:3]
df1[:3]
df1.dropna()  # inspection only: dropna() returns a copy, df1 is unchanged
df1[df1.index.duplicated()]  # taxi zones whose best overlap is an already-used NTA
# Keep the first taxi zone per NTA; trips from dropped duplicates are discarded.
df1 = df1[~df1.index.duplicated()]
nta1['Taxi Trips'] = df1['Taxi Trips']  # align on NTACode index
nta1[:3]
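# NTAs that never won a largest-overlap assignment have NaN trip counts here;
# a quick tally (hedged sketch):
# print(nta1['Taxi Trips'].isna().sum(), 'NTAs without an assigned taxi zone')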
map_geotable = nta1.copy()
# fill_greens appears to drive the green shading of each polygon on the map
map_geotable['fill_greens'] = map_geotable['Taxi Trips']
target_path = target_folder + '/choropleth.csv'
map_geotable.to_csv(target_path, index=False)
# CrossCompute picks up printed "name = value" lines as tool outputs
print('a_geotable_path = %s' % target_path)