FHV_BUS_Trips




Pay Notebook Creator: Ning Wei0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [147]:
# CrossCompute
target_folder = '/tmp'
In [95]:
import geotable
url1 = "https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip"
taxi = geotable.load(url1)
url2 = "https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nynta_18d.zip"
nta = geotable.load(url2)
nta1 = nta
In [96]:
#nta.draw()
In [97]:
#nta.iloc[0].geometry_proj4 == taxi.iloc[0].geometry_proj4
In [98]:
import pandas as pd
import numpy as np
approx_nta = pd.Series()
n = nta.geometry_object
for i in range(0, len(taxi)):
    polygon = taxi.iloc[i].geometry_object
    a = [x.intersection(polygon).area for x in n]
    approx_nta.loc[i]= nta.iloc[np.argmax(a)].NTACode
In [99]:
approx_nta[:2]
Out[99]:
0    BK88
1    QN99
dtype: object
In [100]:
taxi.iloc[5]
Out[100]:
OBJECTID                                                           6
Shape_Leng                                                  0.150491
Shape_Area                                               0.000606461
zone                                         Arrochar/Fort Wadsworth
LocationID                                                         6
borough                                                Staten Island
geometry_object    POLYGON ((966568.7466657609 158679.854687795, ...
geometry_layer                                            taxi_zones
geometry_proj4     +proj=lcc +lat_1=40.66666666666666 +lat_2=41.0...
Name: 5, dtype: object
In [101]:
region = pd.DataFrame(approx_nta, columns = ["NTA"])
In [102]:
# region['Taxi Zone'] = range(1,264)
region['Taxi Zone'] = taxi.LocationID
In [103]:
df = region.copy()
df[:3]
Out[103]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
NTA Taxi Zone
0 BK88 1
1 QN99 2
2 BX31 3
In [104]:
zone_id = 1
In [105]:
url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv?$select=count(*)&$where=PUlocationID=';
def get_ride_count(zone_id):
    return pd.read_csv(url + str(zone_id))['count'][0]
In [106]:
get_ride_count(zone_id)
Out[106]:
18707
In [107]:
df['Taxi Trips'] = df['Taxi Zone'].apply(get_ride_count)
In [108]:
# def load(
#     endpoint_url,
#     selected_columns=None,
#     buffer_size=1000,
#     search_term_by_column=None,
#     **kw,
# ):
#     buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
#     if selected_columns:
#         select_string = ','.join(selected_columns)
#         buffer_url += f'&$select={select_string}'
#     for column, search_term in (search_term_by_column or {}).items():
#         buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
#     print(buffer_url)
#     tables = []
    
#     if endpoint_url.endswith('.json'):
#         f = pd.read_json
#     else:
#         f = pd.read_csv

#     t = f(buffer_url, **kw)
#     while len(t):
#         print(len(tables) * buffer_size + len(t))
#         tables.append(t)
#         offset = buffer_size * len(tables)
#         t = f(buffer_url + f'&$offset={offset}', **kw)
#     return pd.concat(tables, sort=False)

# url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv'
# t = load(url, buffer_size = 100000, selected_columns=['PUlocationID'])
# t[:3]
In [109]:
df[-10:]
Out[109]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
NTA Taxi Zone Taxi Trips
253 BX44 254 362003
254 BK73 255 1130817
255 BK73 256 871085
256 BK40 257 166888
257 QN53 258 254391
258 BX62 259 233808
259 QN63 260 354218
260 MN25 261 383808
261 MN32 262 570956
262 MN32 263 707551
In [110]:
# url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv?$select=count(*)&$where=PUlocationID=';
# # for i in range(1,264):
# for i in range(1,3):
#     count = pd.read_csv(url + str(i))
#     df['Taxi Trips'][i-1] = count.iloc[0]
# df.describe()
In [111]:
#Import ACS dataset on median income by NTA
acs = pd.read_excel("https://www1.nyc.gov/assets/planning/download/office/data-maps/nyc-population/acs/econ_2016acs5yr_nta.xlsx?r=1")
In [112]:
acs = acs.set_index("GeoID")
In [113]:
acs = acs.sort_index()
incomes = acs['MdFamIncE']
In [114]:
incomes
Out[114]:
GeoID
BK09    177363.0
BK17     69275.0
BK19     53728.0
BK21     36671.0
BK23     62717.0
BK25     62837.0
BK26     55414.0
BK27     67616.0
BK28     57211.0
BK29     54960.0
BK30     61651.0
BK31     80079.0
BK32     44195.0
BK33    119440.0
BK34     35810.0
BK35     47967.0
BK37    146514.0
BK38    102181.0
BK40    111990.0
BK41     55736.0
BK42     54186.0
BK43     62604.0
BK44     69634.0
BK45     98144.0
BK46     68126.0
BK50     71138.0
BK58     77466.0
BK60     53129.0
BK61     43714.0
BK63     50225.0
          ...   
QN61     44926.0
QN62     55993.0
QN63     56077.0
QN66     90495.0
QN68     32411.0
QN70     61469.0
QN71     50107.0
QN72     73032.0
QN76     68079.0
QN98         NaN
QN99     78124.0
SI01    102303.0
SI05     88444.0
SI07     99548.0
SI08     64186.0
SI11    100128.0
SI12     66072.0
SI14     83357.0
SI22     59182.0
SI24    103850.0
SI25     99285.0
SI28     78431.0
SI32    102690.0
SI35     91550.0
SI36     82122.0
SI37     65940.0
SI45     90917.0
SI48     96734.0
SI54    102148.0
SI99         NaN
Name: MdFamIncE, Length: 195, dtype: float64
In [115]:
df1 = df.copy()
In [116]:
df1 = df1.set_index("NTA")
df1[:3]
Out[116]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Taxi Zone Taxi Trips
NTA
BK88 1 18707
QN99 2 174
BX31 3 163797
In [117]:
df1['Median Income'] = incomes
In [118]:
df1[:3]
Out[118]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Taxi Zone Taxi Trips Median Income
NTA
BK88 1 18707 38187.0
QN99 2 174 78124.0
BX31 3 163797 69099.0
In [119]:
#df1.to_csv('trip.csv',index=False)
In [120]:
#nta1['Median Income'] = incomes
In [121]:
nta1 = nta1.set_index('NTACode')
In [123]:
nta1[:3]
Out[123]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
BoroCode BoroName CountyFIPS NTAName Shape_Leng Shape_Area geometry_object geometry_layer geometry_proj4
NTACode
BK88 3 Brooklyn 047 Borough Park 39247.228028 5.400502e+07 POLYGON ((990897.9000244141 169268.1207885742,... nynta +proj=lcc +lat_1=40.66666666666666 +lat_2=41.0...
QN51 4 Queens 081 Murray Hill 33266.904861 5.248828e+07 POLYGON ((1038593.459228516 221913.3550415039,... nynta +proj=lcc +lat_1=40.66666666666666 +lat_2=41.0...
QN27 4 Queens 081 East Elmhurst 19816.712318 1.972685e+07 POLYGON ((1022728.275024414 217530.8082275391,... nynta +proj=lcc +lat_1=40.66666666666666 +lat_2=41.0...
In [124]:
df1[:3]
Out[124]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Taxi Zone Taxi Trips Median Income
NTA
BK88 1 18707 38187.0
QN99 2 174 78124.0
BX31 3 163797 69099.0
In [132]:
df1.dropna()
Out[132]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Taxi Zone Taxi Trips Median Income
NTA
BK88 1 18707 38187.0
QN99 2 174 78124.0
BX31 3 163797 69099.0
MN28 4 482278 35703.0
SI48 5 17136 96734.0
SI14 6 31369 83357.0
QN70 7 1002109 61469.0
QN99 8 6560 78124.0
QN48 9 76735 85282.0
QN76 10 235165 68079.0
BK27 11 101297 67616.0
MN25 13 606377 196589.0
BK31 14 503026 80079.0
QN47 15 64970 93634.0
QN46 16 194964 90008.0
BK75 17 908486 42162.0
BX05 18 387167 35391.0
QN43 19 56137 88555.0
BX06 20 253036 28859.0
BK29 21 257318 54960.0
BK28 22 303555 57211.0
SI05 23 85426 88444.0
MN09 24 182624 98523.0
BK38 25 602703 102181.0
BK88 26 479977 38187.0
QN10 27 3673 102242.0
QN35 28 177292 67217.0
BK19 29 185729 53728.0
QN10 30 2714 102242.0
BX99 31 53787 52499.0
... ... ... ...
MN13 234 1551214 154989.0
BX36 235 402334 26424.0
MN40 236 905989 200000.0
MN40 237 1054207 200000.0
MN12 238 810409 160993.0
MN12 239 846216 160993.0
BX99 240 75129 52499.0
BX28 241 324463 45684.0
BX37 242 317786 47301.0
MN35 243 571055 50178.0
MN36 244 751893 44903.0
SI35 245 60550 91550.0
MN13 246 1241571 154989.0
BX63 247 427982 28914.0
BX08 248 187756 32977.0
MN23 249 1106833 197200.0
BX59 250 183467 50925.0
SI07 251 47145 99548.0
QN49 252 116749 92461.0
QN99 253 15315 78124.0
BX44 254 362003 47365.0
BK73 255 1130817 63867.0
BK73 256 871085 63867.0
BK40 257 166888 111990.0
QN53 258 254391 68456.0
BX62 259 233808 66916.0
QN63 260 354218 56077.0
MN25 261 383808 196589.0
MN32 262 570956 165223.0
MN32 263 707551 165223.0
<p>251 rows × 3 columns</p>
In [135]:
df1[df1.index.duplicated()]
Out[135]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Taxi Zone Taxi Trips Median Income
NTA
QN99 8 6560 78124.0
BK88 26 479977 38187.0
QN10 30 2714 102242.0
MN99 43 402928 NaN
MN15 50 595983 118304.0
BK33 52 145663 119440.0
BK33 54 74004 119440.0
QN25 56 277831 47892.0
BX10 58 19785 84240.0
BX99 59 14093 52499.0
BK38 65 523070 102181.0
BK38 66 449821 102181.0
MN25 87 760502 196589.0
MN25 88 254730 196589.0
MN13 90 875506 154989.0
QN99 93 57047 78124.0
QN99 96 18895 78124.0
SI05 99 3084 88444.0
MN99 103 0 NaN
MN99 103 0 NaN
MN99 103 0 NaN
BK99 111 7988 106943.0
MN23 114 827980 197200.0
MN99 120 41538 NaN
MN99 128 25495 NaN
QN98 138 1349062 NaN
MN31 141 835081 189296.0
MN14 143 665701 200000.0
MN24 144 990423 191870.0
MN27 148 1285344 35780.0
... ... ... ...
BX10 183 83984 84240.0
BX99 184 20479 52499.0
MN17 186 682736 193129.0
BK99 190 99312 106943.0
QN68 193 114321 32411.0
MN99 194 15047 NaN
BK33 195 217065 119440.0
QN10 201 29359 102242.0
MN31 202 52494 189296.0
QN99 207 1413 78124.0
MN25 209 225103 196589.0
BK17 210 308606 69275.0
MN24 211 684450 191870.0
QN31 226 568996 72573.0
MN19 229 874687 200000.0
MN17 230 1180166 193129.0
MN24 231 1358037 191870.0
MN28 232 541380 35703.0
MN19 233 818838 200000.0
MN13 234 1551214 154989.0
MN40 237 1054207 200000.0
MN12 238 810409 160993.0
MN12 239 846216 160993.0
BX99 240 75129 52499.0
MN13 246 1241571 154989.0
MN23 249 1106833 197200.0
QN99 253 15315 78124.0
BK73 256 871085 63867.0
MN25 261 383808 196589.0
MN32 263 707551 165223.0
<p>69 rows × 3 columns</p>
In [138]:
df1 = df1[~df1.index.duplicated()]
In [139]:
nta1['Taxi Trips'] = df1['Taxi Trips']
In [142]:
nta1[:3]
Out[142]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
BoroCode BoroName CountyFIPS NTAName Shape_Leng Shape_Area geometry_object geometry_layer geometry_proj4 Taxi Trips
NTACode
BK88 3 Brooklyn 047 Borough Park 39247.228028 5.400502e+07 POLYGON ((990897.9000244141 169268.1207885742,... nynta +proj=lcc +lat_1=40.66666666666666 +lat_2=41.0... 18707.0
QN51 4 Queens 081 Murray Hill 33266.904861 5.248828e+07 POLYGON ((1038593.459228516 221913.3550415039,... nynta +proj=lcc +lat_1=40.66666666666666 +lat_2=41.0... 178341.0
QN27 4 Queens 081 East Elmhurst 19816.712318 1.972685e+07 POLYGON ((1022728.275024414 217530.8082275391,... nynta +proj=lcc +lat_1=40.66666666666666 +lat_2=41.0... 201942.0
In [143]:
Map_geotable = nta1.copy()
In [145]:
Map_geotable['fill_greens'] = Map_geotable['Taxi Trips']
In [148]:
target_path = target_folder + '/choropleth.csv'
Map_geotable.to_csv(target_path, index=False)
print('a_geotable_path = %s' % target_path)
a_geotable_path = /tmp/choropleth.csv
In [ ]: