Predict bus ridership using median income and FHV trips per region




Pay Notebook Creator: Henry Weng0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [1]:
import geotable
# NYC TLC taxi-zone shapefile (the zone ids used by the FHV trip records).
url1 = "https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip"
taxi = geotable.load(url1)
In [2]:
# NYC DCP Neighborhood Tabulation Area (NTA) boundaries, release 18d.
url2 = "https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nynta_18d.zip"
nta = geotable.load(url2)
In [3]:
nta.iloc[0].geometry_proj4 == taxi.iloc[0].geometry_proj4
Out[3]:
True
In [6]:
import pandas as pd
import numpy as np

# Map each taxi zone to the NTA that overlaps it most (largest intersection
# area). Collect codes in a plain list and build the Series once: appending
# through .loc inside the loop re-allocates on every iteration (quadratic),
# and `pd.Series()` with no dtype is deprecated.
nta_geometries = list(nta.geometry_object)
nta_codes = []
for zone_polygon in taxi.geometry_object:
    overlap_areas = [g.intersection(zone_polygon).area for g in nta_geometries]
    nta_codes.append(nta.iloc[np.argmax(overlap_areas)].NTACode)
approx_nta = pd.Series(nta_codes)
Out[6]:
<bound method NDFrame.head of 0      BK88
1      QN99
2      BX31
3      MN28
4      SI48
5      SI14
6      QN70
7      QN99
8      QN48
9      QN76
10     BK27
11     MN99
12     MN25
13     BK31
14     QN47
15     QN46
16     BK75
17     BX05
18     QN43
19     BX06
20     BK29
21     BK28
22     SI05
23     MN09
24     BK38
25     BK88
26     QN10
27     QN35
28     BK19
29     QN10
       ... 
233    MN13
234    BX36
235    MN40
236    MN40
237    MN12
238    MN12
239    BX99
240    BX28
241    BX37
242    MN35
243    MN36
244    SI35
245    MN13
246    BX63
247    BX08
248    MN23
249    BX59
250    SI07
251    QN49
252    QN99
253    BX44
254    BK73
255    BK73
256    BK40
257    QN53
258    BX62
259    QN63
260    MN25
261    MN32
262    MN32
Length: 263, dtype: object>
In [7]:
approx_nta[:2]
Out[7]:
0    BK88
1    QN99
dtype: object
In [8]:
taxi.iloc[5]
Out[8]:
OBJECTID                                                           1
Shape_Leng                                                  0.116357
Shape_Area                                               0.000782307
zone                                                  Newark Airport
LocationID                                                         1
borough                                                          EWR
geometry_object    POLYGON ((933100.9183527103 192536.0856972019,...
geometry_layer                                            taxi_zones
geometry_proj4     +proj=lcc +lat_1=40.66666666666666 +lat_2=41.0...
Name: 0, dtype: object
In [9]:
region = pd.DataFrame(approx_nta, columns = ["NTA"])
In [10]:
# LocationID is the TLC's official 1-based zone id; taxi's row order matches
# region's integer index, so this assignment aligns positionally.
region['Taxi Zone'] = taxi.LocationID
In [28]:
# Work on a copy so the original zone -> NTA mapping in `region` stays intact.
df = region.copy()
df[:3]
Out[28]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
NTA Taxi Zone
0 BK88 1
1 QN99 2
2 BX31 3
In [22]:
zone_id = 1
In [38]:
# SoQL endpoint: the server performs count(*) for one pickup zone, so each
# request downloads only a single-row CSV.
url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv?$select=count(*)&$where=PUlocationID='

def get_ride_count(zone_id):
    """Return the number of FHV trips picked up in taxi zone ``zone_id``.

    Issues one HTTP request per call; `.iloc[0]` reads the single returned
    row positionally instead of relying on the default integer index.
    """
    return pd.read_csv(url + str(zone_id))['count'].iloc[0]
In [30]:
get_ride_count(zone_id)
Out[30]:
18707
In [39]:
df['Taxi Trips'] = df['Taxi Zone'].apply(get_ride_count)
In [37]:
# def load(
#     endpoint_url,
#     selected_columns=None,
#     buffer_size=1000,
#     search_term_by_column=None,
#     **kw,
# ):
#     buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
#     if selected_columns:
#         select_string = ','.join(selected_columns)
#         buffer_url += f'&$select={select_string}'
#     for column, search_term in (search_term_by_column or {}).items():
#         buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
#     print(buffer_url)
#     tables = []
    
#     if endpoint_url.endswith('.json'):
#         f = pd.read_json
#     else:
#         f = pd.read_csv

#     t = f(buffer_url, **kw)
#     while len(t):
#         print(len(tables) * buffer_size + len(t))
#         tables.append(t)
#         offset = buffer_size * len(tables)
#         t = f(buffer_url + f'&$offset={offset}', **kw)
#     return pd.concat(tables, sort=False)

# url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv'
# t = load(url, buffer_size = 100000, selected_columns=['PUlocationID'])
# t[:3]
https://data.cityofnewyork.us/resource/ifj4-ept5.csv?$limit=100000&$select=PUlocationID
85003
184819
284892
383980
485019
582430
680981
784936
884894
984884
1082041
1185071
1284975
1382647
1482715
1582519
1685007
1782428
1884898
1984901
2084906
2184968
2282703
2382343
2482581
2582411
2685342
2782588
2885600
2985318
3085547
3183488
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-37-0b82c58835d2> in <module>
     29 
     30 url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv'
---> 31 t = load(url, buffer_size = 100000, selected_columns=['PUlocationID'])
     32 t[:3]

<ipython-input-37-0b82c58835d2> in load(endpoint_url, selected_columns, buffer_size, search_term_by_column, **kw)
     25         tables.append(t)
     26         offset = buffer_size * len(tables)
---> 27         t = f(buffer_url + f'&$offset={offset}', **kw)
     28     return pd.concat(tables, sort=False)
     29 

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)
    676                     skip_blank_lines=skip_blank_lines)
    677 
--> 678         return _read(filepath_or_buffer, kwds)
    679 
    680     parser_f.__name__ = name

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    422     compression = _infer_compression(filepath_or_buffer, compression)
    423     filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
--> 424         filepath_or_buffer, encoding, compression)
    425     kwds['compression'] = compression
    426 

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/io/common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode)
    193 
    194     if _is_url(filepath_or_buffer):
--> 195         req = _urlopen(filepath_or_buffer)
    196         content_encoding = req.headers.get('Content-Encoding', None)
    197         if content_encoding == 'gzip':

/usr/lib64/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

/usr/lib64/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    524             req = meth(req)
    525 
--> 526         response = self._open(req, data)
    527 
    528         # post-process response

/usr/lib64/python3.6/urllib/request.py in _open(self, req, data)
    542         protocol = req.type
    543         result = self._call_chain(self.handle_open, protocol, protocol +
--> 544                                   '_open', req)
    545         if result:
    546             return result

/usr/lib64/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

/usr/lib64/python3.6/urllib/request.py in https_open(self, req)
   1359         def https_open(self, req):
   1360             return self.do_open(http.client.HTTPSConnection, req,
-> 1361                 context=self._context, check_hostname=self._check_hostname)
   1362 
   1363         https_request = AbstractHTTPHandler.do_request_

/usr/lib64/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1319             except OSError as err: # timeout error
   1320                 raise URLError(err)
-> 1321             r = h.getresponse()
   1322         except:
   1323             h.close()

/usr/lib64/python3.6/http/client.py in getresponse(self)
   1329         try:
   1330             try:
-> 1331                 response.begin()
   1332             except ConnectionError:
   1333                 self.close()

/usr/lib64/python3.6/http/client.py in begin(self)
    295         # read until we get a non-100 response
    296         while True:
--> 297             version, status, reason = self._read_status()
    298             if status != CONTINUE:
    299                 break

/usr/lib64/python3.6/http/client.py in _read_status(self)
    256 
    257     def _read_status(self):
--> 258         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    259         if len(line) > _MAXLINE:
    260             raise LineTooLong("status line")

/usr/lib64/python3.6/socket.py in readinto(self, b)
    584         while True:
    585             try:
--> 586                 return self._sock.recv_into(b)
    587             except timeout:
    588                 self._timeout_occurred = True

/usr/lib64/python3.6/ssl.py in recv_into(self, buffer, nbytes, flags)
    963                   "non-zero flags not allowed in calls to recv_into() on %s" %
    964                   self.__class__)
--> 965             return self.read(nbytes, buffer)
    966         else:
    967             return socket.recv_into(self, buffer, nbytes, flags)

/usr/lib64/python3.6/ssl.py in read(self, len, buffer)
    825             raise ValueError("Read on closed or unwrapped SSL socket.")
    826         try:
--> 827             return self._sslobj.read(len, buffer)
    828         except SSLError as x:
    829             if x.args[0] == SSL_ERROR_EOF and self.suppress_ragged_eofs:

/usr/lib64/python3.6/ssl.py in read(self, len, buffer)
    585         """
    586         if buffer is not None:
--> 587             v = self._sslobj.read(len, buffer)
    588         else:
    589             v = self._sslobj.read(len)

KeyboardInterrupt: 
In [43]:
df[-10:]
Out[43]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
NTA Taxi Zone Taxi Trips
253 BX44 254 362003
254 BK73 255 1130817
255 BK73 256 871085
256 BK40 257 166888
257 QN53 258 254391
258 BX62 259 233808
259 QN63 260 354218
260 MN25 261 383808
261 MN32 262 570956
262 MN32 263 707551
In [14]:
# url = 'https://data.cityofnewyork.us/resource/ifj4-ept5.csv?$select=count(*)&$where=PUlocationID=';
# # for i in range(1,264):
# for i in range(1,3):
#     count = pd.read_csv(url + str(i))
#     df['Taxi Trips'][i-1] = count.iloc[0]
# df.describe()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3077             try:
-> 3078                 return self._engine.get_loc(key)
   3079             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Taxi Trips'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-14-9fe664776ee6> in <module>
      3 for i in range(1,3):
      4     count = pd.read_csv(url + str(i))
----> 5     df['Taxi Trips'][i-1] = count.iloc[0]
      6 df.describe()

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2487         res = cache.get(item)
   2488         if res is None:
-> 2489             values = self._data.get(item)
   2490             res = self._box_item_values(item, values)
   2491             cache[item] = res

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   4113 
   4114             if not isna(item):
-> 4115                 loc = self.items.get_loc(item)
   4116             else:
   4117                 indexer = np.arange(len(self.items))[isna(self.items)]

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3078                 return self._engine.get_loc(key)
   3079             except KeyError:
-> 3080                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3081 
   3082         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Taxi Trips'
In [44]:
# Median family income by NTA from the ACS 2016 5-year economic profile (NYC DCP).
acs = pd.read_excel("https://www1.nyc.gov/assets/planning/download/office/data-maps/nyc-population/acs/econ_2016acs5yr_nta.xlsx?r=1")
In [45]:
acs = acs.set_index("GeoID")
In [49]:
acs = acs.sort_index()
# MdFamIncE: presumably "median family income, estimate" per ACS column
# naming — TODO confirm against the spreadsheet's data dictionary.
incomes = acs['MdFamIncE']
Out[49]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
NTA Taxi Zone Taxi Trips Median Income
0 BK88 1 18707 NaN
1 QN99 2 174 NaN
2 BX31 3 163797 NaN
In [52]:
df1 = df.copy()
In [54]:
# Re-index by NTA code to align with the ACS income series.
df1 = df1.set_index("NTA")
df1[:3]
Out[54]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Taxi Zone Taxi Trips Median Income
NTA
BK88 1 18707 NaN
QN99 2 174 NaN
BX31 3 163797 NaN
In [56]:
df1['Median Income'] = incomes
In [57]:
df1[:3]
Out[57]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Taxi Zone Taxi Trips Median Income
NTA
BK88 1 18707 38187.0
QN99 2 174 78124.0
BX31 3 163797 69099.0
In [60]:
df1.to_csv('trip.csv',index=False)
In [61]:
ls -l
total 136
-rw-r--r--. 1 user user 68470 Feb 13 20:37 'FHV, MTA Monthly .ipynb'
-rw-r--r--. 1 user user 51984 Feb 13 20:38  geotable.ipynb
-rw-r--r--. 1 user user  4989 Feb 13 05:36 'Main Tool.ipynb'
-rw-r--r--. 1 user user  4845 Feb 13 20:38  trip.csv