Introduction to Computational Analysis




Pay Notebook Creator: Roy Hyunjin Han0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [1]:
from pandas import read_csv
calendar = read_csv('datasets/NYC-MTA-Calendar.csv')
routes = read_csv('datasets/NYC-MTA-Routes.csv')
trips = read_csv('datasets/NYC-MTA-Trips.csv')
times = read_csv('datasets/NYC-MTA-StopTimes.csv')
stops = read_csv('datasets/NYC-MTA-Stops.csv')
In [23]:
calendar.loc[0]
Out[23]:
service_id    A20120610WKD
monday                   1
tuesday                  1
wednesday                1
thursday                 1
friday                   1
saturday                 0
sunday                   0
start_date        20120610
end_date          20121231
Name: 0, dtype: object
In [3]:
weekday_service_ids = list(filter(lambda x: x.endswith('WKD'), calendar.service_id))
weekday_service_ids
Out[3]:
['A20120610WKD', 'B20120610WKD', 'R20100324WKD']
In [24]:
routes.loc[0]
Out[24]:
route_id                                                            1
agency_id                                                    MTA NYCT
route_short_name                                                    1
route_long_name                             Broadway - 7 Avenue Local
route_desc          Trains operate between 242 St in the Bronx and...
route_type                                                          1
route_url              http://www.mta.info/nyct/service/pdf/t1cur.pdf
route_color                                                    EE352E
route_text_color                                                  NaN
Name: 0, dtype: object
In [5]:
route_name_by_id = {x['route_id']: x['route_long_name'] for index, x in routes.iterrows()}
route_name_by_id['3']
Out[5]:
'7 Avenue Express'
In [25]:
trips.loc[0]
Out[25]:
route_id                                   1
service_id                      A20120610WKD
trip_id          A20120610WKD_000800_1..S03R
trip_headsign                    SOUTH FERRY
direction_id                               1
block_id                                 NaN
shape_id                             1..S03R
Name: 0, dtype: object
In [7]:
weekday_trips = trips[trips.service_id.isin(weekday_service_ids)]
len(weekday_trips)
Out[7]:
7953
In [26]:
times.loc[0]
Out[26]:
trip_id                A20120610WKD_000800_1..S03R
arrival_time                              00:08:00
departure_time                            00:08:00
stop_id                                       101S
stop_sequence                                    1
stop_headsign                                  NaN
pickup_type                                      0
drop_off_type                                    0
shape_dist_traveled                            NaN
Name: 0, dtype: object
In [9]:
weekday_times = times[times.trip_id.isin(weekday_trips.trip_id.unique())]
len(weekday_times)
Out[9]:
221440
In [27]:
stops.loc[0]
Out[27]:
stop_id                                   101
stop_code                                 NaN
stop_name         Van Cortlandt Park - 242 St
stop_desc                                 NaN
stop_lat                              40.8892
stop_lon                             -73.8986
zone_id                                   NaN
stop_url                                  NaN
location_type                               1
parent_station                            NaN
Name: 0, dtype: object
In [11]:
# List weekday stops on the 5 line
route_id = '5'
print(route_name_by_id[route_id])
route_5_weekday_trips = weekday_trips[weekday_trips.route_id == route_id]
stop_ids = route_5_weekday_trips.merge(weekday_times).stop_id.unique()
stops[stops.stop_id.isin(stop_ids)].stop_name.unique()
Lexington Avenue Express
Out[11]:
array(['Nereid Av', '233 St', '225 St', '219 St', 'Gun Hill Rd',
       'Burke Av', 'Allerton Av', 'Pelham Pkwy', 'Bronx Park East',
       'E 180 St', 'West Farms Sq - E Tremont Av', '174 St', 'Freeman St',
       'Simpson St', 'Intervale Av', 'Prospect Av', 'Jackson Av',
       '3 Av - 149 St', '149 St - Grand Concourse', 'Nevins St',
       'Atlantic Av', 'Franklin Av', 'President St', 'Sterling St',
       'Winthrop St', 'Church Av', 'Beverly Rd', 'Newkirk Av',
       'Flatbush Av - Brooklyn College', 'Nostrand Av', 'Kingston Av',
       'Crown Hts - Utica Av', 'Sutter Av - Rutland Rd', 'Saratoga Av',
       'Rockaway Av', 'Junius St', 'Pennsylvania Av', 'Van Siclen Av',
       'New Lots Av', '138 St - Grand Concourse', 'Fulton St', 'Wall St',
       'Bowling Green', 'Borough Hall', 'Eastchester - Dyre Av',
       'Baychester Av', 'Morris Park', '125 St', '86 St', '59 St',
       'Grand Central - 42 St', '14 St - Union Sq',
       'Brooklyn Bridge - City Hall'], dtype=object)
In [30]:
route_5_weekday_trips.loc[route_5_weekday_trips.index[224]]
Out[30]:
route_id                                   5
service_id                      A20120610WKD
trip_id          A20120610WKD_101200_5..N04R
trip_headsign          EASTCHESTER - DYRE AV
direction_id                               0
block_id                                 NaN
shape_id                             5..N04R
Name: 3594, dtype: object
In [31]:
route_5_weekday_trips.iloc[224]
Out[31]:
route_id                                   5
service_id                      A20120610WKD
trip_id          A20120610WKD_101200_5..N04R
trip_headsign          EASTCHESTER - DYRE AV
direction_id                               0
block_id                                 NaN
shape_id                             5..N04R
Name: 3594, dtype: object
In [28]:
# List weekday stops on the 5 line by departure time
route_5_trip = route_5_weekday_trips.iloc[224]
route_5_trip_times = weekday_times[weekday_times.trip_id == route_5_trip['trip_id']]
route_5_stops = route_5_trip_times.merge(stops)[['departure_time', 'stop_id', 'stop_name']]
route_5_stops.sort_values('departure_time')
Out[28]:
<style> .dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; } </style>
departure_time stop_id stop_name
0 16:52:00 247N Flatbush Av - Brooklyn College
1 16:53:42 246N Newkirk Av
2 16:55:06 245N Beverly Rd
3 16:56:30 244N Church Av
4 16:57:48 243N Winthrop St
5 16:59:12 242N Sterling St
6 17:00:30 241N President St
7 17:04:30 239N Franklin Av
8 17:08:30 235N Atlantic Av
9 17:10:00 234N Nevins St
10 17:12:12 423N Borough Hall
11 17:16:30 420N Bowling Green
12 17:18:06 419N Wall St
13 17:19:42 418N Fulton St
14 17:21:30 640N Brooklyn Bridge - City Hall
15 17:27:24 635N 14 St - Union Sq
16 17:32:30 631N Grand Central - 42 St
17 17:35:18 629N 59 St
18 17:39:06 626N 86 St
19 17:44:00 621N 125 St
20 17:47:30 416N 138 St - Grand Concourse
21 17:50:00 222N 149 St - Grand Concourse
22 17:52:42 221N 3 Av - 149 St
23 18:01:00 213N E 180 St
24 18:04:12 505N Morris Park
25 18:05:54 504N Pelham Pkwy
26 18:08:36 503N Gun Hill Rd
27 18:11:00 502N Baychester Av
28 18:13:30 501N Eastchester - Dyre Av
In [13]:
# Build a network connecting stops on the 5 line and 6 line
import datetime
import networkx

def make_graph(trip_ids):
    graph = networkx.MultiDiGraph()
    for trip_id in trip_ids:
        trip_stops = get_trip_stops(trip_id)
        stop_generator = trip_stops.iterrows()
        last_stop = next(stop_generator)[1]
        for index, this_stop in stop_generator:
            timedelta = parse_departure_time(this_stop) - parse_departure_time(last_stop)
            minutes = timedelta.total_seconds() / 60.
            graph.add_edge(
                last_stop['stop_name'], this_stop['stop_name'],
                key=route_5_trip['trip_id'], weight=minutes)
            last_stop = this_stop
    return graph

def get_trip_stops(trip_id):
    trip_times = times[times.trip_id == trip_id]
    return trip_times.merge(stops)[['departure_time', 'stop_name']].sort_values('departure_time')

def parse_departure_time(x):
    return datetime.datetime.strptime(x['departure_time'], '%H:%M:%S')

graph = make_graph([
    'A20120610WKD_101200_5..N04R',
    'A20120610WKD_104200_6..N03R',
])
In [14]:
networkx.dijkstra_path(graph, 'Brooklyn Bridge - City Hall', '110 St')
Out[14]:
['Brooklyn Bridge - City Hall',
 '14 St - Union Sq',
 'Grand Central - 42 St',
 '59 St',
 '86 St',
 '96 St',
 '103 St',
 '110 St']
In [15]:
networkx.dijkstra_path_length(graph, 'Brooklyn Bridge - City Hall', '110 St')
Out[15]:
22.900000000000002
In [16]:
# Make a graph of the NYC MTA subway system
trip_ids = []
# For each route,
for route_id, route_trips in weekday_trips.groupby('route_id'):
    # Pick a trip
    trip_ids.append(route_trips.trip_id.values[0])
graph = make_graph(trip_ids)
In [17]:
# Which ten stations are the most connected?
from pandas import Series
Series(networkx.pagerank_numpy(graph)).sort_values(ascending=False)[:10]
Out[17]:
7 Av                       0.013775
125 St                     0.012154
Canal St                   0.011880
59 St - Columbus Circle    0.009785
Fulton St                  0.009686
50 St                      0.009423
Times Sq - 42 St           0.009214
14 St - Union Sq           0.008764
23 St                      0.008433
Queensboro Plaza           0.008257
dtype: float64