from pandas import read_csv
# Load the five NYC MTA tables used below, one DataFrame per CSV file.
# The files are read in the order listed and bound to the matching names.
calendar, routes, trips, times, stops = map(read_csv, (
    'datasets/NYC-MTA-Calendar.csv',
    'datasets/NYC-MTA-Routes.csv',
    'datasets/NYC-MTA-Trips.csv',
    'datasets/NYC-MTA-StopTimes.csv',
    'datasets/NYC-MTA-Stops.csv',
))
# Peek at the first calendar record (presumably a notebook display cell).
calendar.loc[0]
# Keep only the service ids whose name ends with the 'WKD' suffix.
weekday_service_ids = [
    service_id
    for service_id in calendar.service_id
    if service_id.endswith('WKD')
]
weekday_service_ids
# Peek at the first route record (presumably a notebook display cell).
routes.loc[0]
# Map each route_id to its route_long_name; for a repeated route_id the
# last row wins, as with any dict built from key/value pairs.
route_name_by_id = dict(zip(routes['route_id'], routes['route_long_name']))
route_name_by_id['3']
# Peek at the first trip record (presumably a notebook display cell).
trips.loc[0]
# Trips whose service_id is one of the weekday service ids.
weekday_trips = trips.loc[trips['service_id'].isin(weekday_service_ids)]
len(weekday_trips)
# Peek at the first stop-time record.
times.loc[0]
# Stop times that belong to one of the weekday trips.
weekday_times = times.loc[
    times['trip_id'].isin(weekday_trips['trip_id'].unique())
]
len(weekday_times)
# Peek at the first stop record.
stops.loc[0]
# List weekday stops on the 5 line
route_id = '5'
print(route_name_by_id[route_id])
# Weekday trips that run on the selected route.
route_5_weekday_trips = weekday_trips.loc[weekday_trips['route_id'] == route_id]
# Join those trips with their stop times (pandas merges on the columns the
# two frames have in common) and collect the distinct stop ids served.
stop_ids = route_5_weekday_trips.merge(weekday_times)['stop_id'].unique()
# Distinct names of the stops with those ids.
stops.loc[stops['stop_id'].isin(stop_ids), 'stop_name'].unique()
# Two ways of picking the trip at position 224: by the label found at that
# position, then directly by position.
route_5_weekday_trips.loc[route_5_weekday_trips.index[224]]
route_5_weekday_trips.iloc[224]
# List weekday stops on the 5 line by departure time
# Use the trip at position 224 of the route's weekday trips as the sample.
route_5_trip = route_5_weekday_trips.iloc[224]
# Stop times recorded for that single trip.
route_5_trip_times = weekday_times.loc[
    weekday_times['trip_id'] == route_5_trip['trip_id']
]
# Attach stop details and keep only the columns of interest.
route_5_stops = route_5_trip_times.merge(stops).loc[
    :, ['departure_time', 'stop_id', 'stop_name']
]
route_5_stops.sort_values(by='departure_time')
# Build a network connecting stops on the 5 line and 6 line
import datetime
import networkx
def make_graph(trip_ids):
    """Build a directed multigraph linking consecutive stops of each trip.

    Nodes are stop names. For every pair of consecutive stops of a trip (in
    the order returned by ``get_trip_stops``) an edge is added from the
    earlier stop to the later one, keyed by the trip id and weighted by the
    number of minutes between the two departure times.

    Args:
        trip_ids: Iterable of trip ids to add to the graph.

    Returns:
        A ``networkx.MultiDiGraph``. Trips with fewer than two stops
        contribute no nodes or edges.
    """
    # NOTE(review): nodes are keyed by stop_name, so distinct stops that
    # share a name end up as one node.
    graph = networkx.MultiDiGraph()
    for trip_id in trip_ids:
        # Materialise the rows so consecutive stops can be paired; pairing
        # an empty or single-row trip simply yields nothing instead of
        # raising StopIteration.
        trip_stops = [stop for _, stop in get_trip_stops(trip_id).iterrows()]
        for last_stop, this_stop in zip(trip_stops, trip_stops[1:]):
            elapsed = (
                parse_departure_time(this_stop)
                - parse_departure_time(last_stop))
            minutes = elapsed.total_seconds() / 60.
            # Key the edge by the trip being processed so that different
            # trips running between the same two stops are kept as
            # parallel edges rather than overwriting one another.
            graph.add_edge(
                last_stop['stop_name'], this_stop['stop_name'],
                key=trip_id, weight=minutes)
    return graph
def get_trip_stops(trip_id):
    """Return the stops of one trip ordered by departure time.

    Selects the rows of the module-level ``times`` table whose ``trip_id``
    equals *trip_id*, merges them with the module-level ``stops`` table (on
    the columns the two frames share) and returns a DataFrame with the
    columns ``departure_time`` and ``stop_name``.

    NOTE(review): the ordering is whatever ``sort_values`` gives for the
    stored ``departure_time`` values; if these are strings the sort is
    lexicographic, not numeric.
    """
    belongs_to_trip = times['trip_id'] == trip_id
    joined = times[belongs_to_trip].merge(stops)
    wanted_columns = ['departure_time', 'stop_name']
    return joined[wanted_columns].sort_values(by='departure_time')
def parse_departure_time(x):
    """Return the departure time of stop record *x* as a ``datetime``.

    ``x['departure_time']`` must be an ``H:MM:SS`` / ``HH:MM:SS`` string.
    The result is anchored on 1900-01-01, the same date ``strptime`` uses
    when a format carries no date, so values for ordinary times are
    unchanged. Unlike ``strptime('%H:%M:%S')``, hours of 24 or more are
    accepted and roll over into the following day(s); GTFS-style feeds use
    such values for trips that run past midnight.

    Args:
        x: Mapping-like record (dict, pandas row, ...) with a
            ``'departure_time'`` entry.

    Returns:
        ``datetime.datetime`` on or after 1900-01-01 00:00:00.

    Raises:
        TypeError: If the departure time is not a string.
        ValueError: If the string is not three ``:``-separated integers or
            a field is out of range.
    """
    text = x['departure_time']
    if not isinstance(text, str):
        raise TypeError(
            'departure_time must be a string, not %s' % type(text).__name__)
    # Unpacking raises ValueError when there are not exactly three fields,
    # and int() raises ValueError for non-numeric fields.
    hours, minutes, seconds = (int(field) for field in text.split(':'))
    if hours < 0 or not 0 <= minutes < 60 or not 0 <= seconds < 60:
        raise ValueError('departure_time out of range: %r' % text)
    # Adding a timedelta lets hours >= 24 spill into the next day.
    return datetime.datetime(1900, 1, 1) + datetime.timedelta(
        hours=hours, minutes=minutes, seconds=seconds)
# Graph built from one trip id on the 5 line and one on the 6 line.
graph = make_graph(
    ['A20120610WKD_101200_5..N04R', 'A20120610WKD_104200_6..N03R'])
# Shortest path between the two stations, then its total weight (edge
# weights are the minutes assigned by make_graph).
networkx.dijkstra_path(
    graph, source='Brooklyn Bridge - City Hall', target='110 St')
networkx.dijkstra_path_length(
    graph, source='Brooklyn Bridge - City Hall', target='110 St')
# Make a graph of the NYC MTA subway system
trip_ids = []
# For each route,
# (note: this loop rebinds the module-level name route_id)
for route_id, route_trips in weekday_trips.groupby('route_id'):
    # Pick a trip (the first one listed for the route)
    trip_ids.append(route_trips.trip_id.values[0])
graph = make_graph(trip_ids)
# Which ten stations are the most connected?
from pandas import Series
# networkx.pagerank_numpy was deprecated in NetworkX 2.6 and removed in 3.0.
# Use it where it still exists, otherwise fall back to networkx.pagerank so
# the ranking keeps working on current releases.
compute_pagerank = getattr(networkx, 'pagerank_numpy', networkx.pagerank)
# Rank stations by PageRank score, highest first, and keep the top ten.
Series(compute_pagerank(graph)).sort_values(ascending=False).head(10)