#CrossCompute
from io import BytesIO
from zipfile import ZipFile
import requests
# Source archive that holds the MTA performance CSV files read further down.
url = 'http://web.mta.info/persdashboard/perxml/MTA_Performance_Datall.zip'
# A timeout keeps a stalled server from hanging the run indefinitely, and
# raise_for_status() reports an HTTP error here instead of letting an error
# page reach ZipFile and fail as a confusing BadZipFile.
content = requests.get(url, timeout=60)
content.raise_for_status()
# Unzip the downloaded bytes in memory. The archive is left open on purpose:
# later statements read individual CSV members from `f`.
f = ZipFile(BytesIO(content.content))
f.namelist()
import pandas as pd
# Load the MTA Bus performance table from the archive; the member handle is
# closed as soon as pandas has finished parsing it.
with f.open('Performance_MTABUS.csv') as mta_member:
    df1 = pd.read_csv(mta_member)
df1 = df1[["INDICATOR_NAME", "PERIOD_YEAR", "PERIOD_MONTH", "MONTHLY_ACTUAL"]]
df1.info()
# Keep only the total-ridership rows after 2015. The literal ends with a
# space and is compared verbatim -- presumably it mirrors the raw CSV value;
# verify before trimming it.
# .copy() makes the filtered frame independent of df1, so adding a column
# below does not write into a slice (no SettingWithCopyWarning).
is_after_2015 = df1.PERIOD_YEAR > 2015
is_total_ridership = df1.INDICATOR_NAME == "Total Ridership - MTA Bus "
mta_df = df1[is_after_2015 & is_total_ridership].copy()
mta_df
# One timestamp per row, pinned to the first day of the reported month.
dates = pd.to_datetime(dict(year=mta_df.PERIOD_YEAR, month=mta_df.PERIOD_MONTH, day=1))
mta_df['Date'] = dates
# Load the NYCT performance table from the archive (read as ISO-8859-1); the
# member handle is closed as soon as pandas has finished parsing it.
with f.open('Performance_NYCT.csv') as nyct_member:
    df2 = pd.read_csv(nyct_member, encoding="iso-8859-1")
df2 = df2[["INDICATOR_NAME", "PERIOD_YEAR", "PERIOD_MONTH", "MONTHLY_ACTUAL"]]
df2.info()
# Keep only the total-ridership rows after 2015. The literal ends with a
# space and is compared verbatim -- presumably it mirrors the raw CSV value;
# verify before trimming it.
# .copy() makes the filtered frame independent of df2, so adding a column
# below does not write into a slice (no SettingWithCopyWarning).
is_after_2015 = df2.PERIOD_YEAR > 2015
is_total_ridership = df2.INDICATOR_NAME == "Total Ridership - NYCT Bus "
nyct_df = df2[is_after_2015 & is_total_ridership].copy()
nyct_df.head()
# One timestamp per row, pinned to the first day of the reported month.
dates = pd.to_datetime(dict(year=nyct_df.PERIOD_YEAR, month=nyct_df.PERIOD_MONTH, day=1))
nyct_df['Date'] = dates
# Combine the two operators' monthly figures into one bus-ridership series.
# The sum below is positional, so first make sure both frames cover exactly
# the same months in the same order; otherwise the totals would silently mix
# different months (or fail with an opaque broadcasting error).
mta_months = mta_df['Date'].to_numpy()
nyct_months = nyct_df['Date'].to_numpy()
if len(mta_months) != len(nyct_months) or (mta_months != nyct_months).any():
    raise ValueError(
        "MTA Bus and NYCT Bus ridership rows do not cover the same months "
        "in the same order; cannot add them row by row"
    )
# assign() returns a new frame carrying the extra column, so nothing is
# written into a filtered slice; mta_df still ends up with 'Ridership'.
mta_df = mta_df.assign(
    Ridership=mta_df['MONTHLY_ACTUAL'].to_numpy() + nyct_df['MONTHLY_ACTUAL'].to_numpy()
)
bus_df = mta_df[['Ridership', 'Date']]
bus_df.Date
import matplotlib.pyplot as plt
# NOTE: the IPython magic `%matplotlib inline` and a truncated `fig.`
# expression used to sit here; neither is valid Python, so the file could not
# be parsed outside a notebook. Inside Jupyter the inline backend is normally
# active already -- re-add the magic in the notebook cell if it is not.
# plt.plot returns the list of drawn lines (not a Figure); the name `fig` is
# kept only so existing references keep working.
fig = plt.plot(bus_df.Date, bus_df.Ridership)
plt.title("Monthly Bus Ridership")
plt.xlabel("Month")
plt.ylabel("Bus Ridership")
# Show (and finish) this figure so the next plot starts on fresh axes instead
# of being drawn on top of this one when run as a plain script.
plt.show()
# For-hire-vehicle (FHV) trip counts, downloaded as CSV from NYC Open Data.
df3 = pd.read_csv("https://data.cityofnewyork.us/api/views/2v9c-2k7f/rows.csv?accessType=DOWNLOAD")
df3 = df3[["Year", "Month", "Total Dispatched Trips"]]
# Restrict to 2016-2018. .copy() detaches the result from df3 so the 'Date'
# column added below is not written into a slice (no SettingWithCopyWarning).
fhv_df = df3[df3.Year.isin([2016, 2017, 2018])].copy()
fhv_df.info()
# One timestamp per row, pinned to the first day of the reported month.
fhv_df['Date'] = pd.to_datetime(dict(year=fhv_df.Year, month=fhv_df.Month, day=1))
# Total trips per month across all rows sharing that month. Only the trips
# column is aggregated; summing Year and Month as well would be meaningless.
# From here on fhv_df is a Series indexed by Date.
fhv_df = fhv_df.groupby('Date')['Total Dispatched Trips'].sum()
fhv_df.plot()
plt.title("Monthly FHV Ridership")
plt.xlabel("Month")
plt.ylabel("FHV Ridership")
plt.show()
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression
# Single-feature linear model: monthly FHV trips -> monthly bus ridership.
model = LinearRegression()
y = bus_df.Ridership.values
# Number of months with a bus-ridership observation. Every FHV window below
# is sized from it rather than from a hard-coded row count.
n_months = len(y)
# The widest window used below ends at row n_months + 2; fail early with a
# clear message instead of a length-mismatch error further down.
if len(fhv_df) < n_months + 2:
    raise ValueError(
        f"need at least {n_months + 2} months of FHV data, got {len(fhv_df)}"
    )
x = fhv_df.iloc[:n_months].values
from datetime import timedelta
# Multi-feature design matrix: three consecutive FHV months per row, taken
# from windows starting at row 0, 1 and 2 of the monthly FHV series.
x1 = pd.DataFrame(data=fhv_df.iloc[:n_months])
x1['b'] = fhv_df.iloc[1:n_months + 1].values
x1['c'] = fhv_df.iloc[2:n_months + 2].values
x1.columns = ['Month - 2','Month - 1','Current Month']
x1 = x1.reset_index()
# 'Month' comes from the Date index of the first window, i.e. the month
# labelled 'Month - 2', not 'Current Month'.
# NOTE(review): confirm this and the row alignment with y are intended.
x1['Month'] = x1['Date'].dt.month
x1 = x1.drop(columns = 'Date')
# scikit-learn expects 2-D inputs: one row per month, one column.
x = x.reshape(-1, 1)
y = y.reshape(-1, 1)
model.fit(X=x,y=y)
# Observations plus the fitted regression line.
plt.scatter(x,y)
plt.xlabel("FHV Trips")
plt.ylabel("MTA Bus Trips")
plt.plot(x,model.predict(x),color="red")
plt.show()
from sklearn.model_selection import cross_val_score
# Both models are scored with the same metric: negated mean absolute error
# (values closer to zero are better).
mae_scoring = 'neg_mean_absolute_error'

# Cross-validated score of the model currently bound to `model` on x.
single_feature_scores = cross_val_score(model, x, y, scoring=mae_scoring)
single_feature_scores.mean()

# Fresh estimator fitted on the multi-column design matrix x1, then scored
# the same way for comparison.
model = LinearRegression()
model.fit(x1, y)
multi_feature_scores = cross_val_score(model, x1, y, scoring=mae_scoring)
multi_feature_scores.mean()
import statsmodels.api as sm
# Ordinary least squares via statsmodels, used for its regression summary.
# add_constant adds an intercept column, and `x` is rebound to that
# augmented matrix.
x = sm.add_constant(x)
reg = sm.OLS(y,x).fit()
print(reg.summary())
# scikit-learn's fit() names its first parameter `X` (capital); the previous
# lowercase `x=` keyword raised a TypeError.
# NOTE(review): x now includes the constant column from add_constant, and
# `model` may have been fitted on a different feature set before this line --
# confirm that refitting on this matrix is what is wanted.
model.fit(X=x,y=y)