In [ ]:

#CrossCompute

In [1]:

from io import BytesIO
from zipfile import ZipFile
import requests

url = 'http://web.mta.info/persdashboard/perxml/MTA_Performance_Datall.zip'
content = requests.get(url)

#unzip the content
f = ZipFile(BytesIO(content.content))
f.namelist()

Out[1]:

['MTA_Performance_Agencies.csv',
 'MTA_Performance_LIRR.csv',
 'MTA_Performance_MNR.csv',
 'MTA_Performance_MTABUS.csv',
 'MTA_Performance_NYCT.csv',
 'MTA_Performance_TBTA.csv',
 'Performance-MetaData.xlsx',
 'Performance_LIRR.csv',
 'Performance_LIRR.xml',
 'Performance_MNRR.csv',
 'Performance_MNRR.xml',
 'Performance_MTABUS.csv',
 'Performance_MTABUS.xml',
 'Performance_NYCT.csv',
 'Performance_NYCT.xml',
 'Performance_schema.xsd',
 'Performance_TBTA.csv',
 'Performance_TBTA.xml']

In [2]:

import pandas as pd
df1 = pd.read_csv(f.open('Performance_MTABUS.csv'))

In [4]:

df1 = df1[["INDICATOR_NAME", "PERIOD_YEAR", "PERIOD_MONTH", "MONTHLY_ACTUAL"]]
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1813 entries, 0 to 1812
Data columns (total 4 columns):
INDICATOR_NAME    1813 non-null object
PERIOD_YEAR       1813 non-null int64
PERIOD_MONTH      1813 non-null int64
MONTHLY_ACTUAL    1813 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 56.7+ KB

In [6]:

mta_df = df1[((df1.PERIOD_YEAR > 2015))&(df1.INDICATOR_NAME == "Total Ridership - MTA Bus ")]
mta_df

Out[6]:

	INDICATOR_NAME	PERIOD_YEAR	PERIOD_MONTH	MONTHLY_ACTUAL
334	Total Ridership - MTA Bus	2016	1	9464783.00
335	Total Ridership - MTA Bus	2016	2	9650946.00
336	Total Ridership - MTA Bus	2016	3	11393621.00
337	Total Ridership - MTA Bus	2016	4	10566032.00
338	Total Ridership - MTA Bus	2016	5	11022447.00
339	Total Ridership - MTA Bus	2016	6	10778433.00
340	Total Ridership - MTA Bus	2016	7	10002577.00
341	Total Ridership - MTA Bus	2016	8	10491025.00
342	Total Ridership - MTA Bus	2016	9	10898004.00
343	Total Ridership - MTA Bus	2016	10	10831627.00
344	Total Ridership - MTA Bus	2016	11	10414870.00
345	Total Ridership - MTA Bus	2016	12	10102793.00
346	Total Ridership - MTA Bus	2017	1	9840105.00
347	Total Ridership - MTA Bus	2017	2	9289269.80
348	Total Ridership - MTA Bus	2017	3	10774447.00
349	Total Ridership - MTA Bus	2017	4	9868714.00
350	Total Ridership - MTA Bus	2017	5	10948781.97
351	Total Ridership - MTA Bus	2017	6	10415892.00
352	Total Ridership - MTA Bus	2017	7	9835319.00
353	Total Ridership - MTA Bus	2017	8	10182683.00
354	Total Ridership - MTA Bus	2017	9	10293093.00
355	Total Ridership - MTA Bus	2017	10	10879804.00
356	Total Ridership - MTA Bus	2017	11	10230691.00
357	Total Ridership - MTA Bus	2017	12	9592517.00
358	Total Ridership - MTA Bus	2018	1	9378094.00
359	Total Ridership - MTA Bus	2018	2	9153053.00
360	Total Ridership - MTA Bus	2018	3	10383080.00
361	Total Ridership - MTA Bus	2018	4	10145682.00

In [7]:

dates = pd.to_datetime(dict(year=mta_df.PERIOD_YEAR, month=mta_df.PERIOD_MONTH, day=1))
mta_df.loc[:,'Date'] = dates

/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py:362: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py:543: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s

In [8]:

df2 = pd.read_csv(f.open('Performance_NYCT.csv'), encoding="iso-8859-1")
df2 = df2[["INDICATOR_NAME", "PERIOD_YEAR", "PERIOD_MONTH", "MONTHLY_ACTUAL"]]
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9213 entries, 0 to 9212
Data columns (total 4 columns):
INDICATOR_NAME    9213 non-null object
PERIOD_YEAR       9213 non-null int64
PERIOD_MONTH      9213 non-null int64
MONTHLY_ACTUAL    9213 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 288.0+ KB

In [9]:

nyct_df = df2[((df2.PERIOD_YEAR > 2015))&(df2.INDICATOR_NAME == "Total Ridership - NYCT Bus ")]
nyct_df.head()

Out[9]:

	INDICATOR_NAME	PERIOD_YEAR	PERIOD_MONTH	MONTHLY_ACTUAL
6495	Total Ridership - NYCT Bus	2016	1	49049885.0
6496	Total Ridership - NYCT Bus	2016	2	50327673.0
6497	Total Ridership - NYCT Bus	2016	3	58720672.0
6498	Total Ridership - NYCT Bus	2016	4	54318126.0
6499	Total Ridership - NYCT Bus	2016	5	56608696.0

In [10]:

dates = pd.to_datetime(dict(year=nyct_df.PERIOD_YEAR, month=nyct_df.PERIOD_MONTH, day=1))
nyct_df.loc[:,'Date'] = dates

In [16]:

mta_df['Ridership'] = mta_df['MONTHLY_ACTUAL'].values + nyct_df['MONTHLY_ACTUAL'].values
bus_df = mta_df[['Ridership','Date']]

/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

In [22]:

bus_df.Date

Out[22]:

334   2016-01-01
335   2016-02-01
336   2016-03-01
337   2016-04-01
338   2016-05-01
339   2016-06-01
340   2016-07-01
341   2016-08-01
342   2016-09-01
343   2016-10-01
344   2016-11-01
345   2016-12-01
346   2017-01-01
347   2017-02-01
348   2017-03-01
349   2017-04-01
350   2017-05-01
351   2017-06-01
352   2017-07-01
353   2017-08-01
354   2017-09-01
355   2017-10-01
356   2017-11-01
357   2017-12-01
358   2018-01-01
359   2018-02-01
360   2018-03-01
361   2018-04-01
Name: Date, dtype: datetime64[ns]

In [23]:

import matplotlib.pyplot as plt
%matplotlib inline

fig = plt.plot(bus_df.Date, bus_df.Ridership)
fig.

  File "<ipython-input-23-372b4ae0a5dc>", line 5
    fig.
        ^
SyntaxError: invalid syntax

In [14]:

df3 = pd.read_csv("https://data.cityofnewyork.us/api/views/2v9c-2k7f/rows.csv?accessType=DOWNLOAD")
df3 = df3[["Year", "Month", "Total Dispatched Trips"]]
fhv_df = df3[((df3.Year == 2016) | (df3.Year == 2017) | (df3.Year == 2018))]
fhv_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20100 entries, 2 to 23415
Data columns (total 3 columns):
Year                      20100 non-null int64
Month                     20100 non-null int64
Total Dispatched Trips    20100 non-null int64
dtypes: int64(3)
memory usage: 628.1 KB

In [15]:

fhv_df['Date'] = pd.to_datetime(dict(year=fhv_df.Year, month=fhv_df.Month, day=1))

C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

In [16]:

fhv_df = fhv_df.groupby(['Date']).sum()
fhv_df = fhv_df['Total Dispatched Trips']

In [17]:

fhv_df.plot()

plt.title("Monthly FHV Ridership")
plt.xlabel("Month")
plt.ylabel("FHV Ridership")
plt.show()

In [72]:

import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression
model = LinearRegression()
x = fhv_df[0:28].values
y = bus_df.Ridership.values

In [73]:

from datetime import timedelta
x1 = pd.DataFrame(data=fhv_df[0:28])
x1['b'] = fhv_df[1:29].values
x1['c'] = fhv_df[2:30].values
x1.columns = ['Month - 2','Month - 1','Current Month']
x1 = x1.reset_index()

In [74]:

x1['Month'] = x1['Date'].apply(lambda x: x.month)

In [75]:

x1 = x1.drop(columns = 'Date')

In [77]:

x = x.reshape(28,1)
y = y.reshape(28,1)
model.fit(X=x,y=y)
plt.scatter(x,y)
plt.xlabel("FHV Trips")
plt.ylabel("MTA Bus Trips")
plt.plot(x,model.predict(x),color="red")
plt.show()

In [84]:

from sklearn.model_selection import cross_val_score
cross_val_score(model,x,y,scoring = 'neg_mean_absolute_error').mean()

C:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)

Out[84]:

-2943038.9585895427

In [97]:

model = LinearRegression()
model.fit(x1,y)

Out[97]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [98]:

cross_val_score(model,x1,y,scoring = 'neg_mean_absolute_error').mean()

C:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)

Out[98]:

-2710456.008361019

In [117]:

import statsmodels.api as sm
x = sm.add_constant(x)

In [118]:

reg = sm.OLS(y,x).fit()

In [119]:

print(reg.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.259
Model:                            OLS   Adj. R-squared:                  0.230
Method:                 Least Squares   F-statistic:                     9.086
Date:                Thu, 31 Jan 2019   Prob (F-statistic):            0.00569
Time:                        17:01:33   Log-Likelihood:                -460.46
No. Observations:                  28   AIC:                             924.9
Df Residuals:                      26   BIC:                             927.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       6.893e+07   2.62e+06     26.325      0.000    6.35e+07    7.43e+07
x1            -0.5278      0.175     -3.014      0.006      -0.888      -0.168
==============================================================================
Omnibus:                        1.381   Durbin-Watson:                   1.677
Prob(Omnibus):                  0.501   Jarque-Bera (JB):                0.937
Skew:                          -0.016   Prob(JB):                        0.626
Kurtosis:                       2.105   Cond. No.                     5.95e+07
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.95e+07. This might indicate that there are
strong multicollinearity or other numerical problems.

Pay Notebook Creator: Henry Weng	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0

Predict bus ridership using median income and FHV trips per region