In [1]:

# crosscompute
# Path of the CSV that is loaded with pd.read_csv and scored by the fitted model.
Test_Data_Path = 'Testing_Data.csv'
# Folder into which the resulting updated_table.csv is written.
target_folder = '/tmp'

In [2]:

import gzip
from invisibleroads_macros.disk import uncompress
from os.path import exists
from urllib.request import urlretrieve

def download(target_path, source_url):
    """Fetch source_url into target_path unless target_path already exists.

    The data is first written to a sibling ``<target_path>.part`` file and
    only renamed to target_path once the transfer has finished. An
    interrupted download therefore never leaves a truncated file behind
    that a later call would mistake for a complete, cached copy.

    Parameters
    ----------
    target_path : str or path-like
        Where the downloaded file should end up.
    source_url : str
        URL handed to urllib.request.urlretrieve.

    Returns
    -------
    The target_path argument, unchanged.

    Raises
    ------
    Whatever urlretrieve raises (e.g. urllib.error.URLError); in that case
    neither target_path nor the temporary ``.part`` file is left on disk.
    """
    import os

    if exists(target_path):
        # Already downloaded earlier; reuse the cached copy.
        return target_path
    partial_path = os.fspath(target_path) + '.part'
    try:
        urlretrieve(source_url, partial_path)
        # Atomic rename: target_path only ever appears fully written.
        os.replace(partial_path, target_path)
    finally:
        # Only true when the download or the rename failed.
        if exists(partial_path):
            os.remove(partial_path)
    return target_path

def download_zip(target_folder, source_url):
    """Download an archive next to target_folder and unpack it there.

    The archive is stored at ``target_folder + '.zip'`` via download(), and
    the resulting path is handed to uncompress() together with
    target_folder. The return value is whatever uncompress() returns
    (presumably the extraction folder -- confirm against
    invisibleroads_macros).
    """
    return uncompress(
        download(target_folder + '.zip', source_url),
        target_folder)
            
def download_gz(target_path, source_url):
    """Download a gzip file and write its decompressed content to target_path.

    The compressed file is stored at ``target_path + '.gz'`` via download().
    Its content is then streamed into target_path, so the whole payload is
    never held in memory at once, and both file handles are closed
    deterministically even if decompression fails part-way.

    Parameters
    ----------
    target_path : str
        Destination of the decompressed data.
    source_url : str
        URL of the ``.gz`` file.

    Returns
    -------
    None
    """
    import shutil

    archive_path = download(target_path + '.gz', source_url)
    with gzip.open(archive_path, 'rb') as source_file, \
            open(target_path, 'wb') as target_file:
        shutil.copyfileobj(source_file, target_file)

In [3]:

import subprocess
import sys

# Invoke pip through the interpreter that is running this notebook, so that
# seaborn is installed into the kernel's own environment rather than into
# whichever `pip` executable happens to come first on PATH.
# subprocess.call (not check_call) keeps this best-effort: a failed upgrade
# does not abort the notebook if a usable seaborn is already installed.
subprocess.call([sys.executable, '-m', 'pip', 'install', '-U', 'seaborn'])

Out[3]:

In [4]:

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:

# Load the school table that supplies both the predictors and the target for
# fitting; the file name is resolved relative to the working directory.
schools_2006 = pd.read_csv("Clean School Data 2006.csv")

In [6]:

# Predictor columns of the regression, extracted as a plain NumPy array.
X = schools_2006[[
    'Total Cohort',
    'nearby_store_count',
    'Advanced Regents - n',
    'boro_num1',
    'boro_num2',
    'boro_num3',
    'boro_num4',
    'boro_num5',
]].values

In [7]:

# Regression target: the 'Total Grads - n' column as a NumPy array.
y = schools_2006['Total Grads - n'].values

In [8]:

from sklearn.model_selection import train_test_split

In [9]:

# Hold out 40% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=101)

In [10]:

# Number of rows in the training and the test partition, respectively.
print(len(X_train),len(X_test))

179 120

In [11]:

from sklearn.linear_model import LinearRegression

In [12]:

# Ordinary least squares model, fitted on the training partition only.
lm = LinearRegression()
# fit() is the last expression of the cell, so the estimator's repr is shown
# as the cell output.
lm.fit(X_train,y_train)

Out[12]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [13]:

# Model output for the held-out rows; compared against y_test below.
predictions = lm.predict(X_test)

In [14]:

# Actual vs. predicted target values on the held-out rows; the closer the
# points lie to the diagonal, the better the fit.
plt.scatter(y_test, predictions)
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel('Actual Total Grads - n')
plt.ylabel('Predicted Total Grads - n');

Out[14]:

<matplotlib.collections.PathCollection at 0x7f61cc9aada0>

In [15]:

# Distribution of the residuals (actual minus predicted) on the held-out rows.
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal,
# and this notebook upgrades seaborn to the newest release, so use its
# documented replacement: histplot with a KDE overlay on a density scale.
residuals = y_test - predictions
sns.histplot(residuals, bins=50, kde=True, stat='density');

In [16]:

from sklearn import metrics
from sklearn.metrics import r2_score

In [17]:

# Error metrics on the held-out rows. The mean squared error is computed
# once and reused for the RMSE line.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print("R^2:", r2_score(y_test, predictions))

MAE: 19.69785437491135
MSE: 861.6196757743496
RMSE: 29.35335884995701
R^2: 0.9659457703829093

In [18]:

# Load the user-supplied table named in the configuration cell.
test_data = pd.read_csv(Test_Data_Path) # Use a given file of number of stores near a school
# Predictor columns, in the same order the model was trained on. The explicit
# .copy() makes this an independent frame: later cells add result columns to
# it, which on a plain slice of test_data triggers pandas'
# SettingWithCopyWarning.
test_data_v2 = test_data[['Total Cohort','nearby_store_count','Advanced Regents - n',"boro_num1","boro_num2","boro_num3","boro_num4","boro_num5"]].copy()
# Observed target values, kept as a one-column DataFrame.
y_test_data = test_data[['Total Grads - n']]

In [19]:

# Select the model's predictor columns by name instead of taking "all current
# columns" of test_data_v2. The following cell adds result columns to that
# frame, so re-running this cell afterwards would otherwise hand the model
# more features than it was trained on.
User_X_test = test_data_v2[[
    'Total Cohort',
    'nearby_store_count',
    'Advanced Regents - n',
    'boro_num1',
    'boro_num2',
    'boro_num3',
    'boro_num4',
    'boro_num5',
]].values

In [20]:

# Round predictions to the nearest whole number before casting. A bare
# astype(int) truncates toward zero (e.g. 99.9 -> 99), which biases every
# predicted count downwards.
predicted_grads = np.rint(lm.predict(User_X_test)).astype(int)
test_data_v2["(Predicted)Total Grads - n"] = predicted_grads
# Observed values next to the predictions, for side-by-side comparison.
test_data_v2["Total Grads - n"] = y_test_data.values
# Absolute error per row.
test_data_v2['Difference in results'] = (
    test_data_v2["(Predicted)Total Grads - n"] - test_data_v2["Total Grads - n"]
).abs()

In [21]:

from os.path import join
# Write the table with predictions into the configured output folder.
target_path = join(target_folder,'updated_table.csv')
# index=False: the row index is not written as an extra CSV column.
test_data_v2.to_csv(target_path,index = False)
# Report where the table was written in `name = path` form.
# NOTE(review): presumably this line is parsed by crosscompute to locate the
# output table -- confirm before changing its format.
print('graduation_table_path = %s' % target_path)

graduation_table_path = /tmp/updated_table.csv

In [ ]:

NYC HS Graduation Rate Estimator

Train Test Split

Creating and Testing the Model (OLS-Linear Model)

Testing User input csv

Pay Notebook Creator: Jendri Morocho	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0