# crosscompute
# CSV of schools to score; it is read further down and must provide the
# feature columns plus 'Total Grads - n'.
Test_Data_Path = 'Testing_Data.csv'
# Folder that receives the generated 'updated_table.csv'.
target_folder = '/tmp'
import gzip
from invisibleroads_macros.disk import uncompress
from os.path import exists
from urllib.request import urlretrieve
def download(target_path, source_url):
    """Fetch source_url into target_path unless the file already exists.

    The payload is first written to ``target_path + '.part'`` and only moved
    into place once the transfer has finished. Because an existing
    target_path is treated as a completed download, writing to it directly
    would let an interrupted transfer be mistaken for a finished one.

    Parameters
    ----------
    target_path : str or os.PathLike
        Where the downloaded file should end up.
    source_url : str
        URL handed to ``urllib.request.urlretrieve``.

    Returns
    -------
    target_path, exactly as it was passed in.

    Raises
    ------
    Whatever ``urlretrieve`` raises; a partial file is removed first.
    """
    import os

    if exists(target_path):
        return target_path
    partial_path = os.fspath(target_path) + '.part'
    try:
        urlretrieve(source_url, partial_path)
    except Exception:
        # Do not leave a truncated payload behind.
        if exists(partial_path):
            os.remove(partial_path)
        raise
    # Atomic on the same filesystem, so target_path is either absent or whole.
    os.replace(partial_path, target_path)
    return target_path
def download_zip(target_folder, source_url):
    """Download a zip archive and unpack it into target_folder.

    The archive is fetched through ``download`` and stored beside the folder
    as ``target_folder + '.zip'``. The result of ``uncompress`` is returned
    unchanged.
    """
    zip_path = target_folder + '.zip'
    return uncompress(download(zip_path, source_url), target_folder)
def download_gz(target_path, source_url):
    """Download a gzip archive and write its decompressed content to target_path.

    The archive is fetched through ``download`` and stored as
    ``target_path + '.gz'``. Its content is then streamed into target_path,
    overwriting any existing file, so the whole payload never has to fit in
    memory and both file handles are closed deterministically.

    Parameters
    ----------
    target_path : str
        Destination of the decompressed file.
    source_url : str
        URL of the ``.gz`` archive.

    Returns
    -------
    target_path, so the function can be used like ``download``.
    """
    import shutil

    archive_path = download(target_path + '.gz', source_url)
    with gzip.open(archive_path, 'rb') as source_file, \
            open(target_path, 'wb') as target_file:
        shutil.copyfileobj(source_file, target_file)
    return target_path
import subprocess
import sys

# Upgrade seaborn for the interpreter that is running this script. Invoking
# pip as a module of sys.executable avoids picking up a `pip` on PATH that
# belongs to a different Python installation. The exit status is still
# ignored on purpose: the install is best effort and the import further down
# decides whether seaborn is usable.
subprocess.call([sys.executable, '-m', 'pip', 'install', '-U', 'seaborn'])
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Model inputs and the column to predict, as named in the training CSV.
# NOTE(review): the boro_num* columns look like one-hot borough flags — confirm.
feature_columns = [
    'Total Cohort',
    'nearby_store_count',
    'Advanced Regents - n',
    'boro_num1',
    'boro_num2',
    'boro_num3',
    'boro_num4',
    'boro_num5',
]
target_column = 'Total Grads - n'

# Load the training table and split it into a feature matrix and target vector.
schools_2006 = pd.read_csv('Clean School Data 2006.csv')
X = schools_2006[feature_columns].values
y = schools_2006[target_column].values
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)
# Report the number of training rows followed by the number of test rows.
print(*(len(part) for part in (X_train, X_test)))
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split and predict the held-out rows.
lm = LinearRegression()
lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

# Actual vs. predicted values, on a figure of its own.
plt.figure()
plt.scatter(y_test, predictions)

# Residual distribution on a separate figure, so the histogram is not drawn
# over the scatter plot. sns.distplot is deprecated and scheduled for removal,
# and seaborn is upgraded to the newest release earlier in this script;
# histplot with a KDE overlay and density scaling is its replacement.
plt.figure()
sns.histplot(y_test - predictions, bins=50, kde=True, stat='density')
from sklearn import metrics
from sklearn.metrics import r2_score
# Evaluate the held-out predictions. The MSE is computed once and reused for
# the RMSE.
mean_squared_error = metrics.mean_squared_error(y_test, predictions)
error_report = (
    ('MAE:', metrics.mean_absolute_error(y_test, predictions)),
    ('MSE:', mean_squared_error),
    ('RMSE:', np.sqrt(mean_squared_error)),
    ("R^2:", r2_score(y_test, predictions)),
)
for label, value in error_report:
    print(label, value)
# Score the user-supplied schools with the fitted model.
test_data = pd.read_csv(Test_Data_Path)  # Use a given file of number of stores near a school
# Take an explicit copy: the result columns added below would otherwise be
# assigned onto a slice of test_data, which raises SettingWithCopyWarning.
test_data_v2 = test_data[['Total Cohort','nearby_store_count','Advanced Regents - n',"boro_num1","boro_num2","boro_num3","boro_num4","boro_num5"]].copy()
y_test_data = test_data[['Total Grads - n']]
# Capture the feature matrix before any result columns are appended.
User_X_test = test_data_v2.values
# NOTE: astype(int) truncates toward zero; it does not round.
test_data_v2["(Predicted)Total Grads - n"] = lm.predict(User_X_test).astype(int)
test_data_v2["Total Grads - n"] = y_test_data['Total Grads - n'].values
# Absolute gap between the predicted and the reported number of graduates.
test_data_v2['Difference in results'] = (
    test_data_v2["(Predicted)Total Grads - n"] - test_data_v2["Total Grads - n"]
).abs()
from os.path import join
# Write the scored table, without the index column, into the output folder.
target_path = join(target_folder, 'updated_table.csv')
test_data_v2.to_csv(target_path, index=False)
# Announce where the table was written.
# NOTE(review): presumably this "name = path" line is parsed by the
# CrossCompute runner — confirm before changing its format.
output_line = 'graduation_table_path = %s' % target_path
print(output_line)