#crosscompute
# Path of the CSV table that is scored at the end of the script. It is read
# with pandas and must provide a `number_of_stores` column.
school_table_path = 'schools_test_data.csv'
# Folder that receives the generated `updated_table.csv`.
target_folder = '/tmp'
import subprocess
import sys
# Best-effort upgrade of folium before it is imported below. Running pip as a
# module of the current interpreter guarantees the package is installed into
# the environment that executes this script; a bare `pip` found on PATH may
# belong to a different Python installation. The return code is deliberately
# ignored so an offline run can still use an already installed folium.
subprocess.call([sys.executable, '-m', 'pip', 'install', '-U', 'folium'])
#import geopandas
import pysal as ps
import folium
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geotable
import numpy as np
import pandas as pd
import gzip
from invisibleroads_macros.disk import uncompress
from os.path import exists
from urllib.request import urlretrieve
def download(target_path, source_url):
    """Download ``source_url`` to ``target_path`` unless it already exists.

    The payload is first written to a sibling ``<target_path>.part`` file and
    only renamed to ``target_path`` once the transfer has finished. Without
    that step an interrupted transfer would leave a truncated file behind,
    and every later call would mistake it for a complete download.

    Args:
        target_path: Destination file path.
        source_url: URL handed to ``urllib.request.urlretrieve``.

    Returns:
        ``target_path``, whether the file was just downloaded or was
        already present.

    Raises:
        Any exception raised by ``urlretrieve`` or by the final rename; the
        ``.part`` file is removed before the exception propagates.
    """
    import os

    if exists(target_path):
        return target_path
    partial_path = str(target_path) + '.part'
    try:
        urlretrieve(source_url, partial_path)
        # Atomic on the same filesystem: readers never see a partial file.
        os.replace(partial_path, target_path)
    except BaseException:
        if exists(partial_path):
            os.remove(partial_path)
        raise
    return target_path
def download_zip(target_folder, source_url):
    """Fetch a zip archive and unpack it into ``target_folder``.

    The archive is stored next to the folder as ``target_folder + '.zip'``
    via ``download`` and then passed to ``uncompress`` together with the
    folder.

    Args:
        target_folder: Folder path the archive is unpacked into.
        source_url: URL of the zip archive.

    Returns:
        Whatever ``uncompress`` returns for the archive and target folder.
    """
    zip_path = target_folder + '.zip'
    return uncompress(download(zip_path, source_url), target_folder)
def download_gz(target_path, source_url):
    """Fetch a gzip archive and decompress it to ``target_path``.

    The archive is stored as ``target_path + '.gz'`` via ``download`` and
    then decompressed. Both files are opened in ``with`` blocks so the
    output handle is always flushed and closed, and the data is streamed in
    chunks instead of being read into memory in one piece.

    Args:
        target_path: Path of the decompressed output file.
        source_url: URL of the ``.gz`` archive.

    Returns:
        ``target_path``, mirroring what ``download`` returns.
    """
    import shutil

    archive_path = download(target_path + '.gz', source_url)
    with gzip.open(archive_path, 'rb') as source_file, \
            open(target_path, 'wb') as target_file:
        shutil.copyfileobj(source_file, target_file)
    return target_path
# Store table: keep only the name, borough code and coordinates.
cig_data = pd.read_excel('cig_data_trimmed_down.xlsx')
cig_data = cig_data[['Business_Name', 'Borough_Code', 'Longitude', 'Latitude']]

# School outcomes table; a cell holding the literal 's' is read as missing.
schools = pd.read_csv('Modified_school_data.csv', na_values='s')

# Restrict to the rows whose Cohort is '2006' and to the columns used by the
# rest of the script, then renumber the rows from zero.
school_columns = [
    "Cohort", "School Name", "Total Cohort",
    "Dropped Out - n", "Dropped Out - % of cohort",
    "Total Grads - n", "Total Grads - % of cohort",
    "Total Regents - % of cohort",
    "Latitude", "Longitude", "boro_num",
]
schools_2006 = schools.loc[schools['Cohort'] == '2006', school_columns]
schools_2006 = schools_2006.reset_index(drop=True)

# Keep one row per school name, then drop every row with a missing value.
# The result is an independent copy because a column is added to it later.
schools_2006_v2 = schools_2006.drop_duplicates(subset=['School Name'])
schools_2006_v3 = schools_2006_v2.dropna().copy()
# NOTE: an earlier brute-force experiment measured the geopy geodesic distance
# from each school to every store in the same borough (0.3 mile radius). It
# was kept here as dead code inside string literals and has been removed; the
# KD-tree lookup further down replaces it.

# Store coordinates as (longitude, latitude) rows for the KD-tree.
store_xys = cig_data[['Longitude', 'Latitude']].values

# Table to score: a given file with the number of stores near each school.
test_data = pd.read_csv(school_table_path)
# Build a spherical ('Arc') KD-tree over the store coordinates.
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM, RADIUS_EARTH_MILES
from pysal.lib.weights import KNN
# An Arc tree measures distances in the units of the `radius` it is given, and
# the search bound below is expressed in miles. The tree was previously built
# with RADIUS_EARTH_KM, which silently turned the 0.4 mile bound into 0.4 km.
store_tree = KDTree(
    np.array(store_xys), distance_metric='Arc', radius=RADIUS_EARTH_MILES)
# Search radius around each school, in miles.
radius_in_miles = 0.4
# Number of indexed stores; also used as the `k` of every tree query.
store_count = len(cig_data)
def get_store_count(r):
    """Count the stores that lie within the search radius of one school.

    Args:
        r: A row (or mapping) providing 'Longitude' and 'Latitude'.

    Returns:
        The number of neighbours returned by ``store_tree.query`` whose
        index is smaller than ``store_count``.

    NOTE(review): ``radius_in_miles`` is passed as ``distance_upper_bound``
    as-is; it is only a distance in miles if ``store_tree`` was built with
    an Earth radius expressed in miles -- confirm at the tree construction.
    """
    school_xy = (r['Longitude'], r['Latitude'])
    # Ask for every store at once. Per SciPy's KDTree.query convention,
    # neighbours beyond the upper bound are reported with an index equal to
    # the number of points, so only real hits pass the comparison below.
    _, neighbor_indices = store_tree.query(
        school_xy, k=store_count, distance_upper_bound=radius_in_miles)
    within_radius = neighbor_indices < store_count
    return sum(within_radius)
from os.path import join
from sklearn.svm import SVR

# Feature: number of stores found near each cleaned 2006 school row.
# TODO (original author's note): nearby store count needs to be changed.
schools_2006_v3['nearby_store_count'] = schools_2006_v3.apply(
    get_store_count, axis=1)

# Fit a support vector regressor with default settings:
# nearby store count -> 'Total Grads - % of cohort'.
X = schools_2006_v3[['nearby_store_count']].values
y = schools_2006_v3['Total Grads - % of cohort'].values
model = SVR()
model.fit(X, y)

# Score the given table from its `number_of_stores` column.
X_test = test_data[['number_of_stores']].values
test_data['graduation_rate'] = model.predict(X_test)

# Save the scored table and print its path as `graduation_table_path`.
target_path = join(target_folder, 'updated_table.csv')
test_data.to_csv(target_path, index=False)
print('graduation_table_path = %s' % target_path)