#CrossCompute
outcome_select = """
Total Grads - % of cohort
Total Grads - % of cohort
Dropped Out - % of cohort
Total Regents - % of cohort
"""
# attribute_select = """
# Diversity Index
# Diversity Index
# economic_need_index
# students_with_disabilities_1
# """
target_folder = '/tmp'
selected_outcome = outcome_select.strip().splitlines()[0]
selected_outcome
# selected_attribute = attribute_select.strip().splitlines()[0]
# selected_attribute
url = 'https://data.cityofnewyork.us/api/geospatial/r8nu-ymqj?method=export&format=Shapefile'
import geotable
dist_table = geotable.load(url)
dist_table.iloc[0]
url = 'https://data.cityofnewyork.us/resource/98et-3mve.csv'
import pandas as pd
def load(
endpoint_url,
selected_columns=None,
buffer_size=1000,
search_term_by_column=None,
**kw,
):
buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
if selected_columns:
select_string = ','.join(selected_columns)
buffer_url += f'&$select={select_string}'
for column, search_term in (search_term_by_column or {}).items():
buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
print(buffer_url)
tables = []
if endpoint_url.endswith('.json'):
f = pd.read_json
else:
f = pd.read_csv
t = f(buffer_url, **kw)
while len(t):
print(len(tables) * buffer_size + len(t))
tables.append(t)
offset = buffer_size * len(tables)
t = f(buffer_url + f'&$offset={offset}', **kw)
return pd.concat(tables, ignore_index=True, sort=False)
demo_table = load(url, na_values=['No Data'])
demo_table.iloc[0]
len(demo_table)
selected_attribute = 'economic_need_index'
if selected_attribute in demo_table:
demo_table = demo_table.dropna(subset=[selected_attribute])
demo_table[selected_attribute] = demo_table[selected_attribute].str.replace('%', '')
demo_table[selected_attribute] = demo_table[selected_attribute].astype('float')
# demo_table.economic_need_index.unique()
len(demo_table)
demo_table.iloc[0]
# # CALCULATE DIVERSITY INDEX OF EACH SCHOOL & ADD TO DEMODF DATAFRAME
# demo_table['Diversity Index'] = 1
# def d_index(row):
# o = row['% Multiple Race Categories Not Represented']
# total = 100 - o
# target = total / 4
# abs_distance = (abs(target - row['% Asian'])) + abs((target - row['% Black'])) + abs((target - row['% Hispanic'])) + abs((target - row['% White']))
# if o == 0.0:
# return abs_distance
# if o > 0.0:
# # diversity index
# return abs_distance * (1/o)
# # append diversity index to demodf
# demo_table['Diversity Index'] = demo_table.apply(d_index, axis=1)
demo_table['Diversity Index'] = demo_table['economic_need_index']
endpoint_url = 'https://data.cityofnewyork.us/resource/r2nx-nhxe.csv'
# Load schools
school_location_table = load(
endpoint_url,
buffer_size=1000)
school_location_table[:5]
school_location_table.iloc[0]
school_location_table.iloc[0]['location_1']
school_location_table.iloc[0]['ats_system_code']
school_location_table['DBN'] = school_location_table['ats_system_code'].str.strip()
school_location_table.iloc[0]['DBN']
school_location_table = school_location_table.rename(columns={
'location_1': 'WKT',
'location_name': 'School Name',
})
trimmed_school_location_table = school_location_table[[
'DBN',
'WKT',
'School Name',
]]
school_table = pd.merge(
trimmed_school_location_table,
demo_table,
left_on='DBN',
right_on='dbn')
len(school_table)
school_table = school_table.dropna(subset=['WKT'])
len(school_table)
school_table.iloc[0]
dist_table.iloc[0]
geometry_wkt = school_table.iloc[0]['WKT']
geometry_wkt
from shapely import wkt
g = wkt.loads(geometry_wkt)
g
district_polygons = dist_table['geometry_object']
flags = [x.contains(g) for x in district_polygons]
flags
import numpy as np
index = np.argmax(flags)
dist_table.iloc[index]
index
len(flags)
int(dist_table.iloc[index]['school_dis'])
len(school_table)
from geotable import ColorfulGeometryCollection, GeoTable
dist_table
dist_table.geometry_object[0].wkt
school_table.WKT[0]
len(dist_table.school_dis.unique())
# aggregate districts
from shapely import wkt
import numpy as np
district_polygons = dist_table['geometry_object']
def get_district(geometry_wkt):
g = wkt.loads(geometry_wkt)
#flags = [x.contains(g) for x in district_polygons]
flags = [x.intersects(g) for x in district_polygons]
index = np.argmax(flags)
district = int(dist_table.iloc[index]['school_dis'])
return district
school_table['district'] = school_table['WKT'].apply(get_district)
len(school_table.district.unique())
len(dist_table)
mean_table = school_table.groupby('district').mean()
mean_table = mean_table.reset_index()
mean_table = mean_table[[
'district',
'economic_need_index',
]]
t = pd.merge(dist_table, mean_table, left_on='school_dis', right_on='district')
map_table = t.copy()
map_table['FillBlues'] = map_table.economic_need_index
school_map_table = school_table.copy()
school_map_table['geometry_object'] = school_map_table.WKT.apply(wkt.loads)
district_map_table = t.copy()
district_map_table['FillBlues'] = district_map_table['economic_need_index']
school_map_table = school_table.copy()
#school_map_table['FillBlues'] = district_map_table['economic_need_index'].max()
map_table = pd.concat([
#district_map_table,
school_map_table,
], sort=False)
url = 'https://data.cityofnewyork.us/api/views/vh2h-md7a/rows.csv'
#school_graduation_table = pd.read_csv(url)
#len(school_graduation_table)
#school_graduation_table.iloc[0]
#school_graduation_table.groupby('DBN').mean()[:5]
#school_graduation_table = school_graduation_table[['DBN', 'School Name', 'Total Grads - % of cohort']].copy()
#len(school_graduation_table)
#school_graduation_table = school_graduation_table.dropna()
#sum(school_graduation_table['Total Grads - % of cohort'] == 's')
# Define which values mean null
school_graduation_table = pd.read_csv(url, na_values=['s'])
school_graduation_table = school_graduation_table[['DBN', 'School Name', selected_outcome]].copy()
school_graduation_table = school_graduation_table.dropna()
len(school_graduation_table)
school_graduation_table.dtypes
school_graduation_table
school_graduation_table = school_graduation_table.groupby('DBN').mean()
school_graduation_table[:5]
endpoint_url = 'https://data.cityofnewyork.us/resource/r2nx-nhxe.csv'
# Load schools
school_location_table = load(
endpoint_url,
buffer_size=1000)
school_location_table[:5]
school_location_table.iloc[0]
school_location_table.iloc[0]['location_1']
school_location_table.iloc[0]['ats_system_code']
school_location_table['DBN'] = school_location_table['ats_system_code'].str.strip()
school_location_table.iloc[0]['DBN']
school_location_table = school_location_table.rename(columns={
'location_1': 'WKT',
'location_name': 'School Name',
})
trimmed_school_location_table = school_location_table[[
'DBN',
'WKT',
'School Name',
]]
merged_school_table = pd.merge(
school_table,
school_graduation_table,
left_on='DBN',
right_on='DBN')
len(school_table)
merged_school_table.iloc[0]
sorted_school_table = merged_school_table.sort_values(selected_outcome, ascending=False)
sorted_school_table[:3]
sorted_school_table[-3:]
school_table.iloc[0]
# # top 10 performing
# top_schools = sorted_school_table[:10].copy()
# # bottom 10 performing
# bottom_schools = sorted_school_table[-10:].copy()
# getting top and bottom performing schools
# a_school_map_table = sorted_school_table[:100].copy()
# a_school_map_table = a_school_map_table[['WKT', 'School Name','total_enrollment']]
# a_school_map_table = a_school_map_table.rename({
# 'School Name': 'Name',
# })
# a_school_map_table['FillColor'] = 'g'
top_schools = sorted_school_table[:10].copy()
top_schools = top_schools[['WKT', 'School Name','total_enrollment']]
top_schools = top_schools.rename({
'School Name': 'Name',
})
top_schools['FillColor'] = 'g'
# b_school_map_table = sorted_school_table[-100:].copy()
# b_school_map_table = b_school_map_table[['WKT', 'School Name','total_enrollment']]
# b_school_map_table = b_school_map_table.rename({
# 'School Name': 'Name',
# })
# b_school_map_table['FillColor'] = 'r'
bottom_schools = sorted_school_table[-10:].copy()
bottom_schools = bottom_schools[['WKT', 'School Name','total_enrollment']]
bottom_schools = bottom_schools.rename({
'School Name': 'Name',
})
bottom_schools['FillColor'] = 'r'
t = pd.merge(dist_table, mean_table, left_on='school_dis', right_on='district')
t[:3]
district_map_table = t.copy()
district_map_table['WKT'] = district_map_table['geometry_object'].apply(
lambda x: x.wkt)
# Define FillColor
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(1, 100))
array = district_map_table['economic_need_index'].values.reshape(-1, 1)
scaler.fit(array)
district_map_table['scaled_index'] = scaler.transform(array)
district_map_table.head()
district_map_table.loc[district_map_table['scaled_index'] < 25, 'FillColor'] = '#fff7bc'
district_map_table.loc[(district_map_table['scaled_index'] >= 25)
& (district_map_table['scaled_index'] < 50), 'FillColor'] = '#fec44f'
district_map_table.loc[(district_map_table['scaled_index'] >= 50)
& (district_map_table['scaled_index'] < 75), 'FillColor'] = '#ec7014'
district_map_table.loc[(district_map_table['scaled_index'] >= 75), 'FillColor'] = '#8c2d04'
district_map_table
# a_school_map_table = a_school_map_table.rename(columns={
# 'School Name': 'Name',
# })
# b_school_map_table = b_school_map_table.rename(columns={
# 'School Name': 'Na
me',
# })
# district_map_table['Name'] = district_map_table['district']
top_schools = top_schools.rename(columns={
'School Name': 'Name',
})
bottom_schools = bottom_schools.rename(columns={
'School Name': 'Name',
})
district_map_table['Name'] = district_map_table['district']
#a_school_map_table['radiusInPixelsRange3-27'] = a_school_map_table['total_enrollment']
#b_school_map_table['radiusInPixelsRange3-27'] = b_school_map_table['total_enrollment']
# Extract columns
map_table = pd.concat([
district_map_table[['WKT','FillColor','Name']],
top_schools[['WKT','FillColor','Name']],
bottom_schools[['WKT','FillColor','Name']],
])
top_schools[:3]
district_map_table
type(district_map_table)
target_path = target_folder + '/choropleth.csv'
map_table.to_csv(target_path, index=False)
print('a_geotable_path = %s' % target_path)
map_table
len(school_map_table)
max(district_map_table['scaled_index'])
#%matplotlib inline
#t['economic_need_index'].plot(kind='bar')
sorted_school_table.to_csv('data.csv', index=False)
sorted_school_table.iloc[0]
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Fixing random state for reproducibility
np.random.seed(19680801)
x = sorted_school_table[selected_attribute].values
y = sorted_school_table[selected_outcome].values
#colors = np.random.rand(N)
#area = (30 * np.random.rand(N))**2 # 0 to 15 point radii
#axes = plt.scatter(x, y, s=area, c=colors, alpha=0.5)
axes = plt.scatter(x, y, alpha=0.5)
plt.show()
# Save file to target folder to include it in the result download
target_path = target_folder + '/c.png'
figure = axes.get_figure()
figure.savefig(target_path)
print(f'c_image_path = {target_path}')
from sklearn.linear_model import LinearRegression
model = LinearRegression()
X = sorted_school_table[['economic_need_index']].values
X
y = sorted_school_table[selected_outcome].values
y
model.fit(X, y)
from sklearn.model_selection import cross_val_score
models = []
scores = []
def train(model, X):
model.fit(X, y)
models.append(model)
score = cross_val_score(
model, X, y, cv=3,
scoring='neg_mean_absolute_error',
).mean()
scores.append(score)
return score
model_scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
print('average_negative_mean_error = %s' % np.mean(model_scores))