gist




Pay Notebook Creator: Michelle Chung0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [ ]:
#CrossCompute

outcome_select = """
    Total Grads - % of cohort

    Total Grads - % of cohort
    Dropped Out - % of cohort
    Total Regents - % of cohort
"""
# attribute_select = """
#     Diversity Index

#     Diversity Index
#     economic_need_index
#     students_with_disabilities_1
# """
target_folder = '/tmp'
In [ ]:
selected_outcome = outcome_select.strip().splitlines()[0]
selected_outcome

# selected_attribute = attribute_select.strip().splitlines()[0]
# selected_attribute
In [ ]:
url = 'https://data.cityofnewyork.us/api/geospatial/r8nu-ymqj?method=export&format=Shapefile'
In [ ]:
import geotable
In [ ]:
dist_table = geotable.load(url)
In [ ]:
dist_table.iloc[0]
In [ ]:
url = 'https://data.cityofnewyork.us/resource/98et-3mve.csv'
In [ ]:
import pandas as pd

def load(
    endpoint_url,
    selected_columns=None,
    buffer_size=1000,
    search_term_by_column=None,
    **kw,
):
    buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    for column, search_term in (search_term_by_column or {}).items():
        buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
    print(buffer_url)
    tables = []
    
    if endpoint_url.endswith('.json'):
        f = pd.read_json
    else:
        f = pd.read_csv

    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, ignore_index=True, sort=False)
In [ ]:
demo_table = load(url, na_values=['No Data'])
In [ ]:
demo_table.iloc[0]
In [ ]:
len(demo_table)

selected_attribute = 'economic_need_index'

if selected_attribute in demo_table:
    demo_table = demo_table.dropna(subset=[selected_attribute])
    demo_table[selected_attribute] = demo_table[selected_attribute].str.replace('%', '')
    demo_table[selected_attribute] = demo_table[selected_attribute].astype('float')
In [ ]:
# demo_table.economic_need_index.unique()
In [ ]:
len(demo_table)
In [ ]:
demo_table.iloc[0]
In [ ]:
# # CALCULATE DIVERSITY INDEX OF EACH SCHOOL & ADD TO DEMODF DATAFRAME
# demo_table['Diversity Index'] = 1

# def d_index(row):
#     o = row['% Multiple Race Categories Not Represented']
#     total = 100 - o
#     target = total / 4
#     abs_distance = (abs(target - row['% Asian'])) + abs((target - row['% Black'])) + abs((target - row['% Hispanic'])) + abs((target - row['% White']))    
#     if o == 0.0:
#         return abs_distance 
#     if o > 0.0:
#         # diversity index
#         return abs_distance * (1/o)
    
# # append diversity index to demodf
# demo_table['Diversity Index'] = demo_table.apply(d_index, axis=1)
In [ ]:
demo_table['Diversity Index'] = demo_table['economic_need_index']
In [ ]:
endpoint_url = 'https://data.cityofnewyork.us/resource/r2nx-nhxe.csv'
In [ ]:
# Load schools
school_location_table = load(
    endpoint_url,
    buffer_size=1000)
school_location_table[:5]
In [ ]:
school_location_table.iloc[0]
In [ ]:
school_location_table.iloc[0]['location_1']
In [ ]:
school_location_table.iloc[0]['ats_system_code']
In [ ]:
school_location_table['DBN'] = school_location_table['ats_system_code'].str.strip()
In [ ]:
school_location_table.iloc[0]['DBN']
In [ ]:
school_location_table = school_location_table.rename(columns={
    'location_1': 'WKT',
    'location_name': 'School Name',
})
In [ ]:
trimmed_school_location_table = school_location_table[[
    'DBN',
    'WKT',
    'School Name',
]]
In [ ]:
school_table = pd.merge(
    trimmed_school_location_table,
    demo_table,
    left_on='DBN',
    right_on='dbn')
len(school_table)
In [ ]:
school_table = school_table.dropna(subset=['WKT'])
In [ ]:
len(school_table)
In [ ]:
school_table.iloc[0]
In [ ]:
dist_table.iloc[0]
In [ ]:
geometry_wkt = school_table.iloc[0]['WKT']
In [ ]:
geometry_wkt
In [ ]:
from shapely import wkt
In [ ]:
g = wkt.loads(geometry_wkt)
In [ ]:
g
In [ ]:
district_polygons = dist_table['geometry_object']
In [ ]:
flags = [x.contains(g) for x in district_polygons]

flags
In [ ]:
import numpy as np
In [ ]:
index = np.argmax(flags)
In [ ]:
dist_table.iloc[index]
In [ ]:
index
In [ ]:
len(flags)
In [ ]:
int(dist_table.iloc[index]['school_dis'])
In [ ]:
len(school_table)
In [ ]:
from geotable import ColorfulGeometryCollection, GeoTable
In [ ]:
dist_table
In [ ]:
dist_table.geometry_object[0].wkt
In [ ]:
school_table.WKT[0]
In [ ]:
len(dist_table.school_dis.unique())
In [ ]:
# aggregate districts
from shapely import wkt
import numpy as np

district_polygons = dist_table['geometry_object']

def get_district(geometry_wkt):
    g = wkt.loads(geometry_wkt)
    #flags = [x.contains(g) for x in district_polygons]
    flags = [x.intersects(g) for x in district_polygons]
    index = np.argmax(flags)
    district = int(dist_table.iloc[index]['school_dis'])
    return district

school_table['district'] = school_table['WKT'].apply(get_district)
In [ ]:
len(school_table.district.unique())
In [ ]:
len(dist_table)
In [ ]:
mean_table = school_table.groupby('district').mean()
In [ ]:
mean_table = mean_table.reset_index()
In [ ]:
mean_table = mean_table[[
    'district',
    'economic_need_index',
]]
In [ ]:
t = pd.merge(dist_table, mean_table, left_on='school_dis', right_on='district')
In [ ]:
map_table = t.copy()
In [ ]:
map_table['FillBlues'] = map_table.economic_need_index
In [ ]:
school_map_table = school_table.copy()
In [ ]:
school_map_table['geometry_object'] = school_map_table.WKT.apply(wkt.loads)
In [ ]:
district_map_table = t.copy()
district_map_table['FillBlues'] = district_map_table['economic_need_index']
school_map_table = school_table.copy()
#school_map_table['FillBlues'] = district_map_table['economic_need_index'].max()
map_table = pd.concat([
	#district_map_table,
school_map_table,
], sort=False)
In [ ]:
url = 'https://data.cityofnewyork.us/api/views/vh2h-md7a/rows.csv'
#school_graduation_table = pd.read_csv(url)
#len(school_graduation_table)
In [ ]:
#school_graduation_table.iloc[0]
In [ ]:
#school_graduation_table.groupby('DBN').mean()[:5]
In [ ]:
#school_graduation_table = school_graduation_table[['DBN', 'School Name', 'Total Grads - % of cohort']].copy()
#len(school_graduation_table)
In [ ]:
#school_graduation_table = school_graduation_table.dropna()
#sum(school_graduation_table['Total Grads - % of cohort'] == 's')
In [ ]:
# Define which values mean null

school_graduation_table = pd.read_csv(url, na_values=['s'])
school_graduation_table = school_graduation_table[['DBN', 'School Name', selected_outcome]].copy()
school_graduation_table = school_graduation_table.dropna()

len(school_graduation_table)
In [ ]:
school_graduation_table.dtypes
In [ ]:
school_graduation_table
In [ ]:
school_graduation_table = school_graduation_table.groupby('DBN').mean()
school_graduation_table[:5]
In [ ]:
endpoint_url = 'https://data.cityofnewyork.us/resource/r2nx-nhxe.csv'
In [ ]:
# Load schools
school_location_table = load(
    endpoint_url,
    buffer_size=1000)
school_location_table[:5]
In [ ]:
school_location_table.iloc[0]
In [ ]:
school_location_table.iloc[0]['location_1']
In [ ]:
school_location_table.iloc[0]['ats_system_code']
In [ ]:
school_location_table['DBN'] = school_location_table['ats_system_code'].str.strip()
In [ ]:
school_location_table.iloc[0]['DBN']
In [ ]:
school_location_table = school_location_table.rename(columns={
    'location_1': 'WKT',
    'location_name': 'School Name',
})
In [ ]:
trimmed_school_location_table = school_location_table[[
    'DBN',
    'WKT',
    'School Name',
]]
In [ ]:
merged_school_table = pd.merge(
    school_table,
    school_graduation_table,
    left_on='DBN',
    right_on='DBN')
len(school_table)
In [ ]:
merged_school_table.iloc[0]
In [ ]:
sorted_school_table = merged_school_table.sort_values(selected_outcome, ascending=False)
In [ ]:
sorted_school_table[:3]
In [ ]:
sorted_school_table[-3:]
In [ ]:
school_table.iloc[0]
In [ ]:
# # top 10 performing
# top_schools = sorted_school_table[:10].copy()
In [1]:
# # bottom 10 performing
# bottom_schools = sorted_school_table[-10:].copy()
In [2]:
# getting top and bottom performing schools

# a_school_map_table = sorted_school_table[:100].copy()
# a_school_map_table = a_school_map_table[['WKT', 'School Name','total_enrollment']]
# a_school_map_table = a_school_map_table.rename({
# 'School Name': 'Name',
# })
# a_school_map_table['FillColor'] = 'g'

top_schools = sorted_school_table[:10].copy()
top_schools = top_schools[['WKT', 'School Name','total_enrollment']]
top_schools = top_schools.rename({
'School Name': 'Name',
})
top_schools['FillColor'] = 'g'
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-cf98aabdcb79> in <module>
      8 # a_school_map_table['FillColor'] = 'g'
      9 
---> 10 top_schools = sorted_school_table[:10].copy()
     11 top_schools = top_schools[['WKT', 'School Name','total_enrollment']]
     12 top_schools = top_schools.rename({

NameError: name 'sorted_school_table' is not defined
In [ ]:
# b_school_map_table = sorted_school_table[-100:].copy()
# b_school_map_table = b_school_map_table[['WKT', 'School Name','total_enrollment']]
# b_school_map_table = b_school_map_table.rename({
# 'School Name': 'Name',
# })
# b_school_map_table['FillColor'] = 'r'

bottom_schools = sorted_school_table[-10:].copy()
bottom_schools = bottom_schools[['WKT', 'School Name','total_enrollment']]
bottom_schools = bottom_schools.rename({
'School Name': 'Name',
})
bottom_schools['FillColor'] = 'r'
In [ ]:
t = pd.merge(dist_table, mean_table, left_on='school_dis', right_on='district')
In [ ]:
t[:3]
In [ ]:
district_map_table = t.copy()
district_map_table['WKT'] = district_map_table['geometry_object'].apply(
lambda x: x.wkt)
# Define FillColor
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(1, 100))
In [ ]:
array = district_map_table['economic_need_index'].values.reshape(-1, 1)
scaler.fit(array)
district_map_table['scaled_index'] = scaler.transform(array)
In [ ]:
district_map_table.head()
In [ ]:
district_map_table.loc[district_map_table['scaled_index'] < 25, 'FillColor'] = '#fff7bc'
In [ ]:
district_map_table.loc[(district_map_table['scaled_index'] >= 25) 
                       & (district_map_table['scaled_index'] < 50), 'FillColor'] = '#fec44f'
In [ ]:
district_map_table.loc[(district_map_table['scaled_index'] >= 50) 
                       & (district_map_table['scaled_index'] < 75), 'FillColor'] = '#ec7014'
In [ ]:
district_map_table.loc[(district_map_table['scaled_index'] >= 75), 'FillColor'] = '#8c2d04'
In [ ]:
district_map_table
In [ ]:
# a_school_map_table = a_school_map_table.rename(columns={
# 	'School Name': 'Name',
# })
# b_school_map_table = b_school_map_table.rename(columns={
# 	'School Name': 'Na
me',
# })
# district_map_table['Name'] = district_map_table['district']
In [ ]:
top_schools = top_schools.rename(columns={
	'School Name': 'Name',
})
bottom_schools = bottom_schools.rename(columns={
	'School Name': 'Name',
})
district_map_table['Name'] = district_map_table['district']
In [ ]:
#a_school_map_table['radiusInPixelsRange3-27'] = a_school_map_table['total_enrollment']
In [ ]:
#b_school_map_table['radiusInPixelsRange3-27'] = b_school_map_table['total_enrollment']
In [ ]:
# Extract columns
map_table = pd.concat([
district_map_table[['WKT','FillColor','Name']],
top_schools[['WKT','FillColor','Name']],
bottom_schools[['WKT','FillColor','Name']],
])
In [ ]:
top_schools[:3]
In [ ]:
district_map_table
In [ ]:
type(district_map_table)
In [ ]:
target_path = target_folder + '/choropleth.csv'
map_table.to_csv(target_path, index=False)
print('a_geotable_path = %s' % target_path)
In [ ]:
map_table
In [ ]:
len(school_map_table)
In [ ]:
max(district_map_table['scaled_index'])
In [ ]:
#%matplotlib inline
#t['economic_need_index'].plot(kind='bar')
In [ ]:
sorted_school_table.to_csv('data.csv', index=False)
In [ ]:
sorted_school_table.iloc[0]
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Fixing random state for reproducibility
np.random.seed(19680801)


x = sorted_school_table[selected_attribute].values
y = sorted_school_table[selected_outcome].values
#colors = np.random.rand(N)
#area = (30 * np.random.rand(N))**2  # 0 to 15 point radii

#axes = plt.scatter(x, y, s=area, c=colors, alpha=0.5)
axes = plt.scatter(x, y, alpha=0.5)
plt.show()
In [ ]:
# Save file to target folder to include it in the result download
target_path = target_folder + '/c.png'
figure = axes.get_figure()
figure.savefig(target_path)
print(f'c_image_path = {target_path}')
In [ ]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
In [ ]:
X = sorted_school_table[['economic_need_index']].values
X
In [ ]:
y = sorted_school_table[selected_outcome].values
y
In [ ]:
model.fit(X, y)
In [ ]:
from sklearn.model_selection import cross_val_score
models = []
scores = []

def train(model, X):
    model.fit(X, y)
    models.append(model)
    score = cross_val_score(
        model, X, y, cv=3,
        scoring='neg_mean_absolute_error',
    ).mean()
    scores.append(score)
    return score
In [ ]:
model_scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
print('average_negative_mean_error = %s' % np.mean(model_scores))
In [ ]: