gist




Pay Notebook Creator: Michelle Chung0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [285]:
#CrossCompute

outcome_select = """
    Total Grads - % of cohort

    Total Grads - % of cohort
    Dropped Out - % of cohort
    Total Regents - % of cohort
"""
attribute_select = """
    diversity_index

    diversity_index
    economic_need_index
    students_with_disabilities_2
    english_language_learners_2
    female_2
"""
target_folder = '/tmp'
In [286]:
selected_outcome = outcome_select.strip().splitlines()[0]
selected_outcome

# selected_attribute = attribute_select.strip().splitlines()[0]
# selected_attribute
Out[286]:
'Total Grads - % of cohort'
In [287]:
url = 'https://data.cityofnewyork.us/api/geospatial/r8nu-ymqj?method=export&format=Shapefile'
In [288]:
import geotable
In [289]:
dist_table = geotable.load(url)
In [290]:
dist_table.iloc[0]
Out[290]:
school_dis                                                        16
shape_area                                               4.67636e+07
shape_leng                                                   35848.9
geometry_object    POLYGON ((-73.93311862859143 40.69579115384632...
geometry_layer       geo_export_237252e0-27f5-439e-a824-157f1aa920ad
geometry_proj4                   +proj=longlat +ellps=WGS84 +no_defs
Name: 0, dtype: object
In [291]:
url = 'https://data.cityofnewyork.us/resource/98et-3mve.csv'
In [292]:
import pandas as pd

def load(
    endpoint_url,
    selected_columns=None,
    buffer_size=1000,
    search_term_by_column=None,
    **kw,
):
    buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    for column, search_term in (search_term_by_column or {}).items():
        buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
    print(buffer_url)
    tables = []
    
    if endpoint_url.endswith('.json'):
        f = pd.read_json
    else:
        f = pd.read_csv

    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, ignore_index=True, sort=False)
In [293]:
demo_table = load(url, na_values=['No Data'])
https://data.cityofnewyork.us/resource/98et-3mve.csv?$limit=1000
1000
2000
3000
4000
5000
6000
7000
8000
8972
In [294]:
demo_table = demo_table[demo_table.year == '2016-17']
In [295]:
demo_table.iloc[0]
Out[295]:
asian_1                                                              14
asian_2                                                             7.9
black_1                                                              51
black_2                                                            28.7
dbn                                                              01M015
economic_need_index                                               89.2%
english_language_learners_1                                          12
english_language_learners_2                                         6.7
female_1                                                             83
female_2                                                           46.6
grade_1                                                              33
grade_10                                                              0
grade_11                                                              0
grade_12                                                              0
grade_2                                                              27
grade_3                                                              31
grade_4                                                              24
grade_5                                                              18
grade_6                                                               0
grade_7                                                               0
grade_8                                                               0
grade_9                                                               0
grade_k                                                              28
grade_pk_half_day_full_day                                           17
hispanic_1                                                          105
hispanic_2                                                           59
male_1                                                               95
male_2                                                             53.4
multiple_race_categories_not_represented_1                            4
multiple_race_categories_not_represented_2                          2.2
poverty_1                                                           152
poverty_2                                                          85.4
school_name                                   P.S. 015 Roberto Clemente
students_with_disabilities_1                                         51
students_with_disabilities_2                                       28.7
total_enrollment                                                    178
white_1                                                               4
white_2                                                             2.2
year                                                            2016-17
Name: 3, dtype: object
In [296]:
# ADD DIVERSITY INDEX

def d_index(row):
    o = row['multiple_race_categories_not_represented_2']
    total = 100 - o
    target = total / 4
    abs_distance = (abs(target - row['asian_2'])) + abs((target - row['black_2'])) + abs((target - row['hispanic_2'])) + abs((target - row['white_2']))    
    if o == 0.0:
        return abs_distance 
    if o > 0.0:
        # diversity index
        return abs_distance * (1/o)
In [297]:
demo_table['diversity_index'] = demo_table.apply(d_index, axis=1)
In [298]:
demo_table.iloc[0]
Out[298]:
asian_1                                                              14
asian_2                                                             7.9
black_1                                                              51
black_2                                                            28.7
dbn                                                              01M015
economic_need_index                                               89.2%
english_language_learners_1                                          12
english_language_learners_2                                         6.7
female_1                                                             83
female_2                                                           46.6
grade_1                                                              33
grade_10                                                              0
grade_11                                                              0
grade_12                                                              0
grade_2                                                              27
grade_3                                                              31
grade_4                                                              24
grade_5                                                              18
grade_6                                                               0
grade_7                                                               0
grade_8                                                               0
grade_9                                                               0
grade_k                                                              28
grade_pk_half_day_full_day                                           17
hispanic_1                                                          105
hispanic_2                                                           59
male_1                                                               95
male_2                                                             53.4
multiple_race_categories_not_represented_1                            4
multiple_race_categories_not_represented_2                          2.2
poverty_1                                                           152
poverty_2                                                          85.4
school_name                                   P.S. 015 Roberto Clemente
students_with_disabilities_1                                         51
students_with_disabilities_2                                       28.7
total_enrollment                                                    178
white_1                                                               4
white_2                                                             2.2
year                                                            2016-17
diversity_index                                                 35.2727
Name: 3, dtype: object
In [299]:
len(demo_table)

selected_attribute = 'diversity_index'

if selected_attribute in demo_table:
    demo_table = demo_table.dropna(subset=[selected_attribute])
    #demo_table[selected_attribute] = demo_table[selected_attribute].str.replace('%', '')
    demo_table[selected_attribute] = demo_table[selected_attribute].astype('float')
In [300]:
demo_table.year
Out[300]:
3       2016-17
8       2016-17
13      2016-17
18      2016-17
23      2016-17
28      2016-17
33      2016-17
38      2016-17
43      2016-17
48      2016-17
53      2016-17
58      2016-17
63      2016-17
68      2016-17
73      2016-17
78      2016-17
83      2016-17
88      2016-17
93      2016-17
98      2016-17
103     2016-17
109     2016-17
114     2016-17
119     2016-17
124     2016-17
129     2016-17
133     2016-17
138     2016-17
143     2016-17
148     2016-17
         ...   
8839    2016-17
8843    2016-17
8847    2016-17
8851    2016-17
8855    2016-17
8860    2016-17
8863    2016-17
8866    2016-17
8871    2016-17
8876    2016-17
8879    2016-17
8884    2016-17
8887    2016-17
8892    2016-17
8897    2016-17
8902    2016-17
8905    2016-17
8910    2016-17
8915    2016-17
8920    2016-17
8925    2016-17
8927    2016-17
8929    2016-17
8940    2016-17
8945    2016-17
8950    2016-17
8955    2016-17
8960    2016-17
8965    2016-17
8970    2016-17
Name: year, Length: 1823, dtype: object
In [301]:
#demo_table['Diversity Index'] = demo_table['economic_need_index']
In [302]:
endpoint_url = 'https://data.cityofnewyork.us/resource/r2nx-nhxe.csv'
In [303]:
# Load schools
school_location_table = load(
    endpoint_url,
    buffer_size=1000)
school_location_table[:5]
https://data.cityofnewyork.us/resource/r2nx-nhxe.csv?$limit=1000
1000
1823
Out[303]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
:@computed_region_92fq_4b7q :@computed_region_efsh_h5xi :@computed_region_f5dn_yrer :@computed_region_sbqj_enih :@computed_region_yeji_bk3q admin_district_location_code administrative_district_name ats_system_code beds_number borough_block_lot ... primary_building_code principal_name principal_phone_number principal_title school_support_team_leader_name school_support_team_name state_code status_descriptions x_coordinate y_coordinate
0 50.0 11729.0 70.0 5.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M015 310100010015 1003740020 ... M015 IRENE SANCHEZ 212-228-8730 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 990141.0 202349.0
1 50.0 11724.0 70.0 5.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M019 310100010019 1004530034 ... M019 JACQUELINE FLANAGAN 212-533-5340 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 988547.0 205239.0
2 32.0 11723.0 70.0 4.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M020 310100010020 1003550001 ... M020 SARAH PINTO VIAGRAN 212-254-9577 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 988044.0 202068.0
3 50.0 11729.0 70.0 5.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M034 310100010034 1003810038 ... M034 ANGELIKI LOUKATOS 212-228-4433 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 991163.0 203782.0
4 50.0 11729.0 70.0 5.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M063 310100010063 1004310014 ... M063 DARLENE CAMERON 212-674-3180 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 988071.0 203210.0
<p>5 rows × 48 columns</p>
In [304]:
school_location_table.iloc[0]
Out[304]:
:@computed_region_92fq_4b7q                                                         50
:@computed_region_efsh_h5xi                                                      11729
:@computed_region_f5dn_yrer                                                         70
:@computed_region_sbqj_enih                                                          5
:@computed_region_yeji_bk3q                                                          4
admin_district_location_code                                                      M801
administrative_district_name                              COMMUNITY SCHOOL DISTRICT 01
ats_system_code                                                           01M015      
beds_number                                                               310100010015
borough_block_lot                                                           1003740020
census_tract                                                                      2601
community_district                                                                 103
community_school_sup_name                                           PHILLIPS, DANIELLA
council_district                                                                     2
fax_number                                                                212-477-0931
field_support_center_leader_name                                             CHU, YUET
field_support_center_name                             Field Support Center - Manhattan
fiscal_year                                                                       2018
geographical_district_code                                                           1
grades_final_text                                                 PK,0K,01,02,03,04,05
grades_text                                                    PK,0K,01,02,03,04,05,SE
highschool_network_location_code                                                   NaN
highschool_network_name                                                            NaN
highschool_network_superintendent                                                  NaN
location_1                                                POINT (-73.978747 40.722075)
location_1_address                                                   333 EAST 4 STREET
location_1_city                                                              MANHATTAN
location_1_state                                                                    NY
location_1_zip                                                                   10009
location_category_description                                               Elementary
location_code                                                                     M015
location_name                                                P.S. 015 Roberto Clemente
location_type_description                                             General Academic
managed_by_name                                                                    DOE
nta                                                                               MN28
nta_name                             Lower East Side                               ...
open_date                                                      1904-07-01T00:00:00.000
primary_address_line_1                                              333 EAST  4 STREET
primary_building_code                                                             M015
principal_name                                                           IRENE SANCHEZ
principal_phone_number                                                    212-228-8730
principal_title                                                              PRINCIPAL
school_support_team_leader_name                                                    NaN
school_support_team_name                              School Support Team 3- Manhattan
state_code                                                                          NY
status_descriptions                                                               Open
x_coordinate                                                                    990141
y_coordinate                                                                    202349
Name: 0, dtype: object
In [305]:
school_location_table.iloc[0]['location_1']
Out[305]:
'POINT (-73.978747 40.722075)'
In [306]:
school_location_table.iloc[0]['ats_system_code']
Out[306]:
'01M015      '
In [307]:
school_location_table['DBN'] = school_location_table['ats_system_code'].str.strip()
In [308]:
school_location_table.iloc[0]['DBN']
Out[308]:
'01M015'
In [309]:
school_location_table = school_location_table.rename(columns={
    'location_1': 'WKT',
    'location_name': 'School Name',
})
In [310]:
trimmed_school_location_table = school_location_table[[
    'DBN',
    'WKT',
    'School Name',
]]
In [311]:
school_table = pd.merge(
    trimmed_school_location_table,
    demo_table,
    left_on='DBN',
    right_on='dbn')
len(school_table)
Out[311]:
1802
In [312]:
school_table = school_table.dropna(subset=['WKT'])
In [313]:
len(school_table)
Out[313]:
1801
In [314]:
school_table.iloc[0]
Out[314]:
DBN                                                                 01M015
WKT                                           POINT (-73.978747 40.722075)
School Name                                      P.S. 015 Roberto Clemente
asian_1                                                                 14
asian_2                                                                7.9
black_1                                                                 51
black_2                                                               28.7
dbn                                                                 01M015
economic_need_index                                                  89.2%
english_language_learners_1                                             12
english_language_learners_2                                            6.7
female_1                                                                83
female_2                                                              46.6
grade_1                                                                 33
grade_10                                                                 0
grade_11                                                                 0
grade_12                                                                 0
grade_2                                                                 27
grade_3                                                                 31
grade_4                                                                 24
grade_5                                                                 18
grade_6                                                                  0
grade_7                                                                  0
grade_8                                                                  0
grade_9                                                                  0
grade_k                                                                 28
grade_pk_half_day_full_day                                              17
hispanic_1                                                             105
hispanic_2                                                              59
male_1                                                                  95
male_2                                                                53.4
multiple_race_categories_not_represented_1                               4
multiple_race_categories_not_represented_2                             2.2
poverty_1                                                              152
poverty_2                                                             85.4
school_name                                      P.S. 015 Roberto Clemente
students_with_disabilities_1                                            51
students_with_disabilities_2                                          28.7
total_enrollment                                                       178
white_1                                                                  4
white_2                                                                2.2
year                                                               2016-17
diversity_index                                                    35.2727
Name: 0, dtype: object
In [315]:
dist_table.iloc[0]
Out[315]:
school_dis                                                        16
shape_area                                               4.67636e+07
shape_leng                                                   35848.9
geometry_object    POLYGON ((-73.93311862859143 40.69579115384632...
geometry_layer       geo_export_237252e0-27f5-439e-a824-157f1aa920ad
geometry_proj4                   +proj=longlat +ellps=WGS84 +no_defs
Name: 0, dtype: object
In [316]:
geometry_wkt = school_table.iloc[0]['WKT']
In [317]:
geometry_wkt
Out[317]:
'POINT (-73.978747 40.722075)'
In [318]:
from shapely import wkt
In [319]:
g = wkt.loads(geometry_wkt)
In [320]:
g
Out[320]:
<shapely.geometry.point.Point at 0x7f536f5cec18>
In [321]:
district_polygons = dist_table['geometry_object']
In [322]:
flags = [x.contains(g) for x in district_polygons]

flags
Out[322]:
[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]
In [323]:
import numpy as np
In [324]:
index = np.argmax(flags)
In [325]:
dist_table.iloc[index]
Out[325]:
school_dis                                                         1
shape_area                                               3.51607e+07
shape_leng                                                   28641.3
geometry_object    POLYGON ((-73.97177410965313 40.72582128133706...
geometry_layer       geo_export_237252e0-27f5-439e-a824-157f1aa920ad
geometry_proj4                   +proj=longlat +ellps=WGS84 +no_defs
Name: 17, dtype: object
In [326]:
index
Out[326]:
17
In [327]:
len(flags)
Out[327]:
33
In [328]:
int(dist_table.iloc[index]['school_dis'])
Out[328]:
1
In [329]:
len(school_table)
Out[329]:
1801
In [330]:
from geotable import ColorfulGeometryCollection, GeoTable
In [331]:
dist_table
Out[331]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
school_dis shape_area shape_leng geometry_object geometry_layer geometry_proj4
0 16.0 4.676362e+07 35848.904605 POLYGON ((-73.93311862859143 40.69579115384632... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
1 32.0 5.189850e+07 37251.057847 POLYGON ((-73.91180710069435 40.70343495202662... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
2 6.0 9.634170e+07 70447.849084 POLYGON ((-73.92640556921116 40.87762147653734... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
3 31.0 1.604472e+09 434471.412859 (POLYGON ((-74.05050806403247 40.5664220341608... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
4 7.0 9.226247e+07 65294.452403 (POLYGON ((-73.89680883223774 40.7958084451597... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
5 23.0 4.740069e+07 40317.452033 (POLYGON ((-73.92044366203014 40.6656262871675... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
6 29.0 4.201981e+08 135035.241651 POLYGON ((-73.73816144093141 40.72895809117297... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
7 26.0 4.247909e+08 125677.678898 POLYGON ((-73.74344992332192 40.77824115291502... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
8 15.0 1.961534e+08 153439.165680 POLYGON ((-73.98633135042395 40.69105051012824... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
9 17.0 1.284414e+08 68341.398899 POLYGON ((-73.92044366203014 40.6656262871675,... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
10 19.0 2.034175e+08 184183.167312 (POLYGON ((-73.846736514711 40.60485301485166,... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
11 13.0 1.048706e+08 86635.210559 POLYGON ((-73.97906084911834 40.70594602894087... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
12 18.0 1.751488e+08 121184.158477 (POLYGON ((-73.86706149472118 40.5820879767934... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
13 20.0 2.426965e+08 94309.778946 POLYGON ((-74.02552971543656 40.65147855069281... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
14 3.0 1.134889e+08 52072.051321 POLYGON ((-73.95671863064405 40.78660079332199... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
15 5.0 5.251977e+07 44469.588221 POLYGON ((-73.93515659239551 40.83268240623763... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
16 9.0 8.341539e+07 46648.958586 POLYGON ((-73.9212971968614 40.85428933985649,... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
17 1.0 3.516075e+07 28641.276279 POLYGON ((-73.97177410965313 40.72582128133706... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
18 14.0 1.503102e+08 95792.082090 POLYGON ((-73.95439555417087 40.73911477252251... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
19 4.0 5.262043e+07 52061.828459 (POLYGON ((-73.92133752419399 40.8008521064970... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
20 10.0 2.825415e+08 94957.570434 POLYGON ((-73.86789798628736 40.90294017690526... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
21 12.0 6.907182e+07 48527.595776 POLYGON ((-73.88284445574813 40.84781722645163... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
22 25.0 4.436285e+08 175827.007127 POLYGON ((-73.82049919995312 40.80101146781899... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
23 28.0 2.475679e+08 114694.912786 POLYGON ((-73.84485477879177 40.7357514698091,... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
24 24.0 3.949782e+08 127343.703736 (POLYGON ((-73.90641585511733 40.7398683641967... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
25 30.0 3.181290e+08 150392.978241 POLYGON ((-73.90647314610101 40.79018117520807... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
26 21.0 2.101971e+08 123858.087345 POLYGON ((-73.96184657346174 40.62757081710622... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
27 22.0 3.855533e+08 271718.504936 (POLYGON ((-73.91990064270161 40.5996005215871... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
28 27.0 7.955970e+08 589135.490708 (POLYGON ((-73.82784008953526 40.5887858248046... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
29 11.0 3.926651e+08 305305.869806 (POLYGON ((-73.78833349834532 40.8346671297593... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
30 8.0 2.588266e+08 223080.044096 (POLYGON ((-73.83979488562292 40.8356192069902... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
31 2.0 2.804004e+08 212406.819436 (POLYGON ((-74.0438776163991 40.69018767537123... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
32 10.0 3.282963e+06 7883.372664 (POLYGON ((-73.9089323517538 40.8721573479701,... geo_export_237252e0-27f5-439e-a824-157f1aa920ad +proj=longlat +ellps=WGS84 +no_defs
In [332]:
dist_table.geometry_object[0].wkt
Out[332]:
'POLYGON ((-73.93311862859143 40.69579115384632, -73.93236605675587 40.69535837729649, -73.9319289006206 40.6951146575314, -73.93177404555487 40.69502821484371, -73.93147184545109 40.69486090242515, -73.93121199886629 40.69471276978224, -73.93114516482393 40.69467397486358, -73.930511465969 40.69431652304853, -73.92970192402746 40.69386339761171, -73.92882897891457 40.69335047407161, -73.92882834168937 40.6933500992537, -73.92864268981751 40.69324101255102, -73.92850659671787 40.69316572635911, -73.92804330687132 40.69290417236994, -73.92781120671985 40.6927742086535, -73.92750685270927 40.69260381145623, -73.92707069461525 40.69234622359373, -73.92644992133326 40.69200176882866, -73.92631612531525 40.69192845396802, -73.92556064122292 40.691490840003, -73.92534903613239 40.69136694183798, -73.92439183303664 40.69083174877439, -73.92429213268012 40.69077335685399, -73.92364379835564 40.6903988375817, -73.92316966710699 40.69012340599072, -73.92280702359515 40.68992392534457, -73.92207133797079 40.68948984079818, -73.92130587808545 40.68907164359784, -73.92121369950202 40.68901947711544, -73.92096103866248 40.68887205183747, -73.92055013077213 40.68864092211603, -73.91980961874593 40.6882112414115, -73.91906109224908 40.68778436875932, -73.91878323293736 40.68762608844376, -73.91829345832642 40.68735666781527, -73.91804606958546 40.68721324726834, -73.91768113918805 40.68700409429878, -73.91755226067554 40.68693022304933, -73.91679832884886 40.68649719987184, -73.91679482480477 40.68647585566188, -73.91677745031008 40.68637001456988, -73.9167269298151 40.6861315560806, -73.91739374561757 40.68605758524438, -73.91784937410198 40.68623316741141, -73.91800616261635 40.6858220326253, -73.91770109452972 40.68550035866507, -73.9178810736956 40.68509394091895, -73.91755763291395 40.68476719192117, -73.91776142009367 40.68432465483016, -73.91741294336087 40.68403405152289, -73.91758679937763 40.68391130921148, -73.91754176551858 40.68365836480015, -73.91749199778137 40.68340840097156, -73.91726632656643 40.68330322455323, -73.91748255687848 40.68289735539763, -73.91712138528143 40.68256847168654, -73.91732345753172 40.68244571271426, -73.9172785838021 40.68218148175983, -73.91722749280017 40.68193500491278, -73.9169763289098 40.68183415891587, -73.91717325370381 40.68163401677827, -73.91713117516936 40.68143498166116, -73.91708045205013 40.68119847288923, -73.91683011483136 40.68110098072739, -73.91676207442167 40.68075559994087, -73.91668638370811 40.68037137878948, -73.9165399872181 40.67963749735295, -73.91639300075401 40.67890584888099, -73.91630119795475 40.67857711243357, -73.91635492184039 40.6778235366874, -73.91645747530021 40.67672602823275, -73.91646486032333 40.67664704004651, -73.91647371343048 40.67655700413084, -73.91654436298559 40.67583843461703, -73.91661697848021 40.67505994757939, -73.91668911898741 40.67428057620312, -73.91677231309316 40.67339069306274, -73.91721449620965 40.67385599371995, -73.91907827608151 40.67397531376398, -73.91954092546408 40.67354187795393, -73.9200025573228 40.67398128283877, -73.92158580075649 40.67405555683759, -73.92213153379178 40.67368388950964, -73.92298622413946 40.67373038435373, -73.92309797315002 40.67373646296576, -73.92324436873267 40.67374442565551, -73.92336008453272 40.67375071908456, -73.92508152301917 40.67384433817344, -73.92550081445938 40.67361688849451, -73.92743998148244 40.67370890462769, -73.92784085188026 40.67399598368088, -73.92788485920073 40.67355077838425, -73.92792887672408 40.67310556484484, -73.9282445167355 40.67293933694761, -73.9282717980865 40.67252471210702, -73.92801317657924 40.67220920817189, -73.92826528535652 40.67222507724347, -73.92833507240064 40.67222889379835, -73.92843446083054 40.67223432828423, -73.92851854078968 40.67223892474855, -73.92862604211787 40.67224480269724, -73.92870956326151 40.67224936969293, -73.92882833736527 40.67225586323351, -73.92891232629145 40.67226045499795, -73.92939948737151 40.67228708901998, -73.9307932883315 40.67236495699643, -73.93355357161703 40.67251450202632, -73.9363431994143 40.67269957696197, -73.93667540626117 40.6727143953574, -73.93682717835949 40.67271928165021, -73.93708255238936 40.67272750062104, -73.93754574483314 40.672742485227, -73.93753568983917 40.67332211170566, -73.93753114689092 40.67358374466557, -73.93752485721895 40.67394597310864, -73.93751992857378 40.67422980176767, -73.93806258654983 40.67447201037736, -73.93832822275243 40.67458578836195, -73.93889518673745 40.67459873386304, -73.93877769674783 40.67472089131839, -73.93873696341934 40.67538905896871, -73.93884149817444 40.67549215171548, -73.93843297207545 40.67556514632768, -73.93852793096612 40.67617239034053, -73.93876783935517 40.67627240938755, -73.93853686439037 40.67634626398586, -73.93843334915296 40.67693817086859, -73.9386949143591 40.67705082000203, -73.93851537899995 40.67741865687472, -73.93863289914781 40.67776992010254, -73.93862492563589 40.67786256958296, -73.9386168305744 40.67794381002248, -73.93850818855235 40.67903432002991, -73.93843613326652 40.67978585170399, -73.93739763665229 40.67972990017674, -73.93764243456839 40.68096300863983, -73.93778801579033 40.68169433234844, -73.9348625715524 40.68203056869989, -73.93500821720075 40.68276376850579, -73.93515472893986 40.68349718465807, -73.93807695931454 40.68316090298936, -73.9382232586039 40.68389329800515, -73.93836816506894 40.68462560189113, -73.93851929553905 40.68535451524578, -73.93865905188188 40.68609038907362, -73.93880456947376 40.68682474574926, -73.93895074350998 40.68755641035276, -73.93938561082579 40.68787113499809, -73.9413859614605 40.6876682688049, -73.94178527584324 40.68722837212115, -73.9423520278467 40.68759150082202, -73.94428391577316 40.68738505531383, -73.94462127846789 40.68690436166384, -73.94471297541574 40.68737334166219, -73.94472684342399 40.68744426893098, -73.94476925905255 40.6876366000215, -73.94489497385723 40.68828685373113, -73.94491170954201 40.68837022260124, -73.94505645129829 40.68910234632374, -73.94519881242742 40.68983988030254, -73.94534976297854 40.69056729694257, -73.94549553209639 40.69130013188384, -73.94564166805849 40.69203345190423, -73.94578713653169 40.69276523278351, -73.94593170750511 40.69349761558435, -73.94607851790597 40.69423024360605, -73.94324249086142 40.69455748523078, -73.94338802084518 40.69528898951384, -73.94352527471476 40.69603085523811, -73.94335303762136 40.69605059879613, -73.94321604415771 40.69606631370468, -73.94279300209647 40.69611479586432, -73.94242149063366 40.69615738067107, -73.9423368032754 40.69616708769403, -73.94226167920422 40.696175698798, -73.94217061244066 40.69618613693807, -73.94208643350876 40.69619578590027, -73.94198436724183 40.69620748536691, -73.94152445883948 40.6962601986313, -73.94070625153131 40.69635305387921, -73.94040726721202 40.69638987090211, -73.94020994337993 40.69641107787343, -73.94003822508267 40.69643083533578, -73.93976769964368 40.69646329104629, -73.93948446000253 40.69649454647407, -73.93917199789162 40.69653049512544, -73.93884866773332 40.69656769159932, -73.93867948785814 40.69658715370254, -73.93779110726068 40.69668934223279, -73.93763027449141 40.69595086258837, -73.93755700214285 40.69558451152044, -73.93748376644189 40.6952184321377, -73.9370544081801 40.69553123358047, -73.9348298043286 40.69575489184654, -73.9345601174396 40.69555483174558, -73.93463155014204 40.69592163383321, -73.93470512606679 40.69628830975414, -73.9347969968416 40.69673659235447, -73.93468892593663 40.69667495388458, -73.9346067517839 40.69662808583959, -73.93457999376531 40.69661282447046, -73.93448784825009 40.69656026847871, -73.93413397294013 40.69635843140355, -73.93382817155928 40.69618427730787, -73.93311862859143 40.69579115384632))'
In [333]:
school_table.WKT[0]
Out[333]:
'POINT (-73.978747 40.722075)'
In [334]:
len(dist_table.school_dis.unique())
Out[334]:
32
In [335]:
# aggregate districts
from shapely import wkt
import numpy as np

district_polygons = dist_table['geometry_object']

def get_district(geometry_wkt):
    g = wkt.loads(geometry_wkt)
    #flags = [x.contains(g) for x in district_polygons]
    flags = [x.intersects(g) for x in district_polygons]
    index = np.argmax(flags)
    district = int(dist_table.iloc[index]['school_dis'])
    return district

school_table['district'] = school_table['WKT'].apply(get_district)
In [336]:
len(school_table.district.unique())
Out[336]:
32
In [337]:
len(dist_table)
Out[337]:
33
In [338]:
mean_table = school_table.groupby('district').mean()
In [339]:
mean_table = mean_table.reset_index()
In [340]:
mean_table = mean_table[[
    'district',
    'diversity_index',
]]
In [341]:
t = pd.merge(dist_table, mean_table, left_on='school_dis', right_on='district')
In [342]:
map_table = t.copy()
In [346]:
map_table['FillBlues'] = map_table.diversity_index
In [ ]:
school_map_table = school_table.copy()
In [ ]:
school_map_table['geometry_object'] = school_map_table.WKT.apply(wkt.loads)
In [ ]:
district_map_table = t.copy()
district_map_table['FillBlues'] = district_map_table['diversity_index']
school_map_table = school_table.copy()
#school_map_table['FillBlues'] = district_map_table['economic_need_index'].max()
map_table = pd.concat([
	#district_map_table,
school_map_table,
], sort=False)
In [ ]:
url = 'https://data.cityofnewyork.us/api/views/vh2h-md7a/rows.csv'
#school_graduation_table = pd.read_csv(url)
#len(school_graduation_table)
In [ ]:
#school_graduation_table.iloc[0]
In [ ]:
#school_graduation_table.groupby('DBN').mean()[:5]
In [ ]:
#school_graduation_table = school_graduation_table[['DBN', 'School Name', 'Total Grads - % of cohort']].copy()
#len(school_graduation_table)
In [ ]:
#school_graduation_table = school_graduation_table.dropna()
#sum(school_graduation_table['Total Grads - % of cohort'] == 's')
In [284]:
# Define which values mean null

school_graduation_table = pd.read_csv(url, na_values=['s'])
school_graduation_table = school_graduation_table[['DBN', 'School Name', selected_outcome]].copy()
school_graduation_table = school_graduation_table.dropna()

len(school_graduation_table)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-284-141f23cd6165> in <module>
      2 
      3 school_graduation_table = pd.read_csv(url, na_values=['s'])
----> 4 school_graduation_table = school_graduation_table[['DBN', 'School Name', selected_outcome]].copy()
      5 school_graduation_table = school_graduation_table.dropna()
      6 

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2932                 key = list(key)
   2933             indexer = self.loc._convert_to_indexer(key, axis=1,
-> 2934                                                    raise_missing=True)
   2935 
   2936         # take() does not accept boolean indexers

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
   1352                 kwargs = {'raise_missing': True if is_setter else
   1353                           raise_missing}
-> 1354                 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
   1355         else:
   1356             try:

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1159         self._validate_read_indexer(keyarr, indexer,
   1160                                     o._get_axis_number(axis),
-> 1161                                     raise_missing=raise_missing)
   1162         return keyarr, indexer
   1163 

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1244                 raise KeyError(
   1245                     u"None of [{key}] are in the [{axis}]".format(
-> 1246                         key=key, axis=self.obj._get_axis_name(axis)))
   1247 
   1248             # We (temporarily) allow for some missing keys with .loc, except in

KeyError: "None of [Index(['DBN', 'School Name', 'Total Grads - % of cohort'], dtype='object')] are in the [columns]"
In [ ]:
school_graduation_table.dtypes
In [ ]:
school_graduation_table
In [ ]:
school_graduation_table = school_graduation_table.groupby('DBN').mean()
school_graduation_table[:5]
In [ ]:
endpoint_url = 'https://data.cityofnewyork.us/resource/r2nx-nhxe.csv'
In [ ]:
# Load schools
school_location_table = load(
    endpoint_url,
    buffer_size=1000)
school_location_table[:5]
In [ ]:
school_location_table.iloc[0]
In [ ]:
school_location_table.iloc[0]['location_1']
In [ ]:
school_location_table.iloc[0]['ats_system_code']
In [ ]:
school_location_table['DBN'] = school_location_table['ats_system_code'].str.strip()
In [ ]:
school_location_table.iloc[0]['DBN']
In [ ]:
school_location_table = school_location_table.rename(columns={
    'location_1': 'WKT',
    'location_name': 'School Name',
})
In [ ]:
trimmed_school_location_table = school_location_table[[
    'DBN',
    'WKT',
    'School Name',
]]
In [ ]:
merged_school_table = pd.merge(
    school_table,
    school_graduation_table,
    left_on='DBN',
    right_on='DBN')
len(school_table)
In [ ]:
merged_school_table.iloc[0]
In [ ]:
sorted_school_table = merged_school_table.sort_values(selected_outcome, ascending=False)
In [ ]:
sorted_school_table[-10:]
In [ ]:
sorted_school_table[-3:]
In [ ]:
school_table.iloc[0]
In [ ]:
# getting top and bottom performing schools

a_school_map_table = sorted_school_table[:10].copy()
a_school_map_table = a_school_map_table[['WKT', 'School Name','total_enrollment']]
a_school_map_table = a_school_map_table.rename({
'School Name': 'Name',
})
a_school_map_table['FillColor'] = 'g'
In [ ]:
len(sorted_school_table)
In [ ]:
sorted_school_table['School Name'].nunique()
In [ ]:
sorted_school_table[30:]
In [ ]:
len(sorted_school_table.diversity_index)
In [ ]:
#max(sorted_school_table.diversity_index)
In [ ]:
b_school_map_table = sorted_school_table[-10:].copy()
b_school_map_table = b_school_map_table[['WKT', 'School Name','total_enrollment']]
b_school_map_table = b_school_map_table.rename({
'School Name': 'Name',
})
b_school_map_table['FillColor'] = 'r'
In [ ]:
t = pd.merge(dist_table, mean_table, left_on='school_dis', right_on='district')
In [ ]:
t[:3]
In [ ]:
district_map_table = t.copy()
district_map_table['WKT'] = district_map_table['geometry_object'].apply(
lambda x: x.wkt)
# Define FillColor
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(1, 100))
In [ ]:
array = district_map_table['diversity_index'].values.reshape(-1, 1)
scaler.fit(array)
district_map_table['scaled_index'] = scaler.transform(array)
In [ ]:
max(district_map_table['diversity_index'])
In [ ]:
district_map_table.head()
In [ ]:
district_map_table.loc[district_map_table['scaled_index'] < 25, 'FillColor'] = '#ffffcc'
In [ ]:
district_map_table.loc[(district_map_table['scaled_index'] >= 25) 
                       & (district_map_table['scaled_index'] < 50), 'FillColor'] = '#fed976'
In [ ]:
district_map_table.loc[(district_map_table['scaled_index'] >= 50) 
                       & (district_map_table['scaled_index'] < 75), 'FillColor'] = '#fd8d3c'
In [ ]:
district_map_table.loc[(district_map_table['scaled_index'] >= 75), 'FillColor'] = '#e31a1c'
In [ ]:
district_map_table
In [ ]:
# EOF error

a_school_map_table = a_school_map_table.rename(columns={
 	'School Name': 'Name',
 })
b_school_map_table = b_school_map_table.rename(columns={
 	'School Name': 'Name',
 })
district_map_table['Name'] = district_map_table['district']
In [ ]:
#a_school_map_table['radiusInPixels'] = 10
In [ ]:
#b_school_map_table['radiusInPixels'] = 10
In [ ]:
# Extract columns
map_table = pd.concat([
district_map_table[['WKT','FillColor','Name']],
a_school_map_table[['WKT','FillColor','Name']],
b_school_map_table[['WKT','FillColor','Name']],
])
In [ ]:
#top_schools[:3]
In [ ]:
district_map_table
In [ ]:
type(district_map_table)
In [ ]:
target_path = target_folder + '/choropleth.csv'
map_table.to_csv(target_path, index=False)
print('a_geotable_path = %s' % target_path)
In [ ]:
map_table
In [ ]:
len(school_map_table)
In [ ]:
max(district_map_table['scaled_index'])
In [ ]:
#%matplotlib inline
#t['economic_need_index'].plot(kind='bar')
In [ ]:
sorted_school_table.to_csv('data.csv', index=False)
In [ ]:
sorted_school_table.iloc[0]
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Fixing random state for reproducibility
np.random.seed(19680801)


x = sorted_school_table[selected_attribute].values
y = sorted_school_table[selected_outcome].values
#colors = np.random.rand(N)
#area = (30 * np.random.rand(N))**2  # 0 to 15 point radii

#axes = plt.scatter(x, y, s=area, c=colors, alpha=0.5)
axes = plt.scatter(x, y, alpha=0.5)
plt.ylabel('Graduation Rate')
plt.xlabel('Diversity Index')
plt.show()
In [ ]:
plt.scatter?
In [ ]:
# Save file to target folder to include it in the result download
target_path = target_folder + '/c.png'
figure = axes.get_figure()
figure.savefig(target_path)
print(f'c_image_path = {target_path}')
In [ ]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
In [ ]:
X = sorted_school_table[['diversity_index']].values
X
In [ ]:
y = sorted_school_table[selected_outcome].values
y
In [ ]:
model.fit(X, y)
In [ ]:
from sklearn.model_selection import cross_val_score
models = []
scores = []

def train(model, X):
    model.fit(X, y)
    models.append(model)
    score = cross_val_score(
        model, X, y, cv=3,
        scoring='neg_mean_absolute_error',
    ).mean()
    scores.append(score)
    return score
In [ ]:
model_scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
print('average_negative_mean_error = %s' % np.mean(model_scores))
In [ ]: