NYC HS Graduation Rate Estimator




Pay Notebook Creator: Jendri Morocho0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
cig_data = pd.read_excel("cig_data_trimmed_down.xlsx")
cig_data = cig_data[['Business_Name', 'Borough_Code', 'Longitude', 'Latitude']]
cig_data.head()
Out[2]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Business_Name Borough_Code Longitude Latitude
0 NEW STAR TOBACCO INC. 3 -74.007600 40.647730
1 BAJWA FOOD MARKET LLC 4 -73.859122 40.745863
2 SLOPE FOODS, INC. 3 -73.977380 40.680945
3 M & S DELI OF S.I. INC 5 -74.134515 40.625896
4 515 DELI CORP 1 -73.978312 40.741459
In [3]:
schools = pd.read_csv("Dummified_School_data.csv", na_values= 's')
schools.head()
Out[3]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Demographic DBN School Name Cohort Total Cohort Total Grads - n Total Grads - % of cohort Total Regents - n Total Regents - % of cohort Total Regents - % of grads ... Still Enrolled - % of cohort Dropped Out - n Dropped Out - % of cohort Latitude Longitude boro_num1 boro_num2 boro_num3 boro_num4 boro_num5
0 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2003 5 NaN NaN NaN NaN NaN ... NaN NaN NaN 40.713684 -73.986336 1 0 0 0 0
1 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2004 55 37.0 67.3 17.0 30.9 45.9 ... 27.3 3.0 5.5 40.713684 -73.986336 1 0 0 0 0
2 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2005 64 43.0 67.2 27.0 42.2 62.8 ... 14.1 9.0 14.1 40.713684 -73.986336 1 0 0 0 0
3 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2006 78 43.0 55.1 36.0 46.2 83.7 ... 20.5 11.0 14.1 40.713684 -73.986336 1 0 0 0 0
4 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2006 Aug 78 44.0 56.4 37.0 47.4 84.1 ... 19.2 11.0 14.1 40.713684 -73.986336 1 0 0 0 0
<p>5 rows × 30 columns</p>
In [4]:
schools_2006 = schools[schools['Cohort'] == '2006']
schools_2006 = schools_2006[["Total Cohort",
        "Total Grads - n",'Advanced Regents - n',"Latitude","Longitude","boro_num1",
                            "boro_num2","boro_num3","boro_num4","boro_num5"]]
schools_2006 = schools_2006.reset_index(drop = True)
schools_2006.head()
Out[4]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Total Cohort Total Grads - n Advanced Regents - n Latitude Longitude boro_num1 boro_num2 boro_num3 boro_num4 boro_num5
0 78 43.0 0.0 40.713684 -73.986336 1 0 0 0 0
1 124 53.0 8.0 40.712399 -73.984497 1 0 0 0 0
2 90 70.0 0.0 40.729589 -73.982555 1 0 0 0 0
3 84 47.0 17.0 40.720581 -73.985645 1 0 0 0 0
4 193 105.0 69.0 40.720600 -73.985600 1 0 0 0 0
In [5]:
schools_2006_v3 = schools_2006.dropna().copy()
schools_2006_v3 = schools_2006_v3.reset_index(drop = True)
In [6]:
store_xys = cig_data[['Longitude','Latitude']].values
store_xys
Out[6]:
array([[-74.00760005,  40.6477301 ],
       [-73.85912156,  40.74586294],
       [-73.97737959,  40.68094544],
       ...,
       [-73.93894298,  40.8218358 ],
       [-74.10292214,  40.61667366],
       [-73.88995435,  40.67277599]])
In [7]:
# Make school kdtree 
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES

store_tree = KDTree(store_xys,distance_metric='Arc', radius= RADIUS_EARTH_MILES)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/lib/weights/util.py:19: UserWarning: geopandas not available. Some functionality will be disabled.
  warn('geopandas not available. Some functionality will be disabled.')
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/model/spvcm/abstracts.py:10: UserWarning: The `dill` module is required to use the sqlite backend fully.
  from .sqlite import head_to_sql, start_sql
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-7-80a62d3d1f83> in <module>
      1 # Make school kdtree
----> 2 from pysal.cg.kdtree import KDTree
      3 from pysal.cg import RADIUS_EARTH_MILES
      4 
      5 store_tree = KDTree(store_xys,distance_metric='Arc', radius= RADIUS_EARTH_MILES)

ModuleNotFoundError: No module named 'pysal.cg'
In [ ]:
radius_in_miles = 0.3
store_count = len(cig_data)
def get_store_count(r):
    xy = r['Longitude'], r['Latitude']
    distance , indicies = store_tree.query(
        xy, k = store_count , distance_upper_bound= radius_in_miles
    )
    indicies = indicies[~np.isnan(indicies)]
    return len(indicies)
In [ ]:
schools_2006_v3['nearby_store_count'] = schools_2006_v3.apply(
    get_store_count, axis = 1
)
In [ ]:
len(schools_2006_v3)
In [8]:
indicies = np.random.randint(0,len(schools_2006_v3),30)

df_tmp = ((schools_2006_v3.iloc[indicies])[["Total Cohort","Total Grads - n",'Advanced Regents - n',"boro_num1",
                          "boro_num2","boro_num3","boro_num4","boro_num5",'nearby_store_count']])
len(df_tmp)

schools_2006_v3 = schools_2006_v3.drop(indicies)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-8-9d24801bbdb4> in <module>
      2 
      3 df_tmp = ((schools_2006_v3.iloc[indicies])[["Total Cohort","Total Grads - n",'Advanced Regents - n',"boro_num1",
----> 4                           "boro_num2","boro_num3","boro_num4","boro_num5",'nearby_store_count']])
      5 len(df_tmp)
      6 

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2932                 key = list(key)
   2933             indexer = self.loc._convert_to_indexer(key, axis=1,
-> 2934                                                    raise_missing=True)
   2935 
   2936         # take() does not accept boolean indexers

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
   1352                 kwargs = {'raise_missing': True if is_setter else
   1353                           raise_missing}
-> 1354                 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
   1355         else:
   1356             try:

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1159         self._validate_read_indexer(keyarr, indexer,
   1160                                     o._get_axis_number(axis),
-> 1161                                     raise_missing=raise_missing)
   1162         return keyarr, indexer
   1163 

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1250             if not(self.name == 'loc' and not raise_missing):
   1251                 not_found = list(set(key) - set(ax))
-> 1252                 raise KeyError("{} not in index".format(not_found))
   1253 
   1254             # we skip the warning on Categorical/Interval

KeyError: "['nearby_store_count'] not in index"
In [9]:
len(schools_2006_v3)
Out[9]:
328
In [23]:
df_tmp.to_csv("Testing_Data.csv",index = False)
schools_2006_v3.to_csv("Clean School Data 2006.csv",index = False)