In [1]:

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:

cig_data = pd.read_excel("cig_data_trimmed_down.xlsx")
cig_data = cig_data[['Business_Name', 'Borough_Code', 'Longitude', 'Latitude']]
cig_data.head()

Out[2]:

	Business_Name	Borough_Code	Longitude	Latitude
0	NEW STAR TOBACCO INC.	3	-74.007600	40.647730
1	BAJWA FOOD MARKET LLC	4	-73.859122	40.745863
2	SLOPE FOODS, INC.	3	-73.977380	40.680945
3	M & S DELI OF S.I. INC	5	-74.134515	40.625896
4	515 DELI CORP	1	-73.978312	40.741459

In [3]:

schools = pd.read_csv("Dummified_School_data.csv", na_values= 's')
schools.head()

Out[3]:

	Demographic	DBN	School Name	Cohort	Total Cohort	Total Grads - n	Total Grads - % of cohort	Total Regents - n	Total Regents - % of cohort	Total Regents - % of grads	...	Still Enrolled - % of cohort	Dropped Out - n	Dropped Out - % of cohort	Latitude	Longitude	boro_num1
0	Total Cohort	01M292	HENRY STREET SCHOOL FOR INTERNATIONAL	2003	5	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	40.713684	-73.986336	1
1	Total Cohort	01M292	HENRY STREET SCHOOL FOR INTERNATIONAL	2004	55	37.0	67.3	17.0	30.9	45.9	...	27.3	3.0	5.5	40.713684	-73.986336	1
2	Total Cohort	01M292	HENRY STREET SCHOOL FOR INTERNATIONAL	2005	64	43.0	67.2	27.0	42.2	62.8	...	14.1	9.0	14.1	40.713684	-73.986336	1
3	Total Cohort	01M292	HENRY STREET SCHOOL FOR INTERNATIONAL	2006	78	43.0	55.1	36.0	46.2	83.7	...	20.5	11.0	14.1	40.713684	-73.986336	1
4	Total Cohort	01M292	HENRY STREET SCHOOL FOR INTERNATIONAL	2006 Aug	78	44.0	56.4	37.0	47.4	84.1	...	19.2	11.0	14.1	40.713684	-73.986336	1

<p>5 rows × 30 columns</p>

In [4]:

schools_2006 = schools[schools['Cohort'] == '2006']
schools_2006 = schools_2006[["Total Cohort",
        "Total Grads - n",'Advanced Regents - n',"Latitude","Longitude","boro_num1",
                            "boro_num2","boro_num3","boro_num4","boro_num5"]]
schools_2006 = schools_2006.reset_index(drop = True)
schools_2006.head()

Out[4]:

	Total Cohort	Total Grads - n	Advanced Regents - n	Latitude	Longitude	boro_num1
0	78	43.0	0.0	40.713684	-73.986336	1
1	124	53.0	8.0	40.712399	-73.984497	1
2	90	70.0	0.0	40.729589	-73.982555	1
3	84	47.0	17.0	40.720581	-73.985645	1
4	193	105.0	69.0	40.720600	-73.985600	1

In [5]:

schools_2006_v3 = schools_2006.dropna().copy()
schools_2006_v3 = schools_2006_v3.reset_index(drop = True)

In [6]:

store_xys = cig_data[['Longitude','Latitude']].values
store_xys

Out[6]:

array([[-74.00760005,  40.6477301 ],
       [-73.85912156,  40.74586294],
       [-73.97737959,  40.68094544],
       ...,
       [-73.93894298,  40.8218358 ],
       [-74.10292214,  40.61667366],
       [-73.88995435,  40.67277599]])

In [7]:

# Make school kdtree 
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES

store_tree = KDTree(store_xys,distance_metric='Arc', radius= RADIUS_EARTH_MILES)

/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/lib/weights/util.py:19: UserWarning: geopandas not available. Some functionality will be disabled.
  warn('geopandas not available. Some functionality will be disabled.')
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/model/spvcm/abstracts.py:10: UserWarning: The `dill` module is required to use the sqlite backend fully.
  from .sqlite import head_to_sql, start_sql

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-7-80a62d3d1f83> in <module>
      1 # Make school kdtree
----> 2 from pysal.cg.kdtree import KDTree
      3 from pysal.cg import RADIUS_EARTH_MILES
      4 
      5 store_tree = KDTree(store_xys,distance_metric='Arc', radius= RADIUS_EARTH_MILES)

ModuleNotFoundError: No module named 'pysal.cg'

In [ ]:

radius_in_miles = 0.3
store_count = len(cig_data)
def get_store_count(r):
    xy = r['Longitude'], r['Latitude']
    distance , indicies = store_tree.query(
        xy, k = store_count , distance_upper_bound= radius_in_miles
    )
    indicies = indicies[~np.isnan(indicies)]
    return len(indicies)

In [ ]:

schools_2006_v3['nearby_store_count'] = schools_2006_v3.apply(
    get_store_count, axis = 1
)

In [ ]:

len(schools_2006_v3)

In [8]:

indicies = np.random.randint(0,len(schools_2006_v3),30)

df_tmp = ((schools_2006_v3.iloc[indicies])[["Total Cohort","Total Grads - n",'Advanced Regents - n',"boro_num1",
                          "boro_num2","boro_num3","boro_num4","boro_num5",'nearby_store_count']])
len(df_tmp)

schools_2006_v3 = schools_2006_v3.drop(indicies)

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-8-9d24801bbdb4> in <module>
      2 
      3 df_tmp = ((schools_2006_v3.iloc[indicies])[["Total Cohort","Total Grads - n",'Advanced Regents - n',"boro_num1",
----> 4                           "boro_num2","boro_num3","boro_num4","boro_num5",'nearby_store_count']])
      5 len(df_tmp)
      6 

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2932                 key = list(key)
   2933             indexer = self.loc._convert_to_indexer(key, axis=1,
-> 2934                                                    raise_missing=True)
   2935 
   2936         # take() does not accept boolean indexers

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
   1352                 kwargs = {'raise_missing': True if is_setter else
   1353                           raise_missing}
-> 1354                 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
   1355         else:
   1356             try:

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1159         self._validate_read_indexer(keyarr, indexer,
   1160                                     o._get_axis_number(axis),
-> 1161                                     raise_missing=raise_missing)
   1162         return keyarr, indexer
   1163 

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1250             if not(self.name == 'loc' and not raise_missing):
   1251                 not_found = list(set(key) - set(ax))
-> 1252                 raise KeyError("{} not in index".format(not_found))
   1253 
   1254             # we skip the warning on Categorical/Interval

KeyError: "['nearby_store_count'] not in index"

In [9]:

len(schools_2006_v3)

Out[9]:

In [23]:

df_tmp.to_csv("Testing_Data.csv",index = False)
schools_2006_v3.to_csv("Clean School Data 2006.csv",index = False)

Pay Notebook Creator: Jendri Morocho	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0

NYC HS Graduation Rate Estimator