Prepare and Fit Spatial Regression Models 20190222




Pay Notebook Creator: Roy Hyunjin Han0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0

Prepare Training Dataset to Estimate Graduation Rate from Tree Count

  • If you get -9 errors when running your notebook or previewing your tool, please shutdown other running notebooks and press ESC 0 0 to clear the memory and restart the kernel.
  • Alternatively, you can start your notebook session using SMALL or MEDIUM memory.
In [1]:
import pandas as pd

def load(
    endpoint_url,
    selected_columns=None,
    buffer_size=1000,
    search_term_by_column=None,
    **kw,
):
    buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    for column, search_term in (search_term_by_column or {}).items():
        buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
    print(buffer_url)
    tables = []
    
    if endpoint_url.endswith('.json'):
        f = pd.read_json
    else:
        f = pd.read_csv

    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, ignore_index=True, sort=False)

Load Raw Data

Load Graduation Rates by School Code

In [2]:
url = 'https://data.cityofnewyork.us/api/views/vh2h-md7a/rows.csv'
school_graduation_table = pd.read_csv(url)
len(school_graduation_table)
Out[2]:
25096
In [3]:
school_graduation_table.iloc[0]
Out[3]:
Demographic                                                    Total Cohort
DBN                                                                  01M292
School Name                           HENRY STREET SCHOOL FOR INTERNATIONAL
Cohort                                                                 2003
Total Cohort                                                              5
Total Grads - n                                                           s
Total Grads - % of cohort                                               NaN
Total Regents - n                                                         s
Total Regents - % of cohort                                             NaN
Total Regents - % of grads                                              NaN
Advanced Regents - n                                                      s
Advanced Regents - % of cohort                                          NaN
Advanced Regents - % of grads                                           NaN
Regents w/o Advanced - n                                                  s
Regents w/o Advanced - % of cohort                                      NaN
Regents w/o Advanced - % of grads                                       NaN
Local - n                                                                 s
Local - % of cohort                                                     NaN
Local - % of grads                                                      NaN
Still Enrolled - n                                                        s
Still Enrolled - % of cohort                                            NaN
Dropped Out - n                                                           s
Dropped Out - % of cohort                                               NaN
Name: 0, dtype: object
In [4]:
school_graduation_table.groupby('DBN').mean()[:5]
Out[4]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Total Cohort Total Grads - % of cohort Total Regents - % of cohort Total Regents - % of grads Advanced Regents - % of cohort Advanced Regents - % of grads Regents w/o Advanced - % of cohort Regents w/o Advanced - % of grads Local - % of cohort Local - % of grads Still Enrolled - % of cohort Dropped Out - % of cohort
DBN
01M292 25.870370 61.755556 42.737037 70.448148 0.000000 0.000000 42.737037 70.448148 19.000000 29.551852 20.337037 12.100000
01M448 44.051948 58.637736 36.352830 61.415094 8.462264 13.481132 27.884906 47.928302 22.281132 38.588679 28.264151 9.958491
01M450 36.565789 70.462500 66.075000 93.714583 0.000000 0.000000 66.075000 93.714583 4.383333 6.285417 18.210417 9.985417
01M509 34.797297 49.902381 31.861905 61.852381 10.540476 19.133333 21.321429 42.711905 18.028571 38.150000 30.502381 14.266667
01M515 87.876712 49.151064 36.508511 74.361702 21.876596 44.129787 14.631915 30.225532 12.725532 25.787234 34.219149 16.370213
In [5]:
school_graduation_table = school_graduation_table[['DBN', 'School Name', 'Total Grads - % of cohort']].copy()
len(school_graduation_table)
Out[5]:
25096
In [6]:
school_graduation_table = school_graduation_table.dropna()
sum(school_graduation_table['Total Grads - % of cohort'] == 's')
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pandas/core/ops.py:1649: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  result = method(y)
Out[6]:
0
In [7]:
# Define which values mean null
school_graduation_table = pd.read_csv(url, na_values=['s'])
school_graduation_table = school_graduation_table[['DBN', 'School Name', 'Total Grads - % of cohort']].copy()
school_graduation_table = school_graduation_table.dropna()
school_graduation_table = school_graduation_table.rename(
    columns={'Total Grads - % of cohort': 'Graduation Rate'})
len(school_graduation_table)
Out[7]:
16704
In [8]:
school_graduation_table.dtypes
Out[8]:
DBN                 object
School Name         object
Graduation Rate    float64
dtype: object
In [9]:
school_graduation_table = school_graduation_table.groupby('DBN').mean()
school_graduation_table[:5]
Out[9]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Graduation Rate
DBN
01M292 61.755556
01M448 58.637736
01M450 70.462500
01M509 49.902381
01M515 49.151064

Merge School Locations by School Code

  1. Go to the page for your dataset (e.g. https://data.cityofnewyork.us/Education/2017-2018-School-Locations/p6h4-mpyy)
  2. Click API
  3. Copy the CSV API Endpoint (e.g. https://data.cityofnewyork.us/resource/r2nx-nhxe.csv)
In [10]:
endpoint_url = 'https://data.cityofnewyork.us/resource/r2nx-nhxe.csv'
In [11]:
# Load schools
school_location_table = load(
    endpoint_url,
    buffer_size=1000)
school_location_table[:5]
https://data.cityofnewyork.us/resource/r2nx-nhxe.csv?$limit=1000
1000
1823
Out[11]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
:@computed_region_92fq_4b7q :@computed_region_efsh_h5xi :@computed_region_f5dn_yrer :@computed_region_sbqj_enih :@computed_region_yeji_bk3q admin_district_location_code administrative_district_name ats_system_code beds_number borough_block_lot ... primary_building_code principal_name principal_phone_number principal_title school_support_team_leader_name school_support_team_name state_code status_descriptions x_coordinate y_coordinate
0 50.0 11729.0 70.0 5.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M015 310100010015 1003740020 ... M015 IRENE SANCHEZ 212-228-8730 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 990141.0 202349.0
1 50.0 11724.0 70.0 5.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M019 310100010019 1004530034 ... M019 JACQUELINE FLANAGAN 212-533-5340 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 988547.0 205239.0
2 32.0 11723.0 70.0 4.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M020 310100010020 1003550001 ... M020 SARAH PINTO VIAGRAN 212-254-9577 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 988044.0 202068.0
3 50.0 11729.0 70.0 5.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M034 310100010034 1003810038 ... M034 ANGELIKI LOUKATOS 212-228-4433 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 991163.0 203782.0
4 50.0 11729.0 70.0 5.0 4.0 M801 COMMUNITY SCHOOL DISTRICT 01 01M063 310100010063 1004310014 ... M063 DARLENE CAMERON 212-674-3180 PRINCIPAL NaN School Support Team 3- Manhattan NY Open 988071.0 203210.0
<p>5 rows × 48 columns</p>
In [12]:
school_location_table.iloc[0]
Out[12]:
:@computed_region_92fq_4b7q                                                         50
:@computed_region_efsh_h5xi                                                      11729
:@computed_region_f5dn_yrer                                                         70
:@computed_region_sbqj_enih                                                          5
:@computed_region_yeji_bk3q                                                          4
admin_district_location_code                                                      M801
administrative_district_name                              COMMUNITY SCHOOL DISTRICT 01
ats_system_code                                                           01M015      
beds_number                                                               310100010015
borough_block_lot                                                           1003740020
census_tract                                                                      2601
community_district                                                                 103
community_school_sup_name                                           PHILLIPS, DANIELLA
council_district                                                                     2
fax_number                                                                212-477-0931
field_support_center_leader_name                                             CHU, YUET
field_support_center_name                             Field Support Center - Manhattan
fiscal_year                                                                       2018
geographical_district_code                                                           1
grades_final_text                                                 PK,0K,01,02,03,04,05
grades_text                                                    PK,0K,01,02,03,04,05,SE
highschool_network_location_code                                                   NaN
highschool_network_name                                                            NaN
highschool_network_superintendent                                                  NaN
location_1                                                POINT (-73.978747 40.722075)
location_1_address                                                   333 EAST 4 STREET
location_1_city                                                              MANHATTAN
location_1_state                                                                    NY
location_1_zip                                                                   10009
location_category_description                                               Elementary
location_code                                                                     M015
location_name                                                P.S. 015 Roberto Clemente
location_type_description                                             General Academic
managed_by_name                                                                    DOE
nta                                                                               MN28
nta_name                             Lower East Side                               ...
open_date                                                      1904-07-01T00:00:00.000
primary_address_line_1                                              333 EAST  4 STREET
primary_building_code                                                             M015
principal_name                                                           IRENE SANCHEZ
principal_phone_number                                                    212-228-8730
principal_title                                                              PRINCIPAL
school_support_team_leader_name                                                    NaN
school_support_team_name                              School Support Team 3- Manhattan
state_code                                                                          NY
status_descriptions                                                               Open
x_coordinate                                                                    990141
y_coordinate                                                                    202349
Name: 0, dtype: object
In [13]:
school_location_table.iloc[0]['location_1']
Out[13]:
'POINT (-73.978747 40.722075)'
In [14]:
school_location_table.iloc[0]['ats_system_code']
Out[14]:
'01M015      '
In [15]:
school_location_table['DBN'] = school_location_table['ats_system_code'].str.strip()
In [16]:
school_location_table.iloc[0]['DBN']
Out[16]:
'01M015'
In [17]:
school_location_table = school_location_table.rename(columns={
    'location_1': 'WKT',
    'location_name': 'School Name',
})
In [18]:
trimmed_school_location_table = school_location_table[[
    'DBN',
    'WKT',
    'School Name',
]]
In [19]:
school_table = pd.merge(
    trimmed_school_location_table,
    school_graduation_table,
    left_on='DBN',
    right_on='DBN')
len(school_table)
Out[19]:
323
In [20]:
school_table.iloc[0]
Out[20]:
DBN                                      01M292
WKT                POINT (-73.986051 40.713362)
School Name          Orchard Collegiate Academy
Graduation Rate                         61.7556
Name: 0, dtype: object

Convert Spatial Reference If Needed

In [21]:
from geotable.projections import get_transform_shapely_geometry, LONGITUDE_LATITUDE_PROJ4
source_proj4 = LONGITUDE_LATITUDE_PROJ4
# http://spatialreference.org/ref/epsg/2263/
target_proj4 = '+proj=lcc +lat_1=41.03333333333333 +lat_2=40.66666666666666 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000.0000000001 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs '
f = get_transform_shapely_geometry(source_proj4, target_proj4)
# If you get an error here, try shutting down other running notebooks and run this code block again
In [22]:
from shapely import wkt
geometry = wkt.loads(school_table.iloc[0]['WKT'])
geometry.wkt
Out[22]:
'POINT (-73.986051 40.713362)'
In [23]:
f(geometry).wkt
Out[23]:
'POINT (988117.0150419661 199173.8250251808)'
In [24]:
school_location_table.iloc[0][['x_coordinate', 'y_coordinate']]
Out[24]:
x_coordinate    990141
y_coordinate    202349
Name: 0, dtype: object
In [25]:
from shapely.geometry import Point
Point(988117, 199174).wkt
Out[25]:
'POINT (988117 199174)'

Load Trees

  1. Go to the page for your dataset (e.g. https://data.cityofnewyork.us/Environment/2015-Street-Tree-Census-Tree-Data/uvpi-gqnh)
  2. Click API
  3. Copy the CSV API Endpoint (e.g. https://data.cityofnewyork.us/resource/nwxe-4ae8.csv)
In [26]:
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'tree_dbh', 'latitude', 'longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
tree_table[:5]
https://data.cityofnewyork.us/resource/nwxe-4ae8.csv?$limit=100000&$select=tree_id,tree_dbh,latitude,longitude
100000
200000
300000
400000
500000
600000
683788
Out[26]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
latitude longitude tree_dbh tree_id
0 40.723092 -73.844215 3 180683
1 40.794111 -73.818679 21 200540
2 40.717581 -73.936608 3 204026
3 40.713537 -73.934456 10 204337
4 40.666778 -73.975979 21 189565

Explore Raw Data

See Columns

In [27]:
school_table.columns
Out[27]:
Index(['DBN', 'WKT', 'School Name', 'Graduation Rate'], dtype='object')
In [28]:
tree_table.columns
Out[28]:
Index(['latitude', 'longitude', 'tree_dbh', 'tree_id'], dtype='object')

See Rows

In [29]:
school_table[:3]
Out[29]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
DBN WKT School Name Graduation Rate
0 01M292 POINT (-73.986051 40.713362) Orchard Collegiate Academy 61.755556
1 01M448 POINT (-73.984128 40.712269) University Neighborhood High School 58.637736
2 01M450 POINT (-73.982472 40.729152) East Side Community School 70.462500
In [30]:
tree_table[:3]
Out[30]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
latitude longitude tree_dbh tree_id
0 40.723092 -73.844215 3 180683
1 40.794111 -73.818679 21 200540
2 40.717581 -73.936608 3 204026

See Specific Row

In [31]:
school_table.iloc[0]
Out[31]:
DBN                                      01M292
WKT                POINT (-73.986051 40.713362)
School Name          Orchard Collegiate Academy
Graduation Rate                         61.7556
Name: 0, dtype: object
In [32]:
tree_table.iloc[0]
Out[32]:
latitude         40.723092
longitude       -73.844215
tree_dbh          3.000000
tree_id      180683.000000
Name: 0, dtype: float64

Prepare Your Training Dataset

Decide What Constitutes One Sample in Your Training Dataset

Here we choose our individual sample to be a school. This means that each row of our dataset represents one school location.

Choose Target Variable You Want to Predict

We would like to predict graduation rate.

In [33]:
y = school_table['Graduation Rate']
y[:5]
Out[33]:
0    61.755556
1    58.637736
2    70.462500
3    49.902381
4    49.151064
Name: Graduation Rate, dtype: float64

Design Dummy Training Dataset

Let's design a dummy dataset to envision what our training dataset should look like.

In [34]:
dummy_table = pd.DataFrame([
    [10, 100, 5, 70],
    [20, 200, 3, 50],
    [30, 300, 4, 40],
], columns=[
    'Tree Count Within X Meters',
    'Sum of Tree Distances Within X Meters',
    'Average Tree Diameter in Inches Within X Meters',
    'Graduation Rate',
])
dummy_table
Out[34]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Tree Count Within X Meters Sum of Tree Distances Within X Meters Average Tree Diameter in Inches Within X Meters Graduation Rate
0 10 100 5 70
1 20 200 3 50
2 30 300 4 40

Choose Feature Variables You Will Use to Predict the Target Variable

  • Tree Count Within X Meters
  • Sum of Tree Distances Within X Meters
  • Average Tree Diameter in Inches Within X Meters
In [35]:
school_table[:2]
Out[35]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
DBN WKT School Name Graduation Rate
0 01M292 POINT (-73.986051 40.713362) Orchard Collegiate Academy 61.755556
1 01M448 POINT (-73.984128 40.712269) University Neighborhood High School 58.637736
In [36]:
tree_table[:2]
Out[36]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
latitude longitude tree_dbh tree_id
0 40.723092 -73.844215 3 180683
1 40.794111 -73.818679 21 200540
In [37]:
dataset_table = school_table.copy()

Get Tree Count

In [38]:
# Prepare kd tree of tree locations
tree_xys = tree_table[['longitude', 'latitude']].values
tree_xys[:3]
Out[38]:
array([[-73.84421522,  40.72309177],
       [-73.81867946,  40.79411067],
       [-73.9366077 ,  40.71758074]])
  • If you get -9 errors (or kernel has died errors) when running your notebook or previewing your tool, please shutdown other running notebooks and press ESC 0 0 to clear the memory and restart the kernel.
  • Alternatively, you can start your notebook session using SMALL or MEDIUM memory.
In [39]:
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM

"""
# Use the full tree when actually building your dataset
tree_tree = KDTree(
    tree_xys,
    distance_metric='Arc',
    radius=RADIUS_EARTH_KM * 1000)
"""
MAXIMUM_TREE_COUNT = 100000
# Build partial tree for fast demonstration purposes
partial_tree_tree = KDTree(
    tree_xys[:MAXIMUM_TREE_COUNT],
    distance_metric='Arc',
    radius=RADIUS_EARTH_KM * 1000)

tree_tree = partial_tree_tree
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/lib/weights/util.py:19: UserWarning: geopandas not available. Some functionality will be disabled.
  warn('geopandas not available. Some functionality will be disabled.')
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/model/spvcm/abstracts.py:10: UserWarning: The `dill` module is required to use the sqlite backend fully.
  from .sqlite import head_to_sql, start_sql
In [40]:
# Choose a reference location
from shapely import wkt
g = wkt.loads(school_table.iloc[0]['WKT'])
g.wkt
Out[40]:
'POINT (-73.986051 40.713362)'
In [41]:
xy = g.coords[0]
xy
Out[41]:
(-73.986051, 40.713362)
In [42]:
search_radius_in_meters = 10
In [43]:
distances, indices = tree_tree.query(
    xy, k=len(tree_xys),
    distance_upper_bound=search_radius_in_meters)
In [44]:
tree_count = len(tree_tree.data)
tree_count
Out[44]:
100000
In [45]:
radius_in_meters = 500

def get_tree_count(r):
    xy = wkt.loads(r['WKT']).coords[0]
    distances, indices = tree_tree.query(
        xy,
        k=tree_count,
        distance_upper_bound=radius_in_meters)
    return sum(indices < tree_count)
In [46]:
# It is always a good idea to test your function on a small piece of your dataset
partial_dataset_table = dataset_table[:5].copy()
partial_dataset_table[f'Tree Count Within {radius_in_meters} Meters'] = partial_dataset_table.apply(
    get_tree_count, axis=1)
In [47]:
partial_dataset_table[:5]
Out[47]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
DBN WKT School Name Graduation Rate Tree Count Within 500 Meters
0 01M292 POINT (-73.986051 40.713362) Orchard Collegiate Academy 61.755556 20
1 01M448 POINT (-73.984128 40.712269) University Neighborhood High School 58.637736 37
2 01M450 POINT (-73.982472 40.729152) East Side Community School 70.462500 241
3 01M509 POINT (-73.986038 40.72004) Marta Valle High School 49.902381 143
4 01M515 POINT (-73.986038 40.72004) Lower East Side Preparatory High School 49.151064 143

Get Sum of Distances

In [48]:
geometry = wkt.loads(dataset_table['WKT'][0])
geometry.wkt
Out[48]:
'POINT (-73.986051 40.713362)'
In [49]:
xy = geometry.coords[0]
xy
Out[49]:
(-73.986051, 40.713362)
In [50]:
distances, indices = tree_tree.query(
    xy,
    k=len(tree_tree.data),
    distance_upper_bound=radius_in_meters)
In [51]:
distances
Out[51]:
array([ 44.63534248, 118.61966985, 125.00502394, ...,          inf,
                inf,          inf])
In [52]:
indices
Out[52]:
array([ 69633,  23047,  14874, ..., 100000, 100000, 100000])
In [53]:
distances = distances[indices < tree_count]
distances
Out[53]:
array([ 44.63534248, 118.61966985, 125.00502394, 205.44290216,
       295.56264424, 302.08859339, 302.99611334, 380.16940535,
       381.09048355, 424.6032968 , 437.07794663, 439.03489318,
       439.48327268, 444.23723726, 474.60666939, 478.85715324,
       479.76247682, 489.07506133, 491.8548681 , 493.5907635 ])
In [54]:
def get_sum_distance(r):
    xy = wkt.loads(r['WKT']).coords[0]
    distances, indices = tree_tree.query(
        xy,
        k=tree_count,
        distance_upper_bound=radius_in_meters)
    return sum(distances[indices < tree_count])
In [55]:
# It is always a good idea to test your function on a small piece of your dataset
partial_dataset_table = dataset_table[:5].copy()
partial_dataset_table[f'Sum of Tree Distances Within {radius_in_meters} Meters'] = partial_dataset_table.apply(
    get_sum_distance, axis=1)
In [56]:
partial_dataset_table[:5]
Out[56]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
DBN WKT School Name Graduation Rate Sum of Tree Distances Within 500 Meters
0 01M292 POINT (-73.986051 40.713362) Orchard Collegiate Academy 61.755556 7247.793817
1 01M448 POINT (-73.984128 40.712269) University Neighborhood High School 58.637736 14736.467135
2 01M450 POINT (-73.982472 40.729152) East Side Community School 70.462500 83158.767475
3 01M509 POINT (-73.986038 40.72004) Marta Valle High School 49.902381 48503.391488
4 01M515 POINT (-73.986038 40.72004) Lower East Side Preparatory High School 49.151064 48503.391488

Get Average Tree Diameter

In [57]:
tree_index = tree_table.index
tree_index[:3]
Out[57]:
RangeIndex(start=0, stop=3, step=1)
In [58]:
tree_table[:3]
Out[58]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
latitude longitude tree_dbh tree_id
0 40.723092 -73.844215 3 180683
1 40.794111 -73.818679 21 200540
2 40.717581 -73.936608 3 204026
In [59]:
def get_average_diameter(r):
    xy = wkt.loads(r['WKT']).coords[0]
    distances, indices = tree_tree.query(
        xy,
        k=tree_count,
        distance_upper_bound=radius_in_meters)
    relative_indices = indices[indices < tree_count]
    relative_indices = [int(x) for x in relative_indices]
    tree_indices = tree_index[relative_indices]
    selected_tree_table = tree_table.loc[tree_indices]
    return selected_tree_table['tree_dbh'].mean()
In [60]:
# It is always a good idea to test your function on a small piece of your dataset
partial_dataset_table = dataset_table[:5].copy()
partial_dataset_table[
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_average_diameter, axis=1)
In [61]:
partial_dataset_table[:5]
Out[61]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
DBN WKT School Name Graduation Rate Average Tree Diameter in Inches Within 500 Meters
0 01M292 POINT (-73.986051 40.713362) Orchard Collegiate Academy 61.755556 7.450000
1 01M448 POINT (-73.984128 40.712269) University Neighborhood High School 58.637736 17.027027
2 01M450 POINT (-73.982472 40.729152) East Side Community School 70.462500 9.435685
3 01M509 POINT (-73.986038 40.72004) Marta Valle High School 49.902381 8.083916
4 01M515 POINT (-73.986038 40.72004) Lower East Side Preparatory High School 49.151064 8.083916

Normalize Variables

Typically you will have to normalize your variables. However, in this case our data is already normalized because each row is using the same search radius in meters.

Build and Save Partial Dataset

In [62]:
partial_dataset_table = dataset_table[:5].copy()

partial_dataset_table[
    f'Tree Count Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_tree_count, axis=1)

partial_dataset_table[
    f'Sum of Tree Distances Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_sum_distance, axis=1)

partial_dataset_table[
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_average_diameter, axis=1)

partial_dataset_table[:5]
Out[62]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
DBN WKT School Name Graduation Rate Tree Count Within 500 Meters Sum of Tree Distances Within 500 Meters Average Tree Diameter in Inches Within 500 Meters
0 01M292 POINT (-73.986051 40.713362) Orchard Collegiate Academy 61.755556 20 7247.793817 7.450000
1 01M448 POINT (-73.984128 40.712269) University Neighborhood High School 58.637736 37 14736.467135 17.027027
2 01M450 POINT (-73.982472 40.729152) East Side Community School 70.462500 241 83158.767475 9.435685
3 01M509 POINT (-73.986038 40.72004) Marta Valle High School 49.902381 143 48503.391488 8.083916
4 01M515 POINT (-73.986038 40.72004) Lower East Side Preparatory High School 49.151064 143 48503.391488 8.083916
In [63]:
partial_dataset_table = partial_dataset_table[[
    'DBN',
    'School Name',
    f'Tree Count Within {radius_in_meters} Meters',
    f'Sum of Tree Distances Within {radius_in_meters} Meters',
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters',
    'Graduation Rate',
]]
partial_dataset_table[:5]
Out[63]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
DBN School Name Tree Count Within 500 Meters Sum of Tree Distances Within 500 Meters Average Tree Diameter in Inches Within 500 Meters Graduation Rate
0 01M292 Orchard Collegiate Academy 20 7247.793817 7.450000 61.755556
1 01M448 University Neighborhood High School 37 14736.467135 17.027027 58.637736
2 01M450 East Side Community School 241 83158.767475 9.435685 70.462500
3 01M509 Marta Valle High School 143 48503.391488 8.083916 49.902381
4 01M515 Lower East Side Preparatory High School 143 48503.391488 8.083916 49.151064
In [64]:
partial_dataset_table.to_csv('/tmp/dataset.csv', index=False)

Train Model Using Full Dataset

If you are training your actual model, you will want to use the complete dataset. Note that preparing the full dataset might take several minutes.

In [65]:
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM

MAXIMUM_TREE_COUNT = 100000
# MAXIMUM_TREE_COUNT = len(tree_xys)  # Uncomment this line if you want to use the entire tree dataset

partial_tree_tree = KDTree(
    tree_xys[:MAXIMUM_TREE_COUNT],
    distance_metric='Arc',
    radius=RADIUS_EARTH_KM * 1000)

tree_tree = partial_tree_tree
In [66]:
MAXIMUM_SCHOOL_COUNT = 5
# MAXIMUM_SCHOOL_COUNT = len(dataset_table)  # Uncomment this line if you want to use the entire school dataset

partial_dataset_table = dataset_table[:MAXIMUM_SCHOOL_COUNT].copy()

partial_dataset_table[
    f'Tree Count Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_tree_count, axis=1)

partial_dataset_table[
    f'Sum of Tree Distances Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_sum_distance, axis=1)

partial_dataset_table[
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_average_diameter, axis=1)

partial_dataset_table[:5]
Out[66]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
DBN WKT School Name Graduation Rate Tree Count Within 500 Meters Sum of Tree Distances Within 500 Meters Average Tree Diameter in Inches Within 500 Meters
0 01M292 POINT (-73.986051 40.713362) Orchard Collegiate Academy 61.755556 20 7247.793817 7.450000
1 01M448 POINT (-73.984128 40.712269) University Neighborhood High School 58.637736 37 14736.467135 17.027027
2 01M450 POINT (-73.982472 40.729152) East Side Community School 70.462500 241 83158.767475 9.435685
3 01M509 POINT (-73.986038 40.72004) Marta Valle High School 49.902381 143 48503.391488 8.083916
4 01M515 POINT (-73.986038 40.72004) Lower East Side Preparatory High School 49.151064 143 48503.391488 8.083916
In [67]:
X = partial_dataset_table[[
    f'Tree Count Within {radius_in_meters} Meters',
    f'Sum of Tree Distances Within {radius_in_meters} Meters',
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters',
]]
In [68]:
y = partial_dataset_table[
    'Graduation Rate'
]
In [69]:
from sklearn.linear_model import LinearRegression
model1 = LinearRegression()
model1.fit(X, y)
Out[69]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
In [70]:
model1.predict([
    [20, 7000, 10],
    [40, 14000, 12],
])
Out[70]:
array([47.57777572, 44.06493586])
In [71]:
from sklearn.model_selection import cross_val_score

score1 = cross_val_score(model1, X, y, cv=3, scoring='neg_mean_absolute_error').mean()
score1
Out[71]:
-16.84490804608365
In [72]:
from sklearn.svm import SVR
model2 = SVR(gamma='auto')
model2.fit(X, y)
Out[72]:
SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
In [73]:
model2.predict([
    [20, 7000, 10],
    [40, 14000, 12],
])
Out[73]:
array([58.63773585, 58.63773585])
In [74]:
from sklearn.model_selection import cross_val_score

score2 = cross_val_score(model2, X, y, cv=3, scoring='neg_mean_absolute_error').mean()
score2
Out[74]:
-9.706635382084498