import pandas as pd
def load(
        endpoint_url,
        selected_columns=None,
        buffer_size=1000,
        search_term_by_column=None,
        **kw,
):
    buffer_url = f'{endpoint_url}?$limit={buffer_size}'
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    # Combine filters into a single $where clause; SoQL string literals
    # are single-quoted and %25 is the URL-encoded % wildcard
    where_clauses = [
        f"{column}+like+'%25{search_term}%25'"
        for column, search_term in (search_term_by_column or {}).items()]
    if where_clauses:
        buffer_url += '&$where=' + '+AND+'.join(where_clauses)
    print(buffer_url)
    tables = []
    if endpoint_url.endswith('.json'):
        f = pd.read_json
    else:
        f = pd.read_csv
    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, ignore_index=True, sort=False)
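For example, here is a minimal sketch of a filtered call. The spc_common column and the 'oak' search term are illustrative assumptions, not part of the datasets used below.
# Sketch: page through a filtered subset of a Socrata endpoint
# (assumes the endpoint exposes a spc_common species column)
oak_table = load(
    'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv',
    selected_columns=['tree_id', 'spc_common'],
    search_term_by_column={'spc_common': 'oak'})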
url = 'https://data.cityofnewyork.us/api/views/vh2h-md7a/rows.csv'
school_graduation_table = pd.read_csv(url)
len(school_graduation_table)
school_graduation_table.iloc[0]
school_graduation_table.groupby('DBN').mean(numeric_only=True)[:5]
school_graduation_table = school_graduation_table[['DBN', 'School Name', 'Total Grads - % of cohort']].copy()
len(school_graduation_table)
school_graduation_table = school_graduation_table.dropna()
sum(school_graduation_table['Total Grads - % of cohort'] == 's')
# Define which values mean null
school_graduation_table = pd.read_csv(url, na_values=['s'])
school_graduation_table = school_graduation_table[['DBN', 'School Name', 'Total Grads - % of cohort']].copy()
school_graduation_table = school_graduation_table.dropna()
school_graduation_table = school_graduation_table.rename(
    columns={'Total Grads - % of cohort': 'Graduation Rate'})
len(school_graduation_table)
school_graduation_table.dtypes
school_graduation_table = school_graduation_table.groupby('DBN').mean(numeric_only=True)
school_graduation_table[:5]
endpoint_url = 'https://data.cityofnewyork.us/resource/r2nx-nhxe.csv'
# Load schools
school_location_table = load(
    endpoint_url,
    buffer_size=1000)
school_location_table[:5]
school_location_table.iloc[0]
school_location_table.iloc[0]['location_1']
school_location_table.iloc[0]['ats_system_code']
school_location_table['DBN'] = school_location_table['ats_system_code'].str.strip()
school_location_table.iloc[0]['DBN']
school_location_table = school_location_table.rename(columns={
    'location_1': 'WKT',
    'location_name': 'School Name',
})
trimmed_school_location_table = school_location_table[[
    'DBN',
    'WKT',
    'School Name',
]]
school_table = pd.merge(
    trimmed_school_location_table,
    school_graduation_table,
    left_on='DBN',
    right_on='DBN')
len(school_table)
school_table.iloc[0]
from geotable.projections import get_transform_shapely_geometry, LONGITUDE_LATITUDE_PROJ4
source_proj4 = LONGITUDE_LATITUDE_PROJ4
# http://spatialreference.org/ref/epsg/2263/
target_proj4 = '+proj=lcc +lat_1=41.03333333333333 +lat_2=40.66666666666666 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000.0000000001 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs '
f = get_transform_shapely_geometry(source_proj4, target_proj4)
# If you get an error here, try shutting down other running notebooks and run this code block again
from shapely import wkt
geometry = wkt.loads(school_table.iloc[0]['WKT'])
geometry.wkt
f(geometry).wkt
school_location_table.iloc[0][['x_coordinate', 'y_coordinate']]
from shapely.geometry import Point
Point(988117, 199174).wkt
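As a rough sanity check, the reprojected point should land near the table's own state plane coordinates, assuming the x_coordinate and y_coordinate columns are also in EPSG 2263 (feet).
# Sketch: compare the reprojected point against the stored coordinates
r = school_location_table.iloc[0]
projected_point = f(wkt.loads(r['WKT']))
state_plane_point = Point(float(r['x_coordinate']), float(r['y_coordinate']))
projected_point.distance(state_plane_point)  # a small distance, in feet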
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'tree_dbh', 'latitude', 'longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
tree_table[:5]
school_table.columns
tree_table.columns
school_table[:3]
tree_table[:3]
school_table.iloc[0]
tree_table.iloc[0]
Here we choose our individual sample to be a school. This means that each row of our dataset represents one school location.
We would like to predict graduation rate.
y = school_table['Graduation Rate']
y[:5]
Let's design a dummy dataset to envision what our training dataset should look like.
dummy_table = pd.DataFrame([
    [10, 100, 5, 70],
    [20, 200, 3, 50],
    [30, 300, 4, 40],
], columns=[
    'Tree Count Within X Meters',
    'Sum of Tree Distances Within X Meters',
    'Average Tree Diameter in Inches Within X Meters',
    'Graduation Rate',
])
dummy_table
school_table[:2]
tree_table[:2]
dataset_table = school_table.copy()
# Prepare kd tree of tree locations
tree_xys = tree_table[['longitude', 'latitude']].values
tree_xys[:3]
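If the endpoint ever returns rows without coordinates, building the KDTree below would fail on NaN values. This defensive step is a no-op when every row has coordinates, and it keeps tree_table and tree_xys positionally aligned.
# Defensive sketch: drop rows lacking coordinates before building the KDTree
tree_table = tree_table.dropna(subset=['longitude', 'latitude'])
tree_xys = tree_table[['longitude', 'latitude']].values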
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM
"""
# Use the full tree when actually building your dataset
tree_tree = KDTree(
tree_xys,
distance_metric='Arc',
radius=RADIUS_EARTH_KM * 1000)
"""
MAXIMUM_TREE_COUNT = 100000
# Build partial tree for fast demonstration purposes
partial_tree_tree = KDTree(
tree_xys[:MAXIMUM_TREE_COUNT],
distance_metric='Arc',
radius=RADIUS_EARTH_KM * 1000)
tree_tree = partial_tree_tree
# Choose a reference location
from shapely import wkt
g = wkt.loads(school_table.iloc[0]['WKT'])
g.wkt
xy = g.coords[0]
xy
search_radius_in_meters = 10
distances, indices = tree_tree.query(
    xy, k=len(tree_tree.data),
    distance_upper_bound=search_radius_in_meters)
tree_count = len(tree_tree.data)
tree_count
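When fewer than k points fall within distance_upper_bound, the scipy-backed KDTree pads its results: padded slots get a distance of infinity and an index equal to the number of points in the tree. That is why the helper functions below filter with indices < tree_count.
import numpy as np
# Real matches have index < tree_count; padded slots have infinite distance
(indices < tree_count).sum(), np.isinf(distances).sum()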
radius_in_meters = 500
def get_tree_count(r):
    xy = wkt.loads(r['WKT']).coords[0]
    distances, indices = tree_tree.query(
        xy,
        k=tree_count,
        distance_upper_bound=radius_in_meters)
    # Padded (unmatched) slots have index == tree_count,
    # so this counts only the real neighbors within the radius
    return sum(indices < tree_count)
# It is always a good idea to test your function on a small piece of your dataset
partial_dataset_table = dataset_table[:5].copy()
partial_dataset_table[f'Tree Count Within {radius_in_meters} Meters'] = partial_dataset_table.apply(
    get_tree_count, axis=1)
partial_dataset_table[:5]
geometry = wkt.loads(dataset_table['WKT'][0])
geometry.wkt
xy = geometry.coords[0]
xy
distances, indices = tree_tree.query(
    xy,
    k=len(tree_tree.data),
    distance_upper_bound=radius_in_meters)
distances
indices
distances = distances[indices < tree_count]
distances
def get_sum_distance(r):
    xy = wkt.loads(r['WKT']).coords[0]
    distances, indices = tree_tree.query(
        xy,
        k=tree_count,
        distance_upper_bound=radius_in_meters)
    # Keep only real neighbors; padded slots have infinite distance
    return sum(distances[indices < tree_count])
# It is always a good idea to test your function on a small piece of your dataset
partial_dataset_table = dataset_table[:5].copy()
partial_dataset_table[f'Sum of Tree Distances Within {radius_in_meters} Meters'] = partial_dataset_table.apply(
    get_sum_distance, axis=1)
partial_dataset_table[:5]
tree_index = tree_table.index
tree_index[:3]
tree_table[:3]
def get_average_diameter(r):
    xy = wkt.loads(r['WKT']).coords[0]
    distances, indices = tree_tree.query(
        xy,
        k=tree_count,
        distance_upper_bound=radius_in_meters)
    # Convert positional KDTree indices into the table's index labels
    relative_indices = indices[indices < tree_count]
    relative_indices = [int(x) for x in relative_indices]
    tree_indices = tree_index[relative_indices]
    selected_tree_table = tree_table.loc[tree_indices]
    return selected_tree_table['tree_dbh'].mean()
# It is always a good idea to test your function on a small piece of your dataset
partial_dataset_table = dataset_table[:5].copy()
partial_dataset_table[
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_average_diameter, axis=1)
partial_dataset_table[:5]
Typically you will need to normalize your variables before fitting a model. In this case, however, the features are already comparable across rows because every row uses the same search radius in meters.
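If the columns did need rescaling, one common option is scikit-learn's StandardScaler. Here is a minimal sketch, assuming X is a numeric feature matrix like the one assembled below.
from sklearn.preprocessing import StandardScaler
# Sketch: rescale each feature column to zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)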
partial_dataset_table = dataset_table[:5].copy()
partial_dataset_table[
    f'Tree Count Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_tree_count, axis=1)
partial_dataset_table[
    f'Sum of Tree Distances Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_sum_distance, axis=1)
partial_dataset_table[
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_average_diameter, axis=1)
partial_dataset_table[:5]
partial_dataset_table = partial_dataset_table[[
    'DBN',
    'School Name',
    f'Tree Count Within {radius_in_meters} Meters',
    f'Sum of Tree Distances Within {radius_in_meters} Meters',
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters',
    'Graduation Rate',
]]
partial_dataset_table[:5]
partial_dataset_table.to_csv('/tmp/dataset.csv', index=False)
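To confirm that the saved file round-trips cleanly, read it back.
pd.read_csv('/tmp/dataset.csv')[:5]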
If you are training your actual model, you will want to use the complete dataset. Note that preparing the full dataset might take several minutes.
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM
MAXIMUM_TREE_COUNT = 100000
# MAXIMUM_TREE_COUNT = len(tree_xys) # Uncomment this line if you want to use the entire tree dataset
partial_tree_tree = KDTree(
    tree_xys[:MAXIMUM_TREE_COUNT],
    distance_metric='Arc',
    radius=RADIUS_EARTH_KM * 1000)
tree_tree = partial_tree_tree
MAXIMUM_SCHOOL_COUNT = 5
# MAXIMUM_SCHOOL_COUNT = len(dataset_table) # Uncomment this line if you want to use the entire school dataset
partial_dataset_table = dataset_table[:MAXIMUM_SCHOOL_COUNT].copy()
partial_dataset_table[
    f'Tree Count Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_tree_count, axis=1)
partial_dataset_table[
    f'Sum of Tree Distances Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_sum_distance, axis=1)
partial_dataset_table[
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters'
] = partial_dataset_table.apply(
    get_average_diameter, axis=1)
partial_dataset_table[:5]
X = partial_dataset_table[[
    f'Tree Count Within {radius_in_meters} Meters',
    f'Sum of Tree Distances Within {radius_in_meters} Meters',
    f'Average Tree Diameter in Inches Within {radius_in_meters} Meters',
]]
y = partial_dataset_table['Graduation Rate']
from sklearn.linear_model import LinearRegression
model1 = LinearRegression()
model1.fit(X, y)
model1.predict([
    [20, 7000, 10],
    [40, 14000, 12],
])
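scikit-learn may warn that these lists lack feature names because the model was fitted on a DataFrame. One way to keep the names consistent is to wrap the rows in a DataFrame, as in this sketch.
# Optional sketch: predict with a DataFrame so feature names match training
model1.predict(pd.DataFrame([
    [20, 7000, 10],
    [40, 14000, 12],
], columns=X.columns))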
from sklearn.model_selection import cross_val_score
score1 = cross_val_score(model1, X, y, cv=3, scoring='neg_mean_absolute_error').mean()
score1
from sklearn.svm import SVR
model2 = SVR(gamma='auto')
model2.fit(X, y)
model2.predict([
    [20, 7000, 10],
    [40, 14000, 12],
])
from sklearn.model_selection import cross_val_score
score2 = cross_val_score(model2, X, y, cv=3, scoring='neg_mean_absolute_error').mean()
score2
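Because neg_mean_absolute_error is a negative score, the value closer to zero indicates the lower average error.
# Higher (closer to zero) score means lower mean absolute error
'model1' if score1 > score2 else 'model2'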