Given a spatial NYC dataset, return an augmented dataset with additional columns for tree statistics as taken from the 2015 Street Tree Census. You can use the tree statistics as feature variables to train a fun predictive model.
{ source_geotable : Source NYC Spatial Dataset ? Make sure that your CSV has a WKT column that defines the geometry of each row }
{ point_search_radius_in_meters : Point Search Radius in Meters ? Search for trees using this search distance (only for points, does not affect polygons) }
{ source_sample_count: Source Sample Count ? Set to -1 to use the full source dataset }
{ tree_sample_count : Tree Sample Count ? Set to -1 to use the full tree dataset }
# CrossCompute
source_geotable_path = 'geometry.csv'
point_search_radius_in_meters = 100
tree_sample_count = 100000
source_sample_count = 10
target_folder = '/tmp'
import geotable
source_geotable = geotable.load(source_geotable_path)
len(source_geotable)
utm_proj4 = geotable.load_utm_proj4(source_geotable_path)
utm_geotable = geotable.load(source_geotable_path, target_proj4=utm_proj4)
utm_geotable['Area in Sq Km'] = utm_geotable['geometry_object'].apply(lambda g: g.area / 1000000)
source_geotable['Area in Sq Km'] = utm_geotable['Area in Sq Km']
source_geotable
import pandas as pd
def load(
endpoint_url,
selected_columns=None,
buffer_size=1000,
search_term_by_column=None,
**kw,
):
buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
if selected_columns:
select_string = ','.join(selected_columns)
buffer_url += f'&$select={select_string}'
for column, search_term in (search_term_by_column or {}).items():
buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
print(buffer_url)
tables = []
if endpoint_url.endswith('.json'):
f = pd.read_json
else:
f = pd.read_csv
t = f(buffer_url, **kw)
while len(t):
print(len(tables) * buffer_size + len(t))
tables.append(t)
offset = buffer_size * len(tables)
t = f(buffer_url + f'&$offset={offset}', **kw)
return pd.concat(tables, ignore_index=True, sort=False)
endpoint_url = 'https://data.cityofnewyork.us/resource/uvpi-gqnh.csv'
selected_columns = 'tree_id', 'tree_dbh', 'latitude', 'longitude'
buffer_size = 100000
tree_table = load(endpoint_url, selected_columns, buffer_size)
tree_table[:5]
tree_table.dropna(subset=['longitude', 'latitude'], inplace=True)
len(tree_table)
if tree_sample_count < 0:
tree_sample_count = len(tree_table)
tree_sample_count
import numpy as np
import random
sample_tree_labels = random.sample(list(tree_table.index), tree_sample_count)
sample_tree_labels = np.array(sample_tree_labels)
sample_tree_labels[:3]
sample_tree_table = tree_table.loc[sample_tree_labels]
sample_tree_table[:3]
sample_tree_xys = sample_tree_table[['longitude', 'latitude']].values
sample_tree_xys[:3]
from shapely.geometry import MultiPoint, Point
sample_tree_points = [Point(_) for _ in sample_tree_xys]
sample_tree_collection = MultiPoint(sample_tree_points)
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM
kd_tree = KDTree(
sample_tree_xys,
distance_metric='Arc',
radius=RADIUS_EARTH_KM * 1000)
def query_kd_tree(g, k=tree_sample_count):
xy = g.coords[0]
distances, indices = kd_tree.query(
xy,
k=k,
distance_upper_bound=point_search_radius_in_meters)
if k == 1:
relative_distance = distances
relative_index = indices
return [relative_distance], [relative_index]
relative_distances = distances[indices < k]
relative_indices = indices[indices < k]
relative_indices = [int(_) for _ in relative_indices]
return relative_distances, relative_indices
from shapely.geometry import Point
def get_tree_count(r):
g = r['geometry_object']
if g.area:
selected_tree_collection = g.intersection(sample_tree_collection)
if selected_tree_collection.is_empty:
return 0
if isinstance(selected_tree_collection, Point):
return 1
return len(selected_tree_collection)
relative_distances, relative_indices = query_kd_tree(g)
return len(relative_indices)
from geopy.distance import distance as get_distance
def get_geodesic_distance_in_meters(g1, g2):
latitude_longitude1 = g1.centroid.coords[:2][::-1]
latitude_longitude2 = g2.centroid.coords[:2][::-1]
return get_distance(
latitude_longitude1,
latitude_longitude2).meters
def get_sum_distance(r):
g = r['geometry_object']
if g.area:
selected_tree_collection = g.intersection(sample_tree_collection)
if selected_tree_collection.is_empty:
return 0
g_centroid = g.centroid
return sum(get_geodesic_distance_in_meters(
g_centroid, _) for _ in selected_tree_collection)
relative_distances, relative_indices = query_kd_tree(g)
return sum(relative_distances)
import numpy as np
def get_average_diameter_from(relative_indices):
tree_labels = sample_tree_labels[relative_indices]
selected_tree_table = tree_table.loc[tree_labels]
return selected_tree_table['tree_dbh'].mean()
def get_average_diameter(r):
g = r['geometry_object']
if g.area:
selected_tree_collection = g.intersection(sample_tree_collection)
if selected_tree_collection.is_empty:
return 0
if isinstance(selected_tree_collection, Point):
selected_tree_collection = [selected_tree_collection]
values = []
for g in selected_tree_collection:
relative_distances, relative_indices = query_kd_tree(g, k=1)
values.append(get_average_diameter_from(relative_indices))
return np.mean(values)
relative_distances, relative_indices = query_kd_tree(g)
return get_average_diameter_from(relative_indices)
if source_sample_count < 0:
source_sample_count = len(source_geotable)
target_geotable = source_geotable[:source_sample_count].copy()
target_geotable[
'Tree Count'
] = target_geotable.apply(get_tree_count, axis=1)
target_geotable[
'Tree Count Per Sq Km'
] = target_geotable['Tree Count'] / target_geotable['Area in Sq Km']
target_geotable[
'Tree Distance Sum'
] = target_geotable.apply(get_sum_distance, axis=1)
target_geotable[
'Tree Distance Sum Per Sq Km'
] = target_geotable['Tree Distance Sum'] / target_geotable['Area in Sq Km']
target_geotable[
'Average Tree Diameter'
] = target_geotable.apply(get_average_diameter, axis=1)
target_geotable[:5]
target_path = target_folder + '/augmented-dataset.csv'
target_geotable.to_csv(target_path, index=False)
print(f'augmented_table_path = {target_path}')
target_path = target_folder + '/augmented-dataset-map.csv'
map_geotable = target_geotable.copy()
map_geotable['FillGreens'] = map_geotable['Tree Count']
map_geotable.to_csv(target_path, index=False)
print(f'augmented_geotable_path = {target_path}')
{ augmented_table : Dataset with Tree Statistics ? Your dataset with tree statistics }
{ augmented_geotable : Map with Tree Statistics ? Your dataset with tree statistics on a map }