KzO9K




Pay Notebook Creator: Rocky Singh0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [1]:
url = 'https://data.cityofnewyork.us/resource/5rq2-4hqu.csv'
In [ ]:
 
In [2]:
import pandas as pd
In [3]:
pd.read_csv(url + '?$select=count(*)')
Out[3]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
count
0 683788
In [4]:
t = pd.read_csv(url + '?$limit=1')
t.iloc[0]
Out[4]:
address                                   108-005 70 AVENUE
block_id                                             348711
boro_ct                                             4073900
borocode                                                  4
boroname                                             Queens
brnch_ligh                                               No
brnch_othe                                               No
brnch_shoe                                               No
cb_num                                                  406
cncldist                                                 29
created_at                                       08/27/2015
curb_loc                                             OnCurb
guards                                                 None
health                                                 Fair
latitude                                            40.7231
longitude                                          -73.8442
nta                                                    QN17
nta_name                                       Forest Hills
problems                                               None
root_grate                                               No
root_other                                               No
root_stone                                               No
sidewalk                                           NoDamage
spc_common                                        red maple
spc_latin                                       Acer rubrum
st_assem                                                 28
st_senate                                                16
state                                              New York
status                                                Alive
steward                                                None
stump_diam                                                0
the_geom      POINT (-73.84421521958048 40.723091773924274)
tree_dbh                                                  3
tree_id                                              180683
trnk_light                                               No
trnk_other                                               No
trnk_wire                                                No
user_type                                  TreesCount Staff
x_sp                                            1.02743e+06
y_sp                                                 202757
zip_city                                       Forest Hills
zipcode                                               11375
Name: 0, dtype: object
In [5]:
# Import Tree data set 
In [6]:
# Also import air Pol dataset
In [7]:
import pandas as pd

def load(
    endpoint_url,
    selected_columns=None,
    buffer_size=1000,
    search_term_by_column=None,
    **kw,
):
    buffer_url = (f'{endpoint_url}?$limit={buffer_size}')
    if selected_columns:
        select_string = ','.join(selected_columns)
        buffer_url += f'&$select={select_string}'
    for column, search_term in (search_term_by_column or {}).items():
        buffer_url += f'&$where={column}+like+"%25{search_term}%25"'
    print(buffer_url)
    tables = []
    
    if endpoint_url.endswith('.json'):
        f = pd.read_json
    else:
        f = pd.read_csv

    t = f(buffer_url, **kw)
    while len(t):
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, ignore_index=True, sort=False)
In [8]:
# endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
endpoint_url = url
selected_columns = 'tree_id', 'latitude', 'longitude'
buffer_size = 100000
t = load(endpoint_url, selected_columns, buffer_size)
https://data.cityofnewyork.us/resource/5rq2-4hqu.csv?$limit=100000&$select=tree_id,latitude,longitude
100000
200000
300000
400000
500000
600000
683788
In [9]:
tree_table = t
In [10]:
tree_table[:3]
Out[10]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
latitude longitude tree_id
0 40.723092 -73.844215 180683
1 40.794111 -73.818679 200540
2 40.717581 -73.936608 204026
In [11]:
import gzip
from invisibleroads_macros.disk import uncompress
from os.path import exists
from urllib.request import urlretrieve

def download(target_path, source_url):
    if not exists(target_path):
        urlretrieve(source_url, target_path)    
    return target_path

def download_zip(target_folder, source_url):
    archive_path = download(target_folder + '.zip', source_url)
    return uncompress(archive_path, target_folder)
            
def download_gz(target_path, source_url):
    archive_path = download(target_path + '.gz', source_url)
    with gzip.open(archive_path, 'rb') as f:
        open(target_path, 'wb').write(f.read())
    return target_path
In [12]:
# Load air pollution raster
source_url = (
    'https://data.cityofnewyork.us/download/'
    'q68s-8qxv/application%2Fzip')
source_folder = '/tmp/pollution'
download_zip(source_folder, source_url)
Out[12]:
'/tmp/pollution'
In [ ]:
 
In [13]:
# Load air pollution raster
source_url = (
    'https://data.cityofnewyork.us/download/'
    'q68s-8qxv/application%2Fzip')
source_folder = '/tmp/pollution'
download_zip(source_folder, source_url)
Out[13]:
'/tmp/pollution'
In [14]:
# Show nitrogen dioxide images
from glob import glob
sorted(glob('/tmp/pollution/*no2*.ovr'))
Out[14]:
['/tmp/pollution/aa2_no2300m.ovr',
 '/tmp/pollution/aa3_no2300m.ovr',
 '/tmp/pollution/aa4_no2300m.ovr',
 '/tmp/pollution/aa5_no2300m.ovr',
 '/tmp/pollution/aa6_no2300m.ovr',
 '/tmp/pollution/aa7_no2300m.ovr']
In [15]:
# Show nitrogen dioxide images in aa2
from glob import glob
sorted(glob('/tmp/pollution/aa2_no2300m/*'))
Out[15]:
['/tmp/pollution/aa2_no2300m/dblbnd.adf',
 '/tmp/pollution/aa2_no2300m/hdr.adf',
 '/tmp/pollution/aa2_no2300m/log',
 '/tmp/pollution/aa2_no2300m/metadata.xml',
 '/tmp/pollution/aa2_no2300m/prj.adf',
 '/tmp/pollution/aa2_no2300m/sta.adf',
 '/tmp/pollution/aa2_no2300m/w001001.adf',
 '/tmp/pollution/aa2_no2300m/w001001x.adf']
In [16]:
# Install packages
# If this keeps failing, try starting the notebook with more memory (e.g. MEDIUM or LARGE)
import subprocess
assert subprocess.call('pip install rasterio'.split()) == 0
In [17]:
# Load image
import rasterio
pollution_raster = rasterio.open('/tmp/pollution/aa2_no2300m/prj.adf')
In [18]:
# Get coordinate reference (defines how each pixel corresponds to a specific location)
pollution_proj4 = pollution_raster.crs.to_string()
pollution_proj4
Out[18]:
'PROJCS["unnamed",GEOGCS["Unknown datum based upon the GRS 1980 ellipsoid",DATUM["Not_specified_based_on_GRS_1980_ellipsoid",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6019"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4019"]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["standard_parallel_1",40.66666666666666],PARAMETER["standard_parallel_2",41.03333333333333],PARAMETER["latitude_of_origin",40.16666666666666],PARAMETER["central_meridian",-74],PARAMETER["false_easting",984250],PARAMETER["false_northing",0],UNIT["Foot_US",0.3048006096012192]]'
In [19]:
# Get coordinate reference (defines how each pixel corresponds to a specific location)
pollution_proj4 = pollution_raster.crs.to_proj4()
pollution_proj4
Out[19]:
'+proj=lcc +lat_1=40.66666666666666 +lat_2=41.03333333333333 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000 +y_0=0 +ellps=GRS80 +units=us-ft +no_defs=True'
In [20]:
# Load image array
x = pollution_raster.read()
In [21]:
x.shape
Out[21]:
(1, 155, 157)
In [22]:
pollution_raster.height, pollution_raster.width
Out[22]:
(155, 157)
In [23]:
# Get coordinates for specific pixel
pollution_raster.xy(pollution_raster.height // 2, pollution_raster.width // 2)
Out[23]:
(990501.3, 196319.36754292296)
In [24]:
x.max()
Out[24]:
46.87981
In [25]:
x.min()
Out[25]:
-3.4028235e+38
In [26]:
x.shape
Out[26]:
(1, 155, 157)
In [27]:
# Enable inline plots
%matplotlib inline
In [28]:
array = x[0]
array.shape
Out[28]:
(155, 157)
In [29]:
# from matplotlib.colors import Normalize
# normalize = Normalize(vmin=x.min(), vmax=x.max())
In [30]:
import matplotlib.pyplot as plt
plt.imshow(array, vmin=0);
In [31]:
import geotable 
In [32]:
# Convert neighborhood geometries into pollution coordinate reference
source_proj4 = neighborhood_proj4 = geotable.LONGITUDE_LATITUDE_PROJ4
target_proj4 = pollution_proj4
from geotable.projections import get_transform_shapely_geometry
f = get_transform_shapely_geometry(source_proj4, target_proj4)
In [33]:
# get zipcode gem
# for each zipcode gem, get the tree count
# sum the air poll.
#that will be our data set table
In [34]:
# get zipcode gem
In [35]:
url = 'https://data.cityofnewyork.us/download/i8iw-xf4u/application%2Fzip'
In [36]:
# # Get UTM projection
# import geotable
# utm_proj4 = geotable.load_utm_proj4(url)
In [37]:
# # Get zipcode area in square meters
# nyc_zipcode_table = geotable.load(url, target_proj4=utm_proj4)
# nyc_zipcode_table.iloc[0]['geometry_object'].area
In [38]:
from geotable.projections import LONGITUDE_LATITUDE_PROJ4
In [ ]:
 
In [39]:
nyc_zipcode_table = geotable.load(url, target_proj4=LONGITUDE_LATITUDE_PROJ4)
In [40]:
nyc_zipcode_table[:3]
Out[40]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
ZIPCODE BLDGZIP PO_NAME POPULATION AREA STATE COUNTY ST_FIPS CTY_FIPS URL SHAPE_AREA SHAPE_LEN geometry_object geometry_layer geometry_proj4
0 11436 0 Jamaica 18681.0 2.269930e+07 NY Queens 36 081 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.80584847647393 40.68290932644246... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs
1 11213 0 Brooklyn 62426.0 2.963100e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.9373976313981 40.6797295892508, ... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs
2 11212 0 Brooklyn 83866.0 4.197210e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.90294132545436 40.67083977590008... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs
In [60]:
utm_proj4 = geotable.load_utm_proj4(url)
utm_geotable = geotable.load(url, target_proj4=utm_proj4)
utm_geotable['Area in Sq Km'] = utm_geotable['geometry_object'].apply(lambda g: g.area / 1000000)
nyc_zipcode_table['Area in Sq Km'] = utm_geotable['Area in Sq Km']
nyc_zipcode_table
Out[60]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
ZIPCODE BLDGZIP PO_NAME POPULATION AREA STATE COUNTY ST_FIPS CTY_FIPS URL SHAPE_AREA SHAPE_LEN geometry_object geometry_layer geometry_proj4 Area in Sq Km
0 11436 0 Jamaica 18681.0 2.269930e+07 NY Queens 36 081 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.80584847647393 40.68290932644246... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.107694
1 11213 0 Brooklyn 62426.0 2.963100e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.9373976313981 40.6797295892508, ... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.751168
2 11212 0 Brooklyn 83866.0 4.197210e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.90294132545436 40.67083977590008... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.897041
3 11225 0 Brooklyn 56527.0 2.369863e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.9579731604348 40.67065695897565,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.200345
4 11218 0 Brooklyn 72280.0 3.686880e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.97208109564255 40.65059658727607... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.423119
5 11226 0 Brooklyn 106132.0 3.940860e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.96190027968613 40.65487064531367... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.658957
6 11219 0 Brooklyn 92561.0 4.200274e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98905871487798 40.64411892870269... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.899752
7 11210 0 Brooklyn 67067.0 4.788702e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.95839960027368 40.63632662546082... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 4.446157
8 11230 0 Brooklyn 80857.0 4.992670e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.96451268730159 40.63668725312468... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 4.635499
9 11204 0 Brooklyn 77354.0 4.355518e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98108387453883 40.63528985591707... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 4.043899
10 10471 0 Bronx 23477.0 8.965141e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.88191900199823 40.90666565537614... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 8.324106
11 10470 0 Bronx 14740.0 2.154346e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.87790261636154 40.90555190505731... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.000330
12 10466 0 Bronx 68942.0 5.526249e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.84463568238043 40.90475077819391... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 5.131210
13 10467 0 Bronx 97932.0 6.933617e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.88010836796937 40.89519521108614... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 6.437914
14 10463 0 Bronx 70641.0 3.670338e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.92064636495626 40.88723726005072... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.407886
15 10475 0 Bronx 40931.0 3.863330e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.82722404306942 40.89092964709067... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.587193
16 10464 0 Bronx 4438.0 7.625748e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.81538709355007 40.88939419910433... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 7.080754
17 10469 0 Bronx 65101.0 6.804089e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.85588386083408 40.88386140903516... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 6.317708
18 10468 0 Bronx 72877.0 3.444760e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.88701149926719 40.88247162427221... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.198451
19 10463 0 Bronx 70641.0 3.119702e+06 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.91544105058509 40.87559063041356... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.289662
20 10458 0 Bronx 79362.0 3.596881e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.88480469568044 40.87845532589476... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.339711
21 10034 0 New York 39149.0 2.450389e+07 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.92062094886884 40.87300145746758... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.275160
22 10033 0 New York 54284.0 1.615605e+07 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.93213126543029 40.86945031866858... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 1.500064
23 10462 0 Bronx 75674.0 5.302251e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.87051512140579 40.8569744976658,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 4.923200
24 10040 0 New York 41033.0 1.634074e+07 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.93378898733546 40.86307153891337... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 1.517216
25 10453 0 Bronx 77576.0 2.574851e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.89994956018222 40.85741964281875... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.390729
26 10465 0 Bronx 42012.0 1.084237e+08 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.8391534647799 40.83567473125792,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 10.067423
27 10464 0 Bronx 4438.0 4.512531e+06 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.77200435714323 40.85711605937528... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.419009
28 10464 0 Bronx 4438.0 1.158795e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.79234936875058 40.85607420876183... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 1.075987
29 10461 0 Bronx 50549.0 6.282406e+07 NY Bronx 36 005 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.83739648053641 40.8567843873281,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 5.833332
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
233 10120 1 New York 0.0 3.517927e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98938483295653 40.75000537600457... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.003266
234 10278 1 New York 0.0 2.067060e+05 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.0047581751106 40.71507032820959,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.019192
235 10155 1 New York 0.0 2.478469e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.96843628441711 40.76110640977814... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.002301
236 10043 1 New York 0.0 3.826236e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.00708359659971 40.70386978270482... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.003552
237 10081 1 New York 0.0 3.024051e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.008714773259 40.7074893755098, -... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.002808
238 10096 1 New York 0.0 4.210611e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98301659173394 40.75485102653172... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.003909
239 10097 1 New York 0.0 6.582618e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98521646115499 40.76265216013989... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.006112
240 10196 1 New York 0.0 3.250244e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.9824301517118 40.75680011816393,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.003018
241 10196 1 New York 0.0 3.154825e+03 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98133175627783 40.75673242462563... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.000293
242 10275 1 New York 0.0 4.828042e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.01450964009967 40.70555208010413... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.004483
243 10265 1 New York 0.0 1.722915e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.00806000177114 40.70489246189538... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.001600
244 10045 1 New York 0.0 4.780899e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.00910380250482 40.7087678695075,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.004439
245 10047 1 New York 0.0 1.014978e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98633231048143 40.73957889013666... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.000942
246 10047 1 New York 0.0 1.014978e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98633231048143 40.73957889013666... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.000942
247 10080 1 New York 0.0 7.711145e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.01287666105986 40.71051873297913... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.007159
248 10203 1 New York 0.0 3.722688e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.01193550921882 40.70719828992588... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.003456
249 10259 1 New York 0.0 2.106431e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.01036216523759 40.70876776311058... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.001956
250 10260 1 New York 0.0 5.251474e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.00908463330282 40.70626028948341... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.004876
251 10285 1 New York 0.0 6.735039e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.01476100812657 40.7130865913403,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.006253
252 10286 1 New York 0.0 1.126441e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.00936106898759 40.7064177757752,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.001046
253 10035 0 New York 34884.0 2.349487e+07 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.92741771645892 40.79839394486886... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.181470
254 11371 0 Flushing 0.0 3.055847e+07 NY Queens 36 081 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.88509634398015 40.77846404500434... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.837371
255 11361 0 Bayside 28496.0 5.016352e+07 NY Queens 36 081 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.76641719803655 40.77595272856865... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 4.657903
256 10036 0 New York 23543.0 1.139511e+07 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.98134106854107 40.75864499066453... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 1.057995
257 11414 0 Howard Beach 26148.0 6.392882e+07 NY Queens 36 081 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.85068172809338 40.67164854067394... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 5.935843
258 10310 0 Staten Island 25003.0 5.346328e+07 NY Richmond 36 085 http://www.usps.com/ 0.0 0.0 POLYGON ((-74.12064635323901 40.64104377738636... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 4.963611
259 11693 0 Far Rockaway 11052.0 3.497516e+06 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.8407585999925 40.62536368150901,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.324747
260 11249 0 Brooklyn 28481.0 1.777221e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.95805126567375 40.72442275986043... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 1.650097
261 10162 1 New York 0.0 2.103489e+04 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.95132807302133 40.76931006893747... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.001953
262 10119 1 New York 0.0 1.263930e+05 NY New York 36 061 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.99354429600449 40.75145227487895... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 0.011735
<p>263 rows × 16 columns</p>
In [61]:
from shapely.geometry import MultiPoint, Point
In [ ]:
 
In [62]:
tree_sample_count = 100000
In [ ]:
 
In [63]:
if tree_sample_count < 0:
    tree_sample_count = len(tree_table)
tree_sample_count
Out[63]:
100000
In [64]:
import numpy as np
import random
sample_tree_labels = random.sample(list(tree_table.index), tree_sample_count)
sample_tree_labels = np.array(sample_tree_labels)
sample_tree_labels[:3]
Out[64]:
array([627548, 568510, 224655])
In [65]:
sample_tree_table = tree_table.loc[sample_tree_labels]
sample_tree_table[:3]
Out[65]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
latitude longitude tree_id
627548 40.687093 -73.955420 110661
568510 40.557809 -74.155265 74687
224655 40.647041 -73.976270 340783
In [66]:
sample_tree_xys = sample_tree_table[['longitude', 'latitude']].values
sample_tree_xys[:3]
Out[66]:
array([[-73.95542015,  40.68709277],
       [-74.1552647 ,  40.55780915],
       [-73.97626953,  40.64704107]])
In [67]:
sample_tree_points = [Point(_) for _ in sample_tree_xys]
In [68]:
sample_tree_collection = MultiPoint(sample_tree_points)
In [69]:
def get_tree_count(r):
    g = r['geometry_object']
    if g.area:
        selected_tree_collection = g.intersection(sample_tree_collection)
        if selected_tree_collection.is_empty:
            return 0
        try:
            return len(selected_tree_collection)
        except TypeError:
            return 1
    relative_distances, relative_indices = query_kd_tree(g)
    return len(relative_indices)
In [70]:
from geopy.distance import distance as get_distance

def get_geodesic_distance_in_meters(g1, g2):
    latitude_longitude1 = g1.centroid.coords[:2][::-1]
    latitude_longitude2 = g2.centroid.coords[:2][::-1]
    return get_distance(
        latitude_longitude1,
        latitude_longitude2).meters

def get_sum_distance(r):
    g = r['geometry_object']
    if g.area:
        selected_tree_collection = g.intersection(sample_tree_collection)
        if selected_tree_collection.is_empty:
            return 0
        g_centroid = g.centroid
        return sum(get_geodesic_distance_in_meters(
            g_centroid, _) for _ in selected_tree_collection)
    relative_distances, relative_indices = query_kd_tree(g)
    return sum(relative_distances)
In [71]:
whos
Variable                          Type             Data/Info
------------------------------------------------------------
LONGITUDE_LATITUDE_PROJ4          str              +proj=longlat +datum=WGS84 +no_defs
MultiPoint                        type             <class 'shapely.geometry.multipoint.MultiPoint'>
Point                             type             <class 'shapely.geometry.point.Point'>
array                             ndarray          155x157: 24335 elems, type `float32`, 97340 bytes
buffer_size                       int              100000
download                          function         <function download at 0x7f4a430f4e18>
download_gz                       function         <function download_gz at 0x7f4a40b03730>
download_zip                      function         <function download_zip at 0x7f4a40b036a8>
endpoint_url                      str              https://data.cityofnewyor<...>us/resource/5rq2-4hqu.csv
exists                            function         <function exists at 0x7f4a73529d08>
f                                 function         <function get_transform_s<...>ometry at 0x7f4a074481e0>
geotable                          module           <module 'geotable' from '<...>es/geotable/__init__.py'>
get_distance                      type             <class 'geopy.distance.geodesic'>
get_geodesic_distance_in_meters   function         <function get_geodesic_di<...>meters at 0x7f4a07260d08>
get_pollutant_sum                 function         <function get_pollutant_sum at 0x7f4a072607b8>
get_sum_distance                  function         <function get_sum_distance at 0x7f4a07446d90>
get_transform_shapely_geometry    function         <function get_transform_s<...>ometry at 0x7f4a0a147950>
get_tree_count                    function         <function get_tree_count at 0x7f4a07446f28>
glob                              function         <function glob at 0x7f4a728d8b70>
gzip                              module           <module 'gzip' from '/usr<...>lib64/python3.6/gzip.py'>
load                              function         <function load at 0x7f4a430f4730>
mask                              function         <function mask at 0x7f4a072602f0>
neighborhood_proj4                str              +proj=longlat +datum=WGS84 +no_defs
np                                module           <module 'numpy' from '/ho<...>kages/numpy/__init__.py'>
nyc_zipcode_table                 GeoTable             ZIPCODE BLDGZIP      <...>\n[263 rows x 16 columns]
pd                                module           <module 'pandas' from '/h<...>ages/pandas/__init__.py'>
plt                               module           <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
pollution_proj4                   str              +proj=lcc +lat_1=40.66666<...>units=us-ft +no_defs=True
pollution_raster                  DatasetReader    <open DatasetReader name=<...>o2300m/prj.adf' mode='r'>
random                            module           <module 'random' from '/h<...>b64/python3.6/random.py'>
rasterio                          module           <module 'rasterio' from '<...>es/rasterio/__init__.py'>
sample_tree_collection            MultiPoint       MULTIPOINT (-73.955420149<...>-74.07914629 40.63943313)
sample_tree_labels                ndarray          100000: 100000 elems, type `int64`, 800000 bytes (781.25 kb)
sample_tree_points                list             n=100000
sample_tree_table                 DataFrame                 latitude  longit<...>[100000 rows x 3 columns]
sample_tree_xys                   ndarray          100000x2: 200000 elems, type `float64`, 1600000 bytes (1.52587890625 Mb)
selected_columns                  tuple            n=3
source_folder                     str              /tmp/pollution
source_proj4                      str              +proj=longlat +datum=WGS84 +no_defs
source_sample_count               int              263
source_url                        str              https://data.cityofnewyor<...>8s-8qxv/application%2Fzip
subprocess                        module           <module 'subprocess' from<...>python3.6/subprocess.py'>
t                                 DataFrame                 latitude  longit<...>[683788 rows x 3 columns]
target_geotable                   GeoTable             ZIPCODE BLDGZIP      <...>\n[263 rows x 18 columns]
target_proj4                      str              +proj=lcc +lat_1=40.66666<...>units=us-ft +no_defs=True
tree_sample_count                 int              100000
tree_table                        DataFrame                 latitude  longit<...>[683788 rows x 3 columns]
uncompress                        function         <function uncompress at 0x7f4a40b008c8>
url                               str              https://data.cityofnewyor<...>iw-xf4u/application%2Fzip
urlretrieve                       function         <function urlretrieve at 0x7f4a6d0247b8>
utm_geotable                      GeoTable             ZIPCODE BLDGZIP      <...>\n[263 rows x 16 columns]
utm_proj4                         str              +proj=utm +zone=18 +ellps<...>m=WGS84 +units=m +no_defs
x                                 ndarray          1x155x157: 24335 elems, type `float32`, 97340 bytes
In [72]:
nyc_zipcode_table[:3]
Out[72]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
ZIPCODE BLDGZIP PO_NAME POPULATION AREA STATE COUNTY ST_FIPS CTY_FIPS URL SHAPE_AREA SHAPE_LEN geometry_object geometry_layer geometry_proj4 Area in Sq Km
0 11436 0 Jamaica 18681.0 2.269930e+07 NY Queens 36 081 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.80584847647393 40.68290932644246... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.107694
1 11213 0 Brooklyn 62426.0 2.963100e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.9373976313981 40.6797295892508, ... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.751168
2 11212 0 Brooklyn 83866.0 4.197210e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.90294132545436 40.67083977590008... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.897041
In [73]:
source_sample_count = len(nyc_zipcode_table)

target_geotable = nyc_zipcode_table[:source_sample_count].copy()

target_geotable[
    'Tree Count'
] = target_geotable.apply(get_tree_count, axis=1)

# target_geotable[
#     'Sum of Tree Distances'
# ] = target_geotable.apply(get_sum_distance, axis=1)

# target_geotable[
#     'Average Tree Diameter'
# ] = target_geotable.apply(get_average_diameter, axis=1)

target_geotable[:5]
Out[73]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
ZIPCODE BLDGZIP PO_NAME POPULATION AREA STATE COUNTY ST_FIPS CTY_FIPS URL SHAPE_AREA SHAPE_LEN geometry_object geometry_layer geometry_proj4 Area in Sq Km Tree Count
0 11436 0 Jamaica 18681.0 2.269930e+07 NY Queens 36 081 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.80584847647393 40.68290932644246... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.107694 347
1 11213 0 Brooklyn 62426.0 2.963100e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.9373976313981 40.6797295892508, ... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.751168 570
2 11212 0 Brooklyn 83866.0 4.197210e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.90294132545436 40.67083977590008... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.897041 596
3 11225 0 Brooklyn 56527.0 2.369863e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.9579731604348 40.67065695897565,... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.200345 393
4 11218 0 Brooklyn 72280.0 3.686880e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.97208109564255 40.65059658727607... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.423119 702
In [74]:
# Convert neighborhood geometries into pollution coordinate reference
from geotable.projections import get_transform_shapely_geometry
source_proj4  = geotable.LONGITUDE_LATITUDE_PROJ4
target_proj4 = pollution_proj4
f = get_transform_shapely_geometry(source_proj4, target_proj4)

# For each neighborhood polygon, get sum of nitrogen dioxide particulate matter
from rasterio.mask import mask
In [75]:
def get_pollutant_sum(old_polygon):
    new_polygon = f(old_polygon)
    output_image, output_transform = mask(pollution_raster, [new_polygon], crop=True)
    output_image[output_image < 0] = 0  # Zero null values so that we do not get -inf
    return output_image.sum()

# Drop rows that do not have a geometry
target_geotable.dropna(subset=['geometry_object'], inplace=True)
# Define new column
target_geotable['pollutant_sum'] = target_geotable[
    'geometry_object'].apply(get_pollutant_sum)
In [81]:
target_geotable[:3]
Out[81]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
ZIPCODE BLDGZIP PO_NAME POPULATION AREA STATE COUNTY ST_FIPS CTY_FIPS URL SHAPE_AREA SHAPE_LEN geometry_object geometry_layer geometry_proj4 Area in Sq Km Tree Count pollutant_sum Tree Count Per Sq Ft
0 11436 0 Jamaica 18681.0 2.269930e+07 NY Queens 36 081 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.80584847647393 40.68290932644246... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.107694 347 545.928101 0.000015
1 11213 0 Brooklyn 62426.0 2.963100e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.9373976313981 40.6797295892508, ... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 2.751168 570 762.125000 0.000019
2 11212 0 Brooklyn 83866.0 4.197210e+07 NY Kings 36 047 http://www.usps.com/ 0.0 0.0 POLYGON ((-73.90294132545436 40.67083977590008... ZIP_CODE_040114 +proj=longlat +datum=WGS84 +no_defs 3.897041 596 988.006592 0.000014
In [82]:
target_geotable['Tree Count Per Sq km'] = target_geotable['Tree Count'] / target_geotable['Area in Sq Km']
In [83]:
# target_path = target_folder + '/augmented-dataset.csv'
target_geotable.to_csv("zipCodeAndTreeCountDataset.csv", index=False)
# print(f'augmented_table_path = {target_path}')
In [84]:
# train the model
In [ ]:
 
In [ ]:
 
In [61]:
# for each zipcode gem, get the tree count
# sum the air poll.
#that will be our data set table
In [ ]: