Here are different ways to load NYC Open Data.
url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
assert url.endswith('.csv'), 'Use CSV endpoint because it is faster'
import pandas as pd
pd.read_csv(url + '?$select=count(*)')
t = pd.read_csv(url + '?$limit=1')
t.iloc[0]
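To see which columns are available for $select and $where, you can list the columns of the single-row preview (a quick check that reuses t from the cell above).
sorted(t.columns)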
pd.read_csv(url + "?$select=count(*)&$where=status!='Alive'")
pd.read_csv(url + '?$select=tree_id,latitude,longitude&$limit=3')
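Another way is to read the JSON endpoint with pd.read_json. This is a sketch that assumes the same dataset id with a .json suffix; note that a Socrata endpoint returns at most 1000 rows unless you set $limit explicitly.
pd.read_json('https://data.cityofnewyork.us/resource/nwxe-4ae8.json?$select=tree_id,latitude,longitude&$limit=3')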
Use buffering if you would like to load a large dataset but memory is limited (i.e. you keep getting a message that the kernel died).
total_count = pd.read_csv(url + '?$select=count(*)')['count'][0]
total_count
import pandas as pd
def load(
        endpoint_url,
        selected_columns=None,
        buffer_size=1000,
        search_term_by_column=None,
        **kw,
):
    # Request one buffer of rows at a time
    buffer_url = f'{endpoint_url}?$limit={buffer_size}'
    # Note: add an &$order clause if you need stable paging across requests
    if selected_columns:
        buffer_url += '&$select=' + ','.join(selected_columns)
    # SoQL string literals use single quotes; %25 is the url-encoded % wildcard
    where_clauses = [
        f"{column}+like+'%25{search_term}%25'"
        for column, search_term in (search_term_by_column or {}).items()]
    if where_clauses:
        buffer_url += '&$where=' + '+AND+'.join(where_clauses)
    print(buffer_url)
    # Choose the reader that matches the endpoint format
    f = pd.read_json if endpoint_url.endswith('.json') else pd.read_csv
    tables = []
    t = f(buffer_url, **kw)
    while len(t):
        # Report the running row count, then fetch the next buffer
        print(len(tables) * buffer_size + len(t))
        tables.append(t)
        offset = buffer_size * len(tables)
        t = f(buffer_url + f'&$offset={offset}', **kw)
    return pd.concat(tables, ignore_index=True, sort=False)
endpoint_url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.csv'
selected_columns = 'tree_id', 'latitude', 'longitude'
buffer_size = 100000
t = load(endpoint_url, selected_columns, buffer_size)
assert len(t) == total_count
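To use the search_term_by_column argument, pass a dict that maps a column to a substring for a SoQL like filter. This sketch assumes the dataset has a spc_common column holding the species common name.
maple_table = load(
    endpoint_url,
    selected_columns + ('spc_common',),
    search_term_by_column={'spc_common': 'maple'})
len(maple_table)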