"""
from pandas import HDFStore, read_csv
def load_csv(path):
return read_csv(path, index_col=1, parse_dates=True)[[
'Agency',
'Agency Name',
'Complaint Type',
'Borough',
'X Coordinate (State Plane)',
'Y Coordinate (State Plane)',
]].sort()
store = HDFStore('311-20111030-20111105.h5')
store['issues'] = load_csv('311-20111030-20111105.csv')
"""
from pandas import HDFStore

# Load the week of NYC 311 service requests prepared by the script preserved
# in the module docstring above.
store = HDFStore('datasets/NYC-311-ServiceRequests.h5')
issues = store['issues']
# Peek at the first record. (.ix was removed from pandas; .iloc is the
# positional replacement.)
issues.iloc[0]
# How many 311 issues were reported that week?
len(issues)
# How many issues were reported on Halloween? (label-based slice on the
# DatetimeIndex — .loc replaces the removed .ix)
len(issues.loc['2011-10-31'])
# What were the top five categories reported that week?
issues['Complaint Type'].value_counts()[:5]
# What was the daily distribution of issues?
# resample(..., how=len) was removed in pandas 0.18+; .resample('D').count()
# is the equivalent modern spelling.
issues['Complaint Type'].resample('D').count().plot();
# How did the categorical distribution of issues differ between Brooklyn and the Bronx?
# How did the categorical distribution of issues differ between Brooklyn and the Bronx?
def get_borough_counts(borough):
    """Return complaint-type frequencies for the given borough name."""
    return issues[issues.Borough == borough]['Complaint Type'].value_counts()

brooklyn = get_borough_counts('BROOKLYN')
bronx = get_borough_counts('BRONX')
# Positive entries: more common in Brooklyn; negative: more common in the Bronx.
# Series.order() was removed in pandas 0.20; sort_values() is its replacement.
difference = brooklyn.sub(bronx, fill_value=0).sort_values()
print(difference[:3])
print()
print(difference[-3:])
# How many issues were reported in each borough?
issues.groupby('Borough')['Complaint Type'].count()
# What was the spatial distribution of complaints?
# What was the spatial distribution of complaints?
import numpy as np  # BUG FIX: np was used below but never imported anywhere
import pylab as pl
from scipy.ndimage import gaussian_filter

points = issues[[
'X Coordinate (State Plane)',
'Y Coordinate (State Plane)',
]]
points.index = range(len(points))
# Bin the State Plane coordinates into 500-unit cells.  Cast to int so the
# binned values can be used as array shape/indices — float indexing raises
# IndexError in modern numpy.
points = (points.dropna() / 500).astype(int)
minX, minY = points.min().values
maxX, maxY = points.max().values
# Accumulate a 2-D histogram of complaint locations on the binned grid.
image = np.zeros((maxY - minY + 1, maxX - minX + 1))
for x, y in points.values:
    image[y - minY, x - minX] += 1
# Smooth the counts with an isotropic Gaussian so isolated points render
# as visible blobs, then display with the y-axis pointing north.
image = gaussian_filter(image, (1, 1))
pl.imshow(image, origin='lower');