The following example is adapted from http://darribas.org/gds_scipy16/ipynb_md/08_spatial_regression.html
For other Airbnb Listing URLs, please see http://insideairbnb.com/get-the-data.html
{ airbnb_listing_url : Airbnb Listing URL ? Specify a URL containing Airbnb Listings }
{ model1_feature_select : Features for Model 1 ? Select features to include in model }
{ model2_feature_select : Features for Model 2 ? Select features to include in model }
# Press the blue paper plane to preview this as a CrossCompute Tool
# Default values for the tool arguments
# Folder where the result file is written
target_folder = '/tmp'
# Gzipped CSV of Airbnb listings to download
airbnb_listing_url = 'http://data.insideairbnb.com/canada/bc/vancouver/2018-11-07/data/listings.csv.gz'
# Each *_feature_select text lists the selected features first, then a blank
# line, then every available feature. get_selected_lines() stops reading at
# the first blank line, so only the features above it are used in a model.
# Without the blank line, every option (including the repeated names) would
# be treated as selected.
model1_feature_select = """
host_listings_count
bathrooms
bedrooms
beds
guests_included

host_acceptance_rate
host_listings_count
host_total_listings_count
accommodates
bathrooms
bedrooms
beds
square_feet
guests_included
minimum_nights
maximum_nights
availability_30
availability_60
availability_90
availability_365
number_of_reviews
review_scores_rating
review_scores_accuracy
review_scores_cleanliness
review_scores_checkin
review_scores_communication
review_scores_location
review_scores_value
calculated_host_listings_count
reviews_per_month
"""
model2_feature_select = """
square_feet
number_of_reviews
review_scores_rating

host_acceptance_rate
host_listings_count
host_total_listings_count
accommodates
bathrooms
bedrooms
beds
square_feet
guests_included
minimum_nights
maximum_nights
availability_30
availability_60
availability_90
availability_365
number_of_reviews
review_scores_rating
review_scores_accuracy
review_scores_cleanliness
review_scores_checkin
review_scores_communication
review_scores_location
review_scores_value
calculated_host_listings_count
reviews_per_month
"""
# Get selected features
def get_selected_lines(select_text):
    """Return the lines of select_text that come before its first blank line.

    The whole text is stripped first, so leading and trailing blank lines are
    ignored. Each remaining line is stripped of surrounding whitespace, and
    reading stops at the first line that is empty after stripping. Lines after
    that point (the list of available options) are not returned.

    Returns a list of strings, which is empty if the text has no content.
    """
    stripped_lines = [
        line.strip() for line in select_text.strip().splitlines()]
    if '' in stripped_lines:
        # Everything after the first blank line is ignored
        return stripped_lines[:stripped_lines.index('')]
    return stripped_lines
# Keep only the features listed before the first blank line of each selection
model1_features, model2_features = (
    get_selected_lines(select_text)
    for select_text in (model1_feature_select, model2_feature_select))
# Enable inline plots
# %matplotlib inline
# Install packages
# import subprocess
# subprocess.call('pip install -U pysal'.split())
import pandas as pd
import numpy as np
# Download listings
def download(target_path, source_url):
    """Save the content found at source_url to target_path.

    Returns target_path so that the call can be used inline.
    Errors raised by urlretrieve are not caught here.
    """
    from urllib.request import urlretrieve
    urlretrieve(source_url, target_path)
    return target_path
"""
source_url = (
'http://data.insideairbnb.com/united-states/ny/new-york-city/'
'2018-12-06/data/listings.csv.gz')
"""
# Download the gzipped listings to a fixed location
source_archive_path = '/tmp/listings.csv.gz'
source_path = '/tmp/listings.csv'
download(source_archive_path, airbnb_listing_url)
# Unpack gzip archive
import gzip
import shutil
# Stream the decompressed bytes to disk in chunks; both files are closed
# when the blocks exit, even if copying fails part way through
with gzip.open(source_archive_path, 'rb') as archive_file:
    with open(source_path, 'wb') as csv_file:
        shutil.copyfileobj(archive_file, csv_file)
import pandas as pd
# Load the unpacked listings CSV into a table
t = pd.read_csv(source_path)
# Look at the first listing and at the type of each column
# (bare expressions like these are presumably notebook cell outputs)
t.iloc[0]
t.dtypes
# Select columns that have all numerical values
numerics = 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'
selected_t = t.select_dtypes(include=numerics)
selected_t.dtypes
# List every column name in sorted order
sorted(t.columns)
import numpy as np
# Keep the Model 1 features plus price and coordinates,
# then drop every listing that is missing any of those values
model1_columns = model1_features + ['price', 'longitude', 'latitude']
selected_t = t[model1_columns].dropna()
selected_t.head()
# The value to predict is the natural log of the price.
# Prices are text, so the dollar sign and thousands separators are removed
# before converting to a number.
price_amounts = selected_t['price'].apply(
    lambda price_text: float(price_text.strip('$').replace(',', '')))
# The small offset is presumably there to avoid taking the log of zero
y = np.log(price_amounts + 0.000001)
y[:5]
# Coordinates of each listing, as (longitude, latitude) rows
xys = selected_t[['longitude', 'latitude']].values
xys[:5]
# Index the listing coordinates in a k-d tree that uses the 'Arc' distance
# metric with RADIUS_EARTH_KM as its radius
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM
kd_tree = KDTree(xys, distance_metric='Arc', radius=RADIUS_EARTH_KM)
# Prepare spatial weights
from pysal.lib.weights import KNN
# Build k-nearest-neighbor weights from the tree with k=2
w = KNN(kd_tree, k=2)
# NOTE(review): 'R' should be row-standardization per the libpysal docs -- confirm
w.set_transform('R')
w
# Every selected column except price serves as an explanatory variable
x_table = selected_t.drop('price', axis=1)
# Options shared by both models below
model_options = dict(w=w, spat_diag=True, name_y='ln(price)')
# Fit model using ordinary least squares
from pysal.model.spreg import OLS
model1 = OLS(
    y.values[:, None],
    x_table.values,
    name_x=x_table.columns.tolist(),
    **model_options)
print(model1.summary)
# Here is an example model that tries to predict listing price
# based on whether NEARBY listings have high prices
from pysal.model.spreg import GM_Lag
model2 = GM_Lag(
    y.values[:, None],
    x_table.values,
    name_x=x_table.columns.tolist(),
    **model_options)
print(model2.summary)
model2.betas
from sklearn.metrics import mean_squared_error as mse
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM
from pysal.lib.weights import KNN
from pysal.model.spreg import GM_Lag, OLS
# Fit an OLS model and a spatial lag model for each feature selection and
# collect the mean squared error and coefficients of each model as text lines
result_lines = []
for features in [model1_features, model2_features]:
    # Limit table to selected columns and drop listings with missing values
    selected_t = t[features + ['price', 'longitude', 'latitude']].dropna()
    # Prepare target value we want to predict
    y = np.log(selected_t['price'].apply(
        lambda x: float(x.strip('$').replace(',', ''))) + 0.000001)
    # Every selected column except price is an explanatory variable,
    # which includes longitude and latitude as well as the chosen features
    x_table = selected_t.drop('price', axis=1)
    x_names = x_table.columns.tolist()
    # Prepare spatial weights
    xys = selected_t[['longitude', 'latitude']].values
    kd_tree = KDTree(xys, distance_metric='Arc', radius=RADIUS_EARTH_KM)
    w = KNN(kd_tree, k=2)
    w.set_transform('R')
    # Fit using ordinary least squares
    ols_model = OLS(
        y.values[:, None],
        x_table.values,
        w=w,
        spat_diag=True,
        name_x=x_names,
        name_y='ln(price)')
    mean_squared_error = mse(y, ols_model.predy.flatten())
    result_lines.append('OLS Model Mean Squared Error %s' % mean_squared_error)
    # Label coefficients with the model's own variable names. spreg adds a
    # constant term ahead of the supplied columns, so pairing betas with
    # `features` would shift every label by one and drop the last ones.
    for name, coefficient in zip(ols_model.name_x, ols_model.betas.flatten()):
        result_lines.append('%s %s' % (coefficient, name))
    result_lines.append('')
    # Fit using spatial lag model
    lag_model = GM_Lag(
        y.values[:, None],
        x_table.values,
        w=w,
        spat_diag=True,
        name_x=x_names,
        name_y='ln(price)')
    mean_squared_error = mse(y, lag_model.predy_e.flatten())
    result_lines.append('LAG Model Mean Squared Error %s' % mean_squared_error)
    # Report the lag model's coefficients (not the OLS ones). name_z lists
    # the exogenous variables followed by the spatially lagged target.
    for name, coefficient in zip(lag_model.name_z, lag_model.betas.flatten()):
        result_lines.append('%s %s' % (coefficient, name))
    result_lines.append('')
from os.path import join
# Save the summary in the target folder, then print where it was saved
# along with the summary itself
result_text = '\n'.join(result_lines)
result_text_path = join(target_folder, 'result.txt')
# Close the file explicitly so the text is flushed to disk before the path
# is announced
with open(result_text_path, 'wt') as result_file:
    result_file.write(result_text)
print('result_text_path = %s' % result_text_path)
print(result_text)
{ result_text : Result Summary ? Review mean squared error and coefficients }