from pandas import read_csv
# Load the three UN tables. The marriage table loses its footnote column and
# every row that still has a missing value.
alcohol = read_csv('datasets/UN-Alcohol.csv')
chocolate = read_csv('datasets/UN-Chocolate.csv')
marriage = read_csv('datasets/UN-Marriage.csv')
marriage = marriage.drop('Value Footnotes', axis=1).dropna()

# Give the columns shorter names; a table that lacks one of these columns is
# simply left unchanged for that entry.
_SHORT_COLUMN_NAMES = {
    'Country or Area': 'Country',
    'Comm. Code': 'CommodityCode',
}
for table in (alcohol, chocolate, marriage):
    table.rename(columns=_SHORT_COLUMN_NAMES, inplace=True)

# Keep only the rows at or above a commodity-code threshold, and only the
# columns used further down.
_TRADE_COLUMNS = ['Country', 'CommodityCode', 'Commodity', 'Flow', 'Quantity']
alcohol = alcohol.loc[alcohol['CommodityCode'] >= 220300, _TRADE_COLUMNS]
chocolate = chocolate.loc[chocolate['CommodityCode'] >= 180620, _TRADE_COLUMNS]
marriage = marriage.loc[:, ['Country', 'Subgroup', 'Value']]
# Peek at the first row of each table and list the distinct commodities.
# `.ix` and `DataFrame.sort` no longer exist in current pandas, so positional
# `.iloc` and `sort_values` are used instead.
alcohol.iloc[0]
alcohol[['CommodityCode', 'Commodity']].drop_duplicates(subset='CommodityCode').sort_values('CommodityCode')
chocolate.iloc[0]
chocolate[['CommodityCode', 'Commodity']].drop_duplicates(subset='CommodityCode').sort_values('CommodityCode')
marriage.iloc[0]
# Average only the numeric Value column; the string Country column cannot be
# averaged.
marriage.groupby('Subgroup')[['Value']].mean()
# One row per country, one column per subgroup. Keyword arguments are used
# because recent pandas releases no longer accept them positionally.
marriagePivot = marriage.pivot(index='Country', columns='Subgroup', values='Value')
marriagePivot
from pandas import Series
def _quantity_by_flow(trades, country):
    """Return the total Quantity per Flow value for one country's trades."""
    countryTrades = trades[trades['Country'] == country]
    # Select Quantity before aggregating so that only this column is summed;
    # summing the whole frame first also processes the text columns.
    return countryTrades.groupby('Flow')['Quantity'].sum()


def compute_datasetRow(row):
    """Build the dataset row for the country named by `row`.

    `row` is one row of the marriage pivot table: its `name` is the country
    and it must have 'Female' and 'Male' entries. Trade quantities are read
    from the module-level `alcohol` and `chocolate` tables.

    Returns a Series named after the country holding the imported and
    exported quantities of alcohol and chocolate (0 when the country has no
    trades with that flow) together with the two marriage values.
    """
    country = row.name
    alcoholFlow = _quantity_by_flow(alcohol, country)
    chocolateFlow = _quantity_by_flow(chocolate, country)
    return Series(dict(
        AlcoholImported=alcoholFlow.get('Import', 0),
        AlcoholExported=alcoholFlow.get('Export', 0),
        ChocolateImported=chocolateFlow.get('Import', 0),
        ChocolateExported=chocolateFlow.get('Export', 0),
        MarriageAgeFemale=row['Female'],
        MarriageAgeMale=row['Male']), name=country)
# Build one dataset row per country in the marriage pivot table.
dataset = marriagePivot.apply(compute_datasetRow, axis=1)
# Country with the largest imported alcohol quantity.
dataset.AlcoholImported.idxmax()
# Five largest alcohol importers. `Series.order` has been removed from pandas;
# `sort_values` is its replacement.
dataset.AlcoholImported.sort_values(ascending=False).head(5)
import numpy as np
from pandas import DataFrame
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
# Every dataset column whose name does not start with 'Marriage' is a feature.
featureColumns = [
    column
    for column in dataset.columns
    if not column.startswith('Marriage')
]
def score_model(model, targetColumn):
    """Return the mean cross-validation score of `model` for `targetColumn`.

    Rows with a missing value in any feature column or in the target column
    are dropped before scoring.
    """
    selectedColumns = featureColumns + [targetColumn]
    completeRows = dataset[selectedColumns].dropna()
    features = completeRows[featureColumns]
    target = completeRows[targetColumn]
    foldScores = cross_val_score(model, features, target)
    return np.mean(foldScores)
def score_models(models):
    """Score every model against both marriage-age target columns.

    Returns a DataFrame with one row per model, labelled with the model's
    class name, and the columns 'Female' and 'Male' holding the scores for
    'MarriageAgeFemale' and 'MarriageAgeMale' respectively.
    """
    rowLabels = [model.__class__.__name__ for model in models]
    targetColumns = ('MarriageAgeFemale', 'MarriageAgeMale')
    scoreRows = [
        [score_model(model, targetColumn) for targetColumn in targetColumns]
        for model in models
    ]
    return DataFrame(scoreRows, index=rowLabels, columns=['Female', 'Male'])
# Compare a linear regression, a plain SVR, and an SVR whose inputs are
# standardized first.
scaledSvr = Pipeline([
    ('StandardScaler', StandardScaler()),
    ('Model', SVR()),
])
score_models([LinearRegression(), SVR(), scaledSvr])
from sklearn.feature_selection import RFECV
def rank_features(model, targetColumn):
    """Rank the feature columns for predicting `targetColumn` with `model`.

    An RFECV selector wrapping `model` is fitted on the rows that have no
    missing value in the feature columns or the target column. Returns a list
    of (rank, column name) pairs sorted by rank, then by column name.
    """
    completeRows = dataset[featureColumns + [targetColumn]].dropna()
    selector = RFECV(model)
    selector.fit(completeRows[featureColumns], completeRows[targetColumn])
    rankedColumns = list(zip(selector.ranking_, featureColumns))
    rankedColumns.sort()
    return rankedColumns
# Rank the features for each marriage-age target using a fresh linear model.
for marriageColumn in ('MarriageAgeFemale', 'MarriageAgeMale'):
    rank_features(LinearRegression(), marriageColumn)