Introduction to Computational Analysis




Pay Notebook Creator: Roy Hyunjin Han0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [1]:
from pandas import read_csv
# Read the three UN tables from the local datasets folder.
alcohol = read_csv('datasets/UN-Alcohol.csv')
chocolate = read_csv('datasets/UN-Chocolate.csv')
# For the marriage table, discard the footnote column first and then
# every row that still contains a missing value.
marriage = (
    read_csv('datasets/UN-Marriage.csv')
    .drop('Value Footnotes', axis=1)
    .dropna()
)
In [2]:
# Give all three tables the same short column names. rename() ignores
# mapping keys that a table does not have, so one mapping serves them all.
for table in (alcohol, chocolate, marriage):
    table.rename(
        columns={'Country or Area': 'Country', 'Comm. Code': 'CommodityCode'},
        inplace=True)
In [3]:
# Keep only trade rows at or above each commodity-code threshold and
# narrow every table down to the columns used later in the notebook.
alcohol = alcohol.loc[
    alcohol['CommodityCode'] >= 220300,
    ['Country', 'CommodityCode', 'Commodity', 'Flow', 'Quantity']]
chocolate = chocolate.loc[
    chocolate['CommodityCode'] >= 180620,
    ['Country', 'CommodityCode', 'Commodity', 'Flow', 'Quantity']]
marriage = marriage.loc[:, ['Country', 'Subgroup', 'Value']]
In [4]:
# Show the first remaining alcohol row. `.ix` was removed from pandas,
# so use positional indexing instead.
alcohol.iloc[0]
In [5]:
# One row per commodity code, ordered by code. `DataFrame.sort` was
# removed from pandas; `sort_values` is its replacement.
alcohol[['CommodityCode', 'Commodity']].drop_duplicates('CommodityCode').sort_values('CommodityCode')
In [6]:
# Show the first remaining chocolate row. `.ix` was removed from pandas,
# so use positional indexing instead.
chocolate.iloc[0]
In [7]:
# One row per commodity code, ordered by code. `DataFrame.sort` was
# removed from pandas; `sort_values` is its replacement.
chocolate[['CommodityCode', 'Commodity']].drop_duplicates('CommodityCode').sort_values('CommodityCode')
In [8]:
# Show the first remaining marriage row. `.ix` was removed from pandas,
# so use positional indexing instead.
marriage.iloc[0]
In [9]:
# Average the numeric columns per subgroup. The frame still carries the
# string column 'Country', which recent pandas refuses to average, so
# restrict the aggregation to numeric columns explicitly.
marriage.groupby('Subgroup').mean(numeric_only=True)
In [10]:
# Reshape to one row per country and one column per subgroup.
# The arguments are passed by keyword because recent pandas no longer
# accepts them positionally.
marriagePivot = marriage.pivot(index='Country', columns='Subgroup', values='Value')
marriagePivot
In [11]:
from pandas import Series

def compute_datasetRow(row):
    """Build the combined trade/marriage record for one country.

    Parameters
    ----------
    row : pandas.Series
        One row of ``marriagePivot``. ``row.name`` is used as the country
        and the row must provide the 'Female' and 'Male' entries.

    Returns
    -------
    pandas.Series
        Named after the country, holding the summed alcohol and chocolate
        quantities for the 'Import' and 'Export' flows (0 when the country
        has no rows for that flow) together with the two marriage values.

    Notes
    -----
    Reads the module-level ``alcohol`` and ``chocolate`` tables.
    """
    country = row.name

    def sum_by_flow(trades):
        # Pick the Quantity column *before* aggregating so that only the
        # numeric column is summed; summing the whole frame would also try
        # to aggregate the string columns (Country, Commodity).
        countryTrades = trades[trades['Country'] == country]
        return countryTrades.groupby('Flow')['Quantity'].sum()

    alcoholFlow = sum_by_flow(alcohol)
    chocolateFlow = sum_by_flow(chocolate)
    return Series(dict(
        AlcoholImported=alcoholFlow.get('Import', 0),
        AlcoholExported=alcoholFlow.get('Export', 0),
        ChocolateImported=chocolateFlow.get('Import', 0),
        ChocolateExported=chocolateFlow.get('Export', 0),
        MarriageAgeFemale=row['Female'],
        MarriageAgeMale=row['Male']), name=country)

# Apply the row builder to every country (row-wise) of the pivot table.
dataset = marriagePivot.apply(compute_datasetRow, axis='columns')
In [12]:
# Index label (country) of the row with the largest AlcoholImported value.
dataset['AlcoholImported'].idxmax()
In [13]:
# Five largest AlcoholImported values. `Series.order` was removed from
# pandas; `sort_values` is its replacement.
dataset.AlcoholImported.sort_values(ascending=False).head(5)
In [14]:
import numpy as np
from pandas import DataFrame
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import cross_val_score

# Every dataset column except the 'Marriage*' ones serves as a feature.
featureColumns = [
    columnName for columnName in dataset.columns
    if not columnName.startswith('Marriage')]

def score_model(model, targetColumn):
    """Return the mean cross-validation score of `model` for one target.

    Rows of the module-level ``dataset`` that have a missing value in any
    feature column or in `targetColumn` are dropped before scoring. The
    scoring metric and fold count are whatever ``cross_val_score`` uses
    by default.
    """
    usedColumns = featureColumns + [targetColumn]
    completeRows = dataset[usedColumns].dropna()
    features = completeRows[featureColumns]
    target = completeRows[targetColumn]
    foldScores = cross_val_score(model, features, target)
    return np.mean(foldScores)

def score_models(models):
    """Score each model against both marriage targets.

    Returns a DataFrame with one row per model, labelled by the model's
    class name, and the columns 'Female' and 'Male' holding the
    corresponding ``score_model`` results.
    """
    rowLabels = [type(model).__name__ for model in models]
    targetColumns = ('MarriageAgeFemale', 'MarriageAgeMale')
    scoreRows = [
        [score_model(model, targetColumn) for targetColumn in targetColumns]
        for model in models]
    return DataFrame(scoreRows, index=rowLabels, columns=['Female', 'Male'])
In [15]:
# Compare a linear regression, a plain SVR and an SVR that is fed
# standardized features.
score_models([
    LinearRegression(),
    SVR(),
    Pipeline(steps=[('StandardScaler', StandardScaler()), ('Model', SVR())]),
])
In [16]:
from sklearn.feature_selection import RFECV
                    
def rank_features(model, targetColumn):
    """Rank the feature columns for predicting `targetColumn`.

    Fits an ``RFECV`` selector wrapped around `model` on the rows of the
    module-level ``dataset`` that have no missing value in the feature
    columns or in `targetColumn`, and returns a sorted list of
    ``(rank, featureName)`` tuples.
    """
    usedColumns = featureColumns + [targetColumn]
    completeRows = dataset[usedColumns].dropna()
    selector = RFECV(model)
    selector.fit(completeRows[featureColumns], completeRows[targetColumn])
    rankedFeatures = list(zip(selector.ranking_, featureColumns))
    rankedFeatures.sort()
    return rankedFeatures
In [17]:
# Feature ranking for the female marriage-age target.
rank_features(model=LinearRegression(), targetColumn='MarriageAgeFemale')
In [18]:
# Feature ranking for the male marriage-age target.
rank_features(model=LinearRegression(), targetColumn='MarriageAgeMale')