from pandas import DataFrame, Series, ExcelFile, concat, read_csv
excelFile = ExcelFile('datasets/Economist-BigMacIndex-201207.xls')
sheetName = excelFile.sheet_names[0]
economistBigMacPrices = excelFile.parse(sheetName)
economistBigMacPrices.tail()
worldBankContracts = read_csv('datasets/WorldBank-ContractAwards.csv')
# Examine first row
worldBankContracts.ix[0]
# Parse currency
def parse_currency(x):
return float(x.replace('$', ''))
worldBankContracts['Contract Amount'] = worldBankContracts['Total Contract Amount (USD)'].apply(parse_currency)
# Extract columns we will use
worldBankContracts = worldBankContracts[[
'Borrower Country',
'Supplier Country',
'Procurement Type',
'Contract Amount',
]]
# Preview table
worldBankContracts.head()
def get_countries():
return worldBankContracts[[
'Borrower Country',
'Supplier Country',
]].unstack().unique()
def get_unmatched_countries(countries):
return set(countries) - set(get_countries())
# Check whether the Economist dataset contains countries
# that are not in the World Bank dataset
get_unmatched_countries(economistBigMacPrices['Country'])
from pandas import read_csv
countryNames = read_csv('datasets/World-CountryNames.csv')
countryNames
# Translate world bank country names from formal to informal
for rowIndex, row in countryNames.iterrows():
worldBankContracts.replace(row['Formal'], row['Informal'], inplace=True)
# Define features
def compute_country_role(country, role):
contracts = worldBankContracts[worldBankContracts[role + ' Country'] == country]
# Count contracts by procurement type
countsByType = contracts['Procurement Type'].value_counts()
countsByType.index = [role + 'Count(%s)' % _ for _ in countsByType.index]
# Sum contracts by procurement type
sumsByType = contracts.groupby('Procurement Type').sum()['Contract Amount']
sumsByType.index = [role + 'Sum(%s)' % _ for _ in sumsByType.index]
# Assemble
return concat([
Series({
# Count contracts
role + 'Count': len(contracts),
# Sum contracts
role + 'Sum': sum(contracts['Contract Amount']),
}),
countsByType,
sumsByType,
])
def compute_country(country):
return concat([
compute_country_role(country, 'Borrower'),
compute_country_role(country, 'Supplier'),
])
# Build dataset
def compute_row(economistBigMacRow):
country = economistBigMacRow['Country']
price = economistBigMacRow['USD']
return concat([
Series({'Country': country}),
compute_country(country),
Series({'Price': price}),
])
dataset = economistBigMacPrices.apply(compute_row, axis=1).fillna(0)
euCountries = [_.strip() for _ in open('datasets/EuropeanUnion-MemberStates.txt')]
# Check whether the EU dataset contains countries
# that are not in the World Bank dataset
get_unmatched_countries(euCountries)
# Compute features for Euro area
euStatistics = concat([compute_country(_) for _ in euCountries], axis=1).fillna(0)
euSummary = euStatistics.sum(axis=1)
# Update features for Euro area
euIndex = (dataset['Country'] == 'Euro area').argmax()
dataset.ix[euIndex] = euSummary.combine_first(dataset.ix[euIndex])
# Check that we have all the Big Mac countries in the dataset
all(dataset['Country'] == economistBigMacPrices['Country'])
import numpy as np
from sklearn.cross_validation import cross_val_score
def score_features(model, featureColumns):
return np.mean(cross_val_score(
model,
dataset[featureColumns],
dataset['Price']))
def score_models(models):
featureCombinations = [
['BorrowerSum'],
['BorrowerCount'],
['BorrowerSum', 'BorrowerCount'],
['SupplierSum'],
['SupplierCount'],
['SupplierSum', 'SupplierCount'],
['BorrowerSum', 'SupplierSum'],
['BorrowerCount', 'SupplierCount'],
['BorrowerCount', 'BorrowerSum', 'SupplierCount', 'SupplierSum'],
[_ for _ in dataset.columns if _ not in ['Country', 'Price']],
]
featureCombinationNames = [', '.join(_)[:30] for _ in featureCombinations]
modelNames = [model.__class__.__name__ for model in models]
results = []
for model in models:
results.append([score_features(model, _) for _ in featureCombinations])
return DataFrame(results,
index=modelNames,
columns=featureCombinationNames).transpose()
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
score_models([
LinearRegression(),
SVR(),
Pipeline([
('Normalizer', Normalizer()),
('Model', SVR()),
]),
Pipeline([
('Decomposer', PCA()),
('Model', SVR()),
]),
])
selectedColumns = ['BorrowerSum', 'SupplierSum']
# Fit model
model = SVR()
model.fit(dataset[selectedColumns], dataset['Price'])
# Estimate Big Mac prices in countries not included in original Big Mac index
model.predict(compute_country('Bolivia')[selectedColumns])
# Batch estimate
selectedCountries = get_countries()
selectedStatistics = concat((compute_country(_) for _ in selectedCountries), axis=1).transpose()[selectedColumns]
selectedPrices = model.predict(selectedStatistics)
selectedRankings = sorted(zip(selectedPrices, selectedCountries))
# Find the top ten countries that will give us the best deal for a Big Mac
selectedRankings[:10]