import pandas
import numpy as np
import os
Look at spinach prices from the US food prices dataset.
excelFile = pandas.ExcelFile('datasets/USA-SpinachPrices.xls')
excelFile.sheet_names
organicSpinachFrame = excelFile.parse('organic', index_col=0)
conventionalSpinachFrame = excelFile.parse('conventional', index_col=0)
conventionalSpinachFrame
Rearrange DataFrame into a Series with a MultiIndex.
organicSpinachHierarchicalSeries = organicSpinachFrame.stack()
conventionalSpinachHierarchicalSeries = conventionalSpinachFrame.stack()
conventionalSpinachHierarchicalSeries
Combine year and month into a single index.
organicSpinachHierarchicalSeries.index
organicSpinachHierarchicalSeries.index
from pandas import Timestamp
def parse_indexEntry(indexEntry):
return Timestamp('%d %s' % indexEntry)
parse_indexEntry(organicSpinachHierarchicalSeries.index[0])
def flatten_index(hierarchicalSeries):
return pandas.Series(hierarchicalSeries.values,
index=[parse_indexEntry(x) for x in hierarchicalSeries.index])
organicSpinachSeries = flatten_index(organicSpinachHierarchicalSeries)
conventionalSpinachSeries = flatten_index(conventionalSpinachHierarchicalSeries)
conventionalSpinachSeries[:5]
Compare monthly prices of organic vs conventional spinach.
import pylab as pl
pl.figure(figsize=(12,6))
organicSpinachSeries.plot(label='organic')
conventionalSpinachSeries.plot(label='conventional')
pl.legend()
organicSpinachFrame - conventionalSpinachFrame
organicSpinachSeries.describe()
organicSpinachSeries.describe() - conventionalSpinachSeries.describe()
Look at dates when the price of organic spinach was above average.
subIndex = organicSpinachSeries > organicSpinachSeries.mean()
organicSpinachSeries[subIndex]
Look at the price of conventional spinach for dates when the price of organic spinach was above average.
conventionalSpinachSeries[subIndex]
Was the price of conventional spinach above average on dates when the price of organic spinach was above average?
conventionalSpinachSeries[subIndex] > conventionalSpinachSeries.mean()
Assemble prices and their differences into a table and save it to a spreadsheet.
# Create dataFrame
spinachFrame = pandas.DataFrame.from_items([
('organic', organicSpinachSeries),
('conventional', conventionalSpinachSeries)])
spinachFrame.head()
# Add column of differences
spinachFrame['difference'] = organicSpinachSeries - conventionalSpinachSeries
spinachFrame.head()
# Save it to a spreadsheet
spinachFrame.to_csv(os.path.expandvars('$HOME/Downloads/USA-SpinachPrices.csv'))
Convert spinachFrame from US dollars to Singapore dollars using an exchange rate of 1 USD = 1.28135 SGD.
# Type your solution here and press CTRL-ENTER
Compute the ratio of organic prices to conventional prices.
# Option 1
spinachFrame['organic'] / spinachFrame['conventional']
# Option 2
spinachFrame.apply(lambda row: row['organic'] / row['conventional'], axis=1)
Compute average prices by year.
from pandas.core.datetools import YearEnd
spinachFrame.groupby(YearEnd().rollforward).aggregate(np.mean)
Predict the price of organic spinach given the price of conventional spinach using ordinary least squares.
import statsmodels.api as sm
y = organicSpinachSeries[:-1] # Exclude last price
X = conventionalSpinachSeries[:-1] # Exclude last price
model = sm.OLS(y, X)
results = model.fit()
results.params
results.predict(conventionalSpinachSeries[-1])