import numpy as np
from scripts import make_shirts
# Load the 500-record t-shirt dataset from the project-local helper.
# NOTE(review): assumes make_shirts() returns a scikit-learn Bunch-like
# object — the .data / .target / .feature_names accesses below rely on it;
# confirm against scripts.make_shirts.
shirts = make_shirts()
Stacy runs an online custom t-shirt business. She is experimenting with layout design to increase sales. The different layout tweaks she has tried appear below as the dataset's feature names.
Since budget is limited, Stacy wants to focus on the layout enhancements that actually affect sales. Please rank the layout enhancements based on 500 product records.
# Look at the first record: pair each layout-feature name with its value
# for product 0.
# NOTE(review): this bare expression relies on notebook/REPL echo to
# display its result; the file is Python 2 style (print statements,
# xrange below), where zip() returns a list rather than a lazy iterator.
zip(shirts.feature_names, shirts.data[0])
# Check whether the first product sold
# (presumably target is 1 = sold, 0 = not sold — TODO confirm label
# encoding against make_shirts)
print shirts.target[0]
Count the number of shirts that sold.
# Type your solution here and press CTRL-ENTER
Compare price histograms between shirts that sold and shirts that didn't sell.
# Type your solution here and press CTRL-ENTER
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.metrics import zero_one


def evaluate_model(model):
    """Return the mean zero-one loss of *model* over stratified 3-fold
    cross-validation on the shirts data.

    `zero_one` counts misclassified samples, so this is a LOSS: lower
    values mean a better classifier.

    NOTE(review): sklearn.cross_validation, metrics.zero_one and the
    score_func=/cv=StratifiedKFold(y, k) call forms are pre-0.18 APIs;
    modern scikit-learn uses sklearn.model_selection, zero_one_loss and
    the scoring= parameter.
    """
    # Body indentation was missing in the original file (extraction
    # artifact) and has been restored here.
    return np.mean(cross_val_score(
        model,
        shirts.data,
        shirts.target,
        score_func=zero_one,                   # misclassification count per fold
        cv=StratifiedKFold(shirts.target, 3),  # keep class balance in each fold
        n_jobs=-1))                            # use all CPU cores
from sklearn.naive_bayes import GaussianNB
# Mean zero-one loss of a Gaussian Naive Bayes baseline.
# (Bare expression: relies on notebook/REPL echo to display the value.)
evaluate_model(GaussianNB())
from sklearn.neighbors import KNeighborsClassifier
# Same evaluation for a k-nearest-neighbors classifier, for comparison.
evaluate_model(KNeighborsClassifier())
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Candidate classifiers, all with default hyperparameters.
models = [
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    LogisticRegression(),
    GaussianNB(),
    SVC(),
]

# evaluate_model() returns the mean zero-one LOSS (misclassification
# count — see the '# of misclassifications' plot label below), so the
# best model is the one with the LOWEST score.
# BUG FIX: the original started from bestScore = 0 and kept the model
# with the highest loss, i.e. it selected the WORST classifier.
bestScore = float('inf')
bestModel = None
for model in models:
    score = evaluate_model(model)
    if score < bestScore:
        bestScore = score
        bestModel = model
# print(x) with a single argument behaves identically under Python 2.
print(bestModel)
print(bestScore)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination: repeatedly fit a logistic regression and
# drop the weakest feature (step=1) until one remains, recording the order
# in which features were eliminated.
featureSelector = RFE(estimator=LogisticRegression(), n_features_to_select=1, step=1)
featureSelector.fit(shirts.data, shirts.target)
# Rank 1 = most important feature.
# (Bare expression: relies on notebook/REPL echo to display the list.)
sorted(zip(featureSelector.ranking_, shirts.feature_names))
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import zero_one

# Cross-validated recursive feature elimination: drop one feature per
# round (step=1) and score every feature-count by zero-one loss under
# stratified 3-fold CV, letting RFECV pick the optimal feature count.
selector = RFECV(
    estimator=LogisticRegression(),
    step=1,
    cv=StratifiedKFold(shirts.target, 3),
    loss_func=zero_one)
selector.fit(shirts.data, shirts.target)

# Plot number of features against cross-validation scores
import pylab as pl
pl.figure()
pl.xlabel('# of features selected')
pl.ylabel('# of misclassifications')
feature_counts = xrange(1, len(selector.cv_scores_) + 1)
pl.plot(feature_counts, selector.cv_scores_)
pl.show()

# Report the chosen feature count and the top-ranked features.
print('Optimal number of features = %d' % selector.n_features_)
ranked_features = sorted(zip(selector.ranking_, shirts.feature_names))
print(ranked_features[:selector.n_features_])