import numpy as np
from scripts import make_shirts
# Load the t-shirt sales dataset. NOTE(review): presumably a scikit-learn
# style Bunch -- code below reads .data, .target and .feature_names.
shirts = make_shirts()
Stacy runs an online custom t-shirt business. She is experimenting with layout design to increase sales; the different layout tweaks she has tried are recorded as the features of the dataset below.
Since the budget is limited, Stacy wants to focus on the layout enhancements that actually affect sales. Please rank the layout enhancements by their impact on sales, using the 500 product records provided.
# Look at the first record
# NOTE(review): this file appears to be a notebook export (Python 2 syntax,
# bare expressions that rely on the REPL/notebook to display their value).
# As a plain script, the result of this zip() is discarded.
zip(shirts.feature_names, shirts.data[0])
# Check whether the first product sold
# Python 2 print statement; target[0] is presumably a 0/1 sold flag -- confirm.
print shirts.target[0]
Count the number of shirts that sold.
# Type your solution here and press CTRL-ENTER
Compare price histograms between shirts that sold and shirts that didn't sell.
# Type your solution here and press CTRL-ENTER
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.metrics import zero_one
def evaluate_model(model):
    """Return the mean zero_one metric of `model` over 3 stratified CV folds.

    NOTE(review): in old scikit-learn, `zero_one` counts MISclassifications,
    so LOWER values are better here (consistent with the '# of
    misclassifications' axis label used later in this file). Confirm before
    ranking models by this value with a max.
    Relies on the deprecated `sklearn.cross_validation` / `score_func` APIs.
    """
    return np.mean(cross_val_score(
        model, 
        shirts.data, 
        shirts.target, 
        score_func=zero_one,
        cv=StratifiedKFold(shirts.target, 3),
        n_jobs=-1))
from sklearn.naive_bayes import GaussianNB
# Baseline: Gaussian naive Bayes. Bare expression -- its value is only
# shown in a notebook/REPL; in a script it is discarded.
evaluate_model(GaussianNB())
from sklearn.neighbors import KNeighborsClassifier
# Second candidate: k-nearest neighbours with default parameters.
evaluate_model(KNeighborsClassifier())
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
models = [
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    LogisticRegression(),
    GaussianNB(),
    SVC(),
]
bestScore = 0
bestModel = None
for model in models:
    score = evaluate_model(model)
    if score > bestScore:
        bestScore = score
        bestModel = model
print bestModel
print bestScore
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination: eliminating down to a single feature
# (n_features_to_select=1) forces a complete ranking of all features,
# where rank 1 = most important.
featureSelector = RFE(estimator=LogisticRegression(), n_features_to_select=1, step=1)
featureSelector.fit(shirts.data, shirts.target)
# Pair each rank with its feature name, most important (rank 1) first.
sorted(zip(featureSelector.ranking_, shirts.feature_names))
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import zero_one
# RFECV additionally chooses HOW MANY features to keep, by cross-validating
# every candidate subset size. NOTE(review): `sklearn.cross_validation`,
# `loss_func` and `zero_one` are long-deprecated scikit-learn APIs; modern
# equivalents are `model_selection`, `scoring=` and `zero_one_loss`.
featureSelector = RFECV(
    estimator=LogisticRegression(), 
    step=1, 
    cv=StratifiedKFold(shirts.target, 3),
    loss_func=zero_one)
featureSelector.fit(shirts.data, shirts.target)
# Plot number of features against cross-validation scores
import pylab as pl
pl.figure()
pl.xlabel('# of features selected')
pl.ylabel('# of misclassifications')
# cv_scores_ holds one value per candidate subset size (1..n_features);
# with loss_func=zero_one these are misclassification counts, so the
# minimum of the curve marks the best number of features.
pl.plot(xrange(1, len(featureSelector.cv_scores_) + 1), featureSelector.cv_scores_)
pl.show()
print 'Optimal number of features = %d' % featureSelector.n_features_
# Report the names of the top-ranked (selected) features.
print sorted(zip(featureSelector.ranking_, shirts.feature_names))[:featureSelector.n_features_]