The goal is to build a machine that makes decisions automatically using information it has not seen before, and whose performance improves with experience. The machine learning approach is to develop algorithms that make these decisions using a model fitted on data.
The scikit-learn package is a collection of machine learning algorithms that share a common usage pattern: instantiate an estimator, fit it on training data, then predict on unseen data.
from sklearn import datasets, neighbors
iris = datasets.load_iris()
model = neighbors.KNeighborsClassifier()
# Fit the classifier on the full iris dataset
model.fit(iris.data, iris.target)
# Predict the class of a single new observation (note the 2D input)
model.predict([[7.5, 3, 6.5, 2.1]])
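The same fit/predict pattern extends to scoring on held-out data. Here is a minimal sketch, assuming the sklearn.model_selection API, that keeps a quarter of the iris samples aside and reports the classifier's accuracy on them.
# Hold out 25% of the samples and score the classifier on them
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)
neighbors.KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test)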
# Take a moment to browse the official tutorials and examples
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target
# Hold out the last 100 samples for testing
trainingSet = X[:-100], y[:-100]
testSet = X[-100:], y[-100:]

def evaluate_model(model):
    # Fit on the training set and report accuracy on the test set
    return model.fit(*trainingSet).score(*testSet)
# Gaussian process classification (this can take a while on the digits data)
from sklearn.gaussian_process import GaussianProcessClassifier
evaluate_model(GaussianProcessClassifier())
from sklearn.tree import DecisionTreeClassifier
evaluate_model(DecisionTreeClassifier())
from sklearn.svm import SVC
evaluate_model(SVC(kernel='linear', C=0.001))
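Any estimator with the same fit/score interface can be passed to evaluate_model; for example, a random forest (a sketch, assuming sklearn.ensemble.RandomForestClassifier with mostly default settings is acceptable here).
# Evaluate a random forest with the same helper
from sklearn.ensemble import RandomForestClassifier
evaluate_model(RandomForestClassifier(n_estimators=100, random_state=0))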
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
iris = load_iris()
model = LogisticRegression()
cross_val_score(model, iris.data, iris.target)
cross_val_score(model, iris.data, iris.target, cv=4)
from sklearn.model_selection import LeaveOneOut
cross_val_score(model, iris.data, iris.target, cv=LeaveOneOut())
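cross_val_score returns one score per fold, so a common summary is the mean and standard deviation across folds (a minimal sketch).
# Summarize the per-fold scores with their mean and standard deviation
import numpy as np
scores = cross_val_score(model, iris.data, iris.target, cv=4)
print('%.3f +/- %.3f' % (scores.mean(), scores.std()))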
from IPython.lib.display import YouTubeVideo
YouTubeVideo('1uS5b8aQ6z8')
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_digits

digits = load_digits()
# Chain dimensionality reduction and classification into a single estimator
model = Pipeline([
    ('pca', PCA()),
    ('logistic', LogisticRegression(max_iter=1000)),
])
np.mean(cross_val_score(model, digits.data, digits.target))
from sklearn.preprocessing import StandardScaler
# Standardizing the features first usually helps both PCA and the classifier
model = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('logistic', LogisticRegression(max_iter=1000)),
])
np.mean(cross_val_score(model, digits.data, digits.target))
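The fitted steps of a Pipeline are available through named_steps, which makes intermediate results easy to inspect; for example, the variance retained by the leading PCA components (a sketch, assuming the pipeline is first fit on the digits data).
# Fit the pipeline, then look at the variance captured by the first components
model.fit(digits.data, digits.target)
model.named_steps['pca'].explained_variance_ratio_[:5]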
Let's vectorize a stanza from Zbigniew Herbert's A Knocker.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
documents = [
    'my imagination',
    'is a piece of board',
    'my sole instrument',
    'is a wooden stick',
]
X = vectorizer.fit_transform(documents)
documentVectors = X.toarray()
documentVectors
featureNames = vectorizer.get_feature_names_out()
for bagOfWords in documentVectors:
    # Pair each vocabulary word with its count in this document
    print(list(zip(featureNames, bagOfWords)))
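Raw counts can also be replaced by TF-IDF weights, which down-weight words that appear in many documents; TfidfVectorizer combines counting and weighting in one step (a minimal sketch on the same stanza).
# TF-IDF weighting of the same four lines
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfVectorizer = TfidfVectorizer()
tfidfVectorizer.fit_transform(documents).toarray().round(2)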
# Adapted from
# http://scikit-learn.org/dev/tutorial/statistical_inference/putting_together.html
from sklearn import linear_model, decomposition, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('pca', decomposition.PCA()),
    ('logistic', linear_model.LogisticRegression(max_iter=1000)),
])
# Search over the number of PCA components and the regularization strength
gridSearch = GridSearchCV(pipeline, dict(
    pca__n_components=[20, 40],
    logistic__C=[1, 1000]))
digits = datasets.load_digits()
gridSearch.fit(digits.data, digits.target)
valueByParameter = gridSearch.best_estimator_.get_params()
for parameter in gridSearch.param_grid:
    print('%s: %r' % (parameter, valueByParameter[parameter]))
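GridSearchCV also records the mean cross-validated score of every parameter combination in cv_results_, which is handy for comparing the candidates (a minimal sketch).
# Compare the mean test score of each parameter combination
for params, score in zip(
        gridSearch.cv_results_['params'],
        gridSearch.cv_results_['mean_test_score']):
    print('%r -> %.3f' % (params, score))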
from archiveIO import Archive, TemporaryFolder

archive = Archive('datasets/ZbigniewHerbert.tar.gz')
documents = []
categories = []
with TemporaryFolder() as temporaryFolder:
    for documentPath in archive.load(temporaryFolder):
        text = open(documentPath).read()
        documents.append(text)
        # Label each document by whether it mentions the Carpenter
        categories.append('Carpenter' in text)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
}
from sklearn.model_selection import GridSearchCV
gridSearch = GridSearchCV(pipeline, parameters, n_jobs=-1)
gridSearch.fit(documents, categories)
valueByParameter = gridSearch.best_estimator_.get_params()
for parameter in gridSearch.param_grid:
    print('%s: %r' % (parameter, valueByParameter[parameter]))
print('Best score: %0.3f' % gridSearch.best_score_)
print(documents[27])
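The fitted grid search delegates to its best estimator, so it can classify new text directly; the sentence below is made up purely for illustration, and the predicted label reflects whether the pipeline associates it with the Carpenter documents (a sketch).
# Classify a made-up sentence (hypothetical example text)
gridSearch.predict(['The carpenter smoothed the board with a wooden stick'])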