import numpy as np
import pylab as pl

from scripts import make_users
# Two user populations built with different per-group counts.
# NOTE(review): the keyword names suggest age brackets (teens, twenties,
# thirties, forties) and each call sums to 1000 users -- confirm against
# make_users.
users1 = make_users(
    teenCount=250,
    twentyCount=500,
    thirtyCount=150,
    fortyCount=100,
)
users2 = make_users(
    teenCount=500,
    twentyCount=400,
    thirtyCount=90,
    fortyCount=10,
)
# Trevor runs a news website. To show targeted ads, he wants to identify the
# demographics of the readers of each article. For each user, he has the
# following data:
# Look at the characteristics of the first user: pair every feature name with
# that user's value. list() materializes the pairs, because on Python 3 zip()
# is a lazy iterator and would otherwise display as an opaque zip object.
list(zip(users1.feature_names, users1.data[0]))
# Scatterplot age and device (feature columns 0 and 2)
pl.scatter(users1.data[:, 0], users1.data[:, 2]);
# Scatterplot locations (feature columns 3 and 4) with the axis ticks hidden
pl.scatter(users1.data[:, 3], users1.data[:, 4]);
pl.xticks([]);
pl.yticks([]);
import pylab as pl
def plot_embedding(X):
    """Scatter-plot the first two columns of X after min-max scaling.

    Every column of X is rescaled independently to the [0, 1] range, then
    column 0 is plotted against column 1 with pylab and the axis ticks are
    removed.

    Args:
        X: 2-D array-like with at least two columns, e.g. the result of an
            embedding's ``fit_transform``.
    """
    X = np.asarray(X, dtype=float)
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant column has zero span; dividing by it would turn the whole
    # column into NaN and silently drop every point from the plot. Use a
    # divisor of 1 there so the column simply maps to 0.
    span = np.where(span == 0, 1.0, span)
    X = (X - x_min) / span
    pl.scatter(X[:, 0], X[:, 1])
    pl.xticks([])
    pl.yticks([])
from sklearn import manifold
from sklearn.preprocessing import scale

# Locally linear embedding down to two components, so the users can be
# drawn on a plane by plot_embedding.
visualization = manifold.LocallyLinearEmbedding(
    n_neighbors=30,
    n_components=2,
    method='standard',
)

# Visualize data: embed the feature matrix as-is
raw_embedding = visualization.fit_transform(users1.data)
plot_embedding(raw_embedding)

# Visualize scaled data: pass the features through sklearn's scale() first
scaled_embedding = visualization.fit_transform(scale(users1.data))
plot_embedding(scaled_embedding)
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Fit the scaler once on users1 and keep it around, so that any user passed
# to predict() later is transformed with the same fitted parameters as the
# data the clusters were learned from.
scaler = StandardScaler().fit(users1.data)
users1DataScaled = scaler.transform(users1.data)

# Group the scaled users into four clusters.
cluster = KMeans(init='k-means++', n_clusters=4, n_init=10)
cluster.fit(users1DataScaled)

# Look at the characteristics of the first user. list() materializes the
# (name, value) pairs; a bare zip() is a lazy iterator on Python 3.
list(zip(users1.feature_names, users1.data[0]))

# Get the category of the first user. transform()/predict() expect a 2-D
# (n_samples, n_features) array, so slice out a one-row matrix instead of
# indexing a 1-D row, which current scikit-learn rejects.
cluster.predict(scaler.transform(users1.data[0:1]))

# Get the category of a similar user, again supplied as a single-row 2-D array
cluster.predict(scaler.transform(np.array([[10., 0., 4., 5., -10.]])))