In [31]:

import numpy as np
import pylab as pl

from scripts import make_users
# Build two user populations; the keyword names suggest the counts are per
# age bracket (teens, twenties, thirties, forties) -- confirm in scripts.make_users.
users1 = make_users(
    teenCount=250,
    twentyCount=500,
    thirtyCount=150,
    fortyCount=100,
)
users2 = make_users(
    teenCount=500,
    twentyCount=400,
    thirtyCount=90,
    fortyCount=10,
)

## Market segmentation

Trevor runs a news website. He wants to identify the demographics of readers for each article in order to show targeted ads. For each user, he has the following data:

- Age
- Gender
- Device (desktop, laptop, cell phone, smartphone, tablet)
- Location

### Explore dataset

In [32]:

# Look at the characteristics of the first user.
# list(...) materializes the (feature name, value) pairs so the cell displays
# them on Python 3 as well, where zip() returns a lazy iterator.
list(zip(users1.feature_names, users1.data[0]))

### Visualize dataset

In [33]:

# Scatterplot age (column 0) against device (column 2).
# Column meanings follow the feature list in the introduction -- verify
# against users1.feature_names if the dataset layout changes.
pl.scatter(users1.data[:, 0], users1.data[:, 2])
# Label the axes so the figure can be read without the surrounding code.
pl.xlabel('Age')
pl.ylabel('Device');

In [34]:

# Scatterplot locations: column 3 on the x axis, column 4 on the y axis.
pl.scatter(users1.data[:, 3], users1.data[:, 4])
# Hide the tick marks on both axes.
pl.xticks([])
pl.yticks([]);

In [35]:

import pylab as pl

def plot_embedding(X):
    """Scatter-plot the first two columns of X after min-max scaling.

    Each column of X is rescaled independently to the range [0, 1] and the
    first two rescaled columns are drawn with tick marks hidden.

    Parameters
    ----------
    X : array-like with at least two columns
        Points to plot, one row per point.

    Returns
    -------
    None
        The function only draws on the current pylab figure.
    """
    X = np.asarray(X, dtype=float)
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant column has zero span; dividing by it would yield NaN and the
    # points would silently vanish from the plot. Dividing by 1 instead maps
    # every value of such a column to 0.
    span = np.where(span == 0, 1.0, span)
    X = (X - x_min) / span
    pl.scatter(X[:, 0], X[:, 1])
    pl.xticks([])
    pl.yticks([])
    
from sklearn import manifold
# Shared two-component locally linear embedding estimator; the plotting cells
# call fit_transform on it.
visualization = manifold.LocallyLinearEmbedding(
    n_components=2,
    n_neighbors=30,
    method='standard',
)

In [36]:

# Visualize data: embed the unscaled features, then plot the embedding.
users1_embedding = visualization.fit_transform(users1.data)
plot_embedding(users1_embedding)

In [37]:

# Visualize scaled data: pass the features through scale() before embedding,
# then plot the result the same way as the unscaled data.
from sklearn.preprocessing import scale
users1_scaled_embedding = visualization.fit_transform(scale(users1.data))
plot_embedding(users1_scaled_embedding)

### Cluster users

In [38]:

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Fit the scaler once and keep it: the same fitted transform must be applied
# to any user passed to cluster.predict later on.
scaler = StandardScaler().fit(users1.data)
users1DataScaled = scaler.transform(users1.data)

# KMeans starts from random centroids; fixing random_state makes the cluster
# numbering reproducible across kernel restarts.
cluster = KMeans(init='k-means++', n_clusters=4, n_init=10, random_state=0)
cluster.fit(users1DataScaled)

In [39]:

# Look at the characteristics of the first user.
# list(...) materializes the (feature name, value) pairs so the cell displays
# them on Python 3 as well, where zip() returns a lazy iterator.
list(zip(users1.feature_names, users1.data[0]))

In [40]:

# Get the category of the first user.
# The slice [:1] keeps the sample as a single-row 2-D array; current
# scikit-learn rejects a bare 1-D sample in transform() and predict().
cluster.predict(scaler.transform(users1.data[:1]))

In [41]:

# Get the category of a similar user.
# The nested list builds a single-row 2-D array (one sample, five features);
# current scikit-learn rejects a bare 1-D sample in transform() and predict().
cluster.predict(scaler.transform(np.array([[10., 0., 4., 5., -10.]])))

Pay Notebook Creator: Roy Hyunjin Han	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0

Introduction to Computational Analysis

Market segmentation¶

Explore dataset¶

Visualize dataset¶

Cluster users¶

Visualize dataset ¶

Cluster users ¶