TP2-M1ProML-Correction

Posted on Tue 30 April 2019 in posts

In [1]:

import numpy as np
import matplotlib.pyplot as plt

Implémentation de KMeans¶

Dans cette première partie, le but est de coder l'algorithme des k-moyennes. Vous coderez donc les deux fonctions principales

Assign(X,centers), cette fonction renvoie un tableau où chaque donnée est assignée au centre le plus proche
Move(X,centers,ass), cette fonction déplace les centres au barycentre des points qui lui ont été assignés.

In [2]:

def Assign(X,centers):
    ass = np.zeros(X.shape[0])
    for i in range(X.shape[0]):
        dist = np.linalg.norm(X[i,:] - centers,axis=1)
        ass[i] =  np.argmin(dist)
    return ass
        
def Move(X,centers,ass):
    for k in range(K):
        data_k = np.where(ass==k)
        if(data_k[0].shape[0] != 0):
            centers[k,:] = np.mean(X[data_k[0],:],axis=0)

On regarde maintenant le jeu de données suivant¶

In [3]:

D=2 # dim des données
M=500 # nb de données
L =105 # taille du cube
Nc = 5 # nb de clusters

centers = np.array([])
for c in range(Nc):
    m = np.random.uniform(2*L,size=(D))-L
    centers = np.append(centers,[m])
    
centers = centers.reshape(Nc,D)
ch = [c for c in range(Nc)]
data_lab = np.random.choice(ch,size=(M))
data = np.zeros((D,M))
for i in range(Nc):
    idx = np.where(data_lab==i)
    data[:,idx[0]] = centers[i,:].reshape(D,1)
    data[:,idx[0]] = centers[i].reshape(D,1) + 3*np.random.normal(size=(D,len(idx[0])))

On fait tourner l'algo des k-moyennes¶

In [4]:

K = 5# nb de centres

c = np.random.uniform(2*L,size=(K,2))-L # initialisation des centres
init_v = np.copy(c)

tmax = 100
for t in range(tmax):
    ass = Assign(data.T,c)
    Move(data.T,c,ass)

In [5]:

ass = Assign(data.T,c)
for k in range(K):
    plt.scatter(data[0,np.where(ass==k)],data[1,np.where(ass==k)])
plt.scatter(init_v[:,0],init_v[:,1],color='lightblue')
plt.scatter(c[:,0],c[:,1],color='black')

Out[5]:

<matplotlib.collections.PathCollection at 0x7f68669080b8>

Application : segmentation d'une image en couleurs¶

In [6]:

# On commence par utiliser skimage
from skimage import data, io, filters

In [9]:

# chargement de l'image
path = "..."
im1 = data.load(path+"im1_small.png")

In [10]:

# Visualisation
getImRGB = im1[:,:,:3]/255.0
plt.imshow(getImRGB)

Out[10]:

<matplotlib.image.AxesImage at 0x7f68628bf400>

In [11]:

# On récupère tous les points dans un tableau de dimension N X 3
PosRGB = getImRGB.reshape(getImRGB.shape[0]*getImRGB.shape[1],3)

In [12]:

K = 10
c = np.random.random((K,3))
Ns = PosRGB.shape[0]

tmax = 10
for t in range(tmax):
    ass = Assign(PosRGB,c)
    Move(PosRGB,c,ass)

ass = Assign(PosRGB,c)

In [13]:

f,ax = plt.subplots(1,K,figsize=(20,5))

for k in range(K):
    idk = np.where(ass==k)[0]
    test_copy = np.zeros(PosRGB.shape)
    test_copy[idk,:] = PosRGB[idk,:]
    ax[k].imshow(test_copy.reshape((getImRGB.shape[0],getImRGB.shape[1],3)))

Mixture de Gaussiennes¶

Regardons maintenant la différence avec des mixtures de Gaussiennes

In [14]:

# Nouvelles données artificielles
nc = 4
d = 2
mean = []
var = []
npts = 200
data = np.zeros((nc*npts,d))
for c in range(nc):
    xm = np.random.random(d)*10-5
    v = np.random.random(d)
    mean.append(xm)
    var.append(v)
    data[npts*c:npts*(c+1),:] = np.random.multivariate_normal(xm,np.diag(v),npts)
    
for c in range(nc):
    plt.scatter(data[npts*c:npts*(c+1),0],data[npts*c:npts*(c+1),1])
plt.show()

Sur ces données, utilisée l'algorithme de mixtures de gaussiennes de la bibliothèque scikit-learn. Faites varier le nombre de clusters et regarder le score obtenu pour chacun. Conclure ?

In [15]:

from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=4, covariance_type="full", max_iter=20)
gmm.fit(data)

plt.scatter(data[:,0],data[:,1])
plt.scatter(gmm.means_[:,0],gmm.means_[:,1])
plt.show()

In [16]:

# look at ellipses !
import matplotlib as mpl
colors = ['navy', 'turquoise', 'darkorange', 'red']

def make_ellipses(gmm, ax, nc):
    for n,col in enumerate(colors[0:nc]):
        if gmm.covariance_type == 'full':
            covariances = gmm.covariances_[n][:2, :2]
        elif gmm.covariance_type == 'tied':
            covariances = gmm.covariances_[:2, :2]
        elif gmm.covariance_type == 'diag':
            covariances = np.diag(gmm.covariances_[n][:2])
        elif gmm.covariance_type == 'spherical':
            covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
        v, w = np.linalg.eigh(covariances)
        u = w[0] / np.linalg.norm(w[0])
        angle = np.arctan2(u[1], u[0])
        angle = 180 * angle / np.pi  # convert to degrees
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
                                  180 + angle, color=col)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)

In [17]:

f = plt.subplot()
make_ellipses(gmm,f,4)
axes = plt.gca()
plt.scatter(data[:,0],data[:,1])
plt.show()

We can also plot the contour of the probability distribution

In [34]:

# display predicted scores by the model as a contour plot
from matplotlib.colors import LogNorm

x = np.linspace(-6., 7.)
y = np.linspace(-5., 5.)
X, Y = np.meshgrid(x, y)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = -gmm.score_samples(XX)
Z = Z.reshape(X.shape)

CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1, vmax=100.0),
                 levels=np.logspace(0, 2, 10))
CB = plt.colorbar(CS, shrink=1.0, extend='both')
plt.scatter(data[:, 0], data[:, 1], .8)

Out[34]:

<matplotlib.collections.PathCollection at 0x7f685a86dcc0>

In [ ]: