TP2-M1ProML-Correction
Posted on Tue 30 April 2019 in posts
In [1]:
import numpy as np
import matplotlib.pyplot as plt
Implémentation de KMeans¶
Dans cette première partie, le but est de coder l'algorithme des k-moyennes. Vous coderez donc les deux fonctions principales :
- Assign(X,centers), cette fonction renvoie un tableau où chaque donnée est assignée au centre le plus proche
- Move(X,centers,ass), cette fonction déplace chaque centre au barycentre des points qui lui ont été assignés.
In [2]:
def Assign(X, centers):
    """Assign every sample to its nearest centre.

    X is indexed as (sample, feature) and centers as (centre, feature).
    Returns a 1-D float array with one entry per row of X holding the index
    of the closest centre in Euclidean distance; ties go to the lowest index.
    """
    labels = np.empty(X.shape[0])
    for row, point in enumerate(X):
        # Distance from this sample to every centre at once.
        labels[row] = np.linalg.norm(point - centers, axis=1).argmin()
    return labels
def Move(X, centers, ass):
    """Move each centre to the barycentre of the samples assigned to it.

    X is indexed as (sample, feature), centers as (centre, feature), and
    ass holds one centre index per row of X (as returned by Assign).
    centers is updated in place; nothing is returned. A centre with no
    assigned sample is left where it is.
    """
    ass = np.asarray(ass)
    # Take the number of centres from the array itself rather than from a
    # global, so the function works for any cluster count.
    for k in range(centers.shape[0]):
        members = X[ass == k]
        if members.shape[0] != 0:
            centers[k, :] = members.mean(axis=0)
On regarde maintenant le jeu de données suivant¶
In [3]:
D = 2    # dimension of the data
M = 500  # number of data points
L = 105  # half-width of the cube in which the cluster centres are drawn
Nc = 5   # number of clusters

# Draw the Nc true cluster centres uniformly in [-L, L)^D.
# (low, high) are given explicitly: passing only 2*L would set low=2*L and
# leave high at its default of 1.0.
centers = np.random.uniform(-L, L, size=(Nc, D))

# Pick a random cluster label for every data point.
data_lab = np.random.choice(Nc, size=M)

# Each point is its cluster centre plus isotropic Gaussian noise (std 3).
# data is stored as (D, M): one column per point.
data = centers[data_lab].T + 3 * np.random.normal(size=(D, M))
On fait tourner l'algo des k-moyennes¶
In [4]:
K = 5  # number of centres
# Initialise the centres uniformly in [-L, L); data is stored as
# (dimension, points), so its first axis gives the dimension.
c = np.random.uniform(-L, L, size=(K, data.shape[0]))
init_v = np.copy(c)  # keep the starting positions for the plot
tmax = 100           # maximum number of k-means iterations
ass = None
for t in range(tmax):
    new_ass = Assign(data.T, c)
    # Once the assignment stops changing the centres can no longer move,
    # so further iterations would be no-ops.
    if ass is not None and np.array_equal(new_ass, ass):
        break
    ass = new_ass
    Move(data.T, c, ass)
In [5]:
# Final assignment, then one scatter call (hence one colour) per cluster.
ass = Assign(data.T, c)
for k in range(K):
    in_cluster = ass == k
    plt.scatter(data[0, in_cluster], data[1, in_cluster])
# Initial centres in light blue, final centres in black.
for positions, shade in ((init_v, 'lightblue'), (c, 'black')):
    plt.scatter(positions[:, 0], positions[:, 1], color=shade)
Out[5]:
Application : segmentation d'une image en couleurs¶
In [6]:
# On commence par utiliser skimage
from skimage import data, io, filters
In [9]:
# Load the image.
# skimage.data.load has been removed from scikit-image; io.imread is the
# supported way to read an image file from an arbitrary path.
path = "..."
im1 = io.imread(path + "im1_small.png")
In [10]:
# Display: keep the first three channels and divide by 255
# (presumably 8-bit channels, giving floats in [0, 1] - confirm for the image).
first_three_channels = im1[:, :, :3]
getImRGB = first_three_channels / 255.0
plt.imshow(getImRGB)
Out[10]:
In [11]:
# Gather all pixels into a single array of shape N x 3 (one row per pixel).
n_rows, n_cols = getImRGB.shape[:2]
PosRGB = getImRGB.reshape(n_rows * n_cols, 3)
In [12]:
K = 10      # number of colour clusters
tmax = 10   # number of k-means iterations
Ns = PosRGB.shape[0]  # number of pixels
# Random initial centres, each coordinate drawn in [0, 1).
c = np.random.random((K, 3))
for t in range(tmax):
    # One k-means step: assign pixels, then move the centres in place.
    Move(PosRGB, c, Assign(PosRGB, c))
# Assignment with respect to the final centres.
ass = Assign(PosRGB, c)
In [13]:
# One panel per cluster: pixels of that cluster keep their colour,
# every other pixel is set to zero (black).
f, ax = plt.subplots(1, K, figsize=(20, 5))
height, width = getImRGB.shape[:2]
for k, panel in enumerate(ax):
    in_cluster = (ass == k)[:, np.newaxis]
    masked = np.where(in_cluster, PosRGB, 0.0)
    panel.imshow(masked.reshape((height, width, 3)))
Mixture de Gaussiennes¶
Regardons maintenant la différence avec des mixtures de Gaussiennes
In [14]:
# New artificial data: nc Gaussian blobs of npts points each in dimension d.
nc, d, npts = 4, 2, 200
mean, var = [], []
data = np.zeros((nc * npts, d))
for c in range(nc):
    xm = 10 * np.random.random(d) - 5   # blob mean, each coordinate in [-5, 5)
    v = np.random.random(d)             # per-axis variances in [0, 1)
    mean.append(xm)
    var.append(v)
    rows = slice(npts * c, npts * (c + 1))
    data[rows, :] = np.random.multivariate_normal(xm, np.diag(v), npts)
# Blobs occupy consecutive, equally sized row ranges: plot them one by one
# so that each gets its own colour.
for blob in np.split(data, nc):
    plt.scatter(blob[:, 0], blob[:, 1])
plt.show()
Sur ces données, utilisez l'algorithme de mélange de gaussiennes de la bibliothèque scikit-learn. Faites varier le nombre de clusters et regardez le score obtenu pour chacun. Que peut-on en conclure ?
In [15]:
from sklearn.mixture import GaussianMixture
# Fit a 4-component, full-covariance Gaussian mixture (at most 20 EM
# iterations) and overlay the fitted means on the data.
gmm = GaussianMixture(n_components=4, covariance_type="full", max_iter=20)
gmm.fit(data)
fitted_means = gmm.means_
for cloud in (data, fitted_means):
    plt.scatter(cloud[:, 0], cloud[:, 1])
plt.show()
In [16]:
# look at ellipses !
import matplotlib as mpl
colors = ['navy', 'turquoise', 'darkorange', 'red']


def make_ellipses(gmm, ax, nc):
    """Draw one covariance ellipse per mixture component on ``ax``.

    Parameters
    ----------
    gmm : fitted mixture model exposing ``means_``, ``covariances_`` and
        ``covariance_type`` ('full', 'tied', 'diag' or 'spherical').
    ax : Matplotlib axes receiving the ellipses.
    nc : number of components to draw; capped at the number of components
        in ``gmm``. Colours are taken from ``colors`` and cycle when there
        are more components than colours.

    Raises
    ------
    ValueError
        If ``gmm.covariance_type`` is not one of the four supported values.

    Only the first two dimensions of the means are used for the ellipse
    centres.
    """
    n_drawn = min(nc, gmm.means_.shape[0])
    for n in range(n_drawn):
        col = colors[n % len(colors)]
        if gmm.covariance_type == 'full':
            covariances = gmm.covariances_[n][:2, :2]
        elif gmm.covariance_type == 'tied':
            covariances = gmm.covariances_[:2, :2]
        elif gmm.covariance_type == 'diag':
            covariances = np.diag(gmm.covariances_[n][:2])
        elif gmm.covariance_type == 'spherical':
            covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
        else:
            raise ValueError(
                "unsupported covariance_type: %r" % (gmm.covariance_type,))
        v, w = np.linalg.eigh(covariances)
        # eigh returns eigenvectors as COLUMNS: w[:, 0] is the direction
        # associated with the eigenvalue v[0], which sets the ellipse width.
        u = w[:, 0] / np.linalg.norm(w[:, 0])
        angle = np.degrees(np.arctan2(u[1], u[0]))
        # Turn the variances into full axis lengths of the ellipse.
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        # `angle` is passed by keyword: recent Matplotlib versions no longer
        # accept it positionally.
        ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
                                  angle=180 + angle, color=col)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)
In [17]:
# Draw the four component ellipses, then the data points on the same axes.
f = plt.subplot()
make_ellipses(gmm, f, 4)
axes = plt.gca()
xs, ys = data[:, 0], data[:, 1]
plt.scatter(xs, ys)
plt.show()
We can also plot the contour of the probability distribution
In [34]:
# Contour plot of the model's negative log-likelihood over a regular grid.
from matplotlib.colors import LogNorm
x = np.linspace(-6., 7.)
y = np.linspace(-5., 5.)
X, Y = np.meshgrid(x, y)
# One grid point per row: (x, y) pairs in the layout score_samples expects.
XX = np.column_stack([X.ravel(), Y.ravel()])
Z = (-gmm.score_samples(XX)).reshape(X.shape)
# Log-spaced contour levels between 1 and 100, with a matching colour scale.
contour_levels = np.logspace(0, 2, 10)
log_scale = LogNorm(vmin=1, vmax=100.0)
CS = plt.contour(X, Y, Z, norm=log_scale, levels=contour_levels)
CB = plt.colorbar(CS, shrink=1.0, extend='both')
# Overlay the data with a small marker size.
plt.scatter(data[:, 0], data[:, 1], .8)
Out[34]:
In [ ]: