
GMM in python and EM

source link: https://jozeelin.github.io/2019/12/16/GMM-in-python/

Posted on 2019-12-16 · Updated on 2019-12-16 · Category: Machine Learning

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from sklearn.datasets import make_blobs  # formerly sklearn.datasets.samples_generator, removed in newer scikit-learn
X, y_true = make_blobs(n_samples=400, centers=4,
                       cluster_std=0.60, random_state=0)
X = X[:, ::-1] # flip axes for better plotting

# Plot the data with K Means Labels
from sklearn.cluster import KMeans
kmeans = KMeans(4, random_state=0)
labels = kmeans.fit(X).predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

def plot_kmeans(kmeans, X, n_clusters=4, rseed=0, ax=None):
    labels = kmeans.fit_predict(X)

    # plot the input data
    ax = ax or plt.gca()
    ax.axis('equal')
    ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)

    # plot the representation of the KMeans model
    centers = kmeans.cluster_centers_
    radii = [cdist(X[labels == i], [center]).max()
             for i, center in enumerate(centers)]
    for c, r in zip(centers, radii):
        ax.add_patch(plt.Circle(c, r, fc='#CCCCCC', lw=3, alpha=0.5, zorder=1))

kmeans = KMeans(n_clusters=4, random_state=0)
plot_kmeans(kmeans, X)
rng = np.random.RandomState(13)
X_stretched = np.dot(X, rng.randn(2, 2))

kmeans = KMeans(n_clusters=4, random_state=0)
plot_kmeans(kmeans, X_stretched)

Using a GMM

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');
probs = gmm.predict_proba(X)
print(probs[:5].round(3))
[[0.    0.463 0.    0.537]
 [0.    0.    1.    0.   ]
 [0.    0.    1.    0.   ]
 [0.    0.    0.    1.   ]
 [0.    0.    1.    0.   ]]
size = 50 * probs.max(1) ** 2  # square emphasizes differences
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=size);
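As a quick sanity check (a small sketch, not part of the original post), the hard labels from predict are just the argmax of these per-component probabilities:

# gmm.predict() returns the component with the highest posterior probability,
# i.e. the argmax over the columns of predict_proba()
assert np.array_equal(probs.argmax(axis=1), gmm.predict(X))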

Generating MNIST-style digit data

from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape
(1797, 64)
def plot_digits(data):
    fig, ax = plt.subplots(10, 10, figsize=(8, 8),
                           subplot_kw=dict(xticks=[], yticks=[]))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    for i, axi in enumerate(ax.flat):
        im = axi.imshow(data[i].reshape(8, 8), cmap='binary')
        im.set_clim(0, 16)

plot_digits(digits.data)
from sklearn.decomposition import PCA
pca = PCA(0.99, whiten=True)
data = pca.fit_transform(digits.data)
data.shape
(1797, 41)
n_components = np.arange(50, 210, 10)
models = [GaussianMixture(n, covariance_type='full', random_state=0)
          for n in n_components]
aics = [model.fit(data).aic(data) for model in models]
plt.plot(n_components, aics);
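The usual choice is the component count that minimizes the AIC; a one-line sketch (the exact minimizer depends on the data and random state):

best_n = n_components[np.argmin(aics)]  # component count with the lowest AIC
print(best_n)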
gmm = GaussianMixture(110, covariance_type='full', random_state=0)
gmm.fit(data)
print(gmm.converged_)
True
data_new = gmm.sample(100)  # sample() returns a tuple: (generated samples, component labels)
print(len(data_new))
2
data_new = data_new[0]      # keep only the generated samples
digits_new = pca.inverse_transform(data_new)
plot_digits(digits_new)

Part 2: Implementing a GMM from scratch in Python

  1. Decide how many sources/clusters (c) you want to fit to your data

  2. Initialize the parameters mean $\mu_c$, covariance $\Sigma_c$, and fraction_per_class $\pi_c$ per cluster c

E-step

  1. Calculate for each datapoint $x_i$ the probability $r_{ic}$ that datapoint $x_i$ belongs to cluster c with:

$$r_{ic} = \frac{\pi_c\, N(x_i \mid \mu_c, \Sigma_c)}{\sum_{k=1}^{K}\pi_k\, N(x_i \mid \mu_k, \Sigma_k)} \tag{2-1}$$

where $N(x \mid \mu, \Sigma)$ describes the multivariate Gaussian with:

$$N(x_i \mid \mu_c, \Sigma_c) = \frac{1}{(2\pi)^{n/2}\,|\Sigma_c|^{1/2}}\exp\left(-\frac{1}{2}(x_i-\mu_c)^\top \Sigma_c^{-1}(x_i-\mu_c)\right) \tag{2-2}$$

$r_{ic}$ gives us, for each datapoint $x_i$, the measure of $\frac{\text{probability that } x_i \text{ belongs to class } c}{\text{probability of } x_i \text{ over all classes}}$; hence if $x_i$ is very close to one Gaussian c, it will get a high $r_{ic}$ value for this Gaussian and relatively low values otherwise.
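A minimal NumPy sketch of equation (2-1), assuming K components with parameters pis, mus, and covs (these names are illustrative, not taken from the original code):

from scipy.stats import multivariate_normal
import numpy as np

def e_step(X, pis, mus, covs):
    # unnormalized responsibilities pi_k * N(x_i | mu_k, Sigma_k), one column per component
    dens = np.array([pi * multivariate_normal(mu, cov).pdf(X)
                     for pi, mu, cov in zip(pis, mus, covs)]).T
    # normalize each row so that the responsibilities of x_i sum to 1
    return dens / dens.sum(axis=1, keepdims=True)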

M-step

For each cluster c: calculate the total weight $m_c$ (loosely speaking, the fraction of points allocated to cluster c) and update $\pi_c$, $\mu_c$, and $\Sigma_c$ using $r_{ic}$ with:

$$m_c = \sum_i r_{ic} \tag{2-3}$$

$$\pi_c = \frac{m_c}{m} \tag{2-4}$$

$$\mu_c = \frac{1}{m_c}\sum_i r_{ic}\, x_i \tag{2-5}$$

$$\Sigma_c = \frac{1}{m_c}\sum_i r_{ic}\,(x_i-\mu_c)^\top(x_i-\mu_c) \tag{2-6}$$

Note: the $\mu_c$ used in the last equation should be the $\mu_c$ just updated in (2-5).
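A matching sketch of updates (2-3) to (2-6), where r is the n x K responsibility matrix produced by the E-step (again, the names are chosen here only for illustration):

def m_step(X, r):
    n, K = r.shape
    m_c = r.sum(axis=0)                                     # (2-3) total weight per cluster
    pis = m_c / n                                            # (2-4) mixing fractions
    mus = (r.T @ X) / m_c[:, None]                           # (2-5) weighted means, shape (K, d)
    covs = []
    for c in range(K):
        diff = X - mus[c]                                    # uses the freshly updated mean
        covs.append((r[:, c, None] * diff).T @ diff / m_c[c])  # (2-6) weighted covariance
    return pis, mus, covs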

Iteratively repeat the E and M steps until the log-likelihood of the model converges, where the log-likelihood is computed with:

$$\ln p(X \mid \pi, \mu, \Sigma) = \sum_{i=1}^{N}\ln\left(\sum_{k=1}^{K}\pi_k\, N(x_i \mid \mu_k, \Sigma_k)\right) \tag{2-7}$$
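Equation (2-7) then becomes a short convergence check (same illustrative names as above):

import numpy as np
from scipy.stats import multivariate_normal

def log_likelihood(X, pis, mus, covs):
    # weighted densities, shape (K, n): one row per component
    dens = np.array([pi * multivariate_normal(mu, cov).pdf(X)
                     for pi, mu, cov in zip(pis, mus, covs)])
    # sum over components inside the log, then sum the log over all datapoints
    return np.sum(np.log(dens.sum(axis=0)))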

E-Step

  1. Decide how many sources/clusters(c) you want to fit to your data —> Mind that each cluster c is represented by gaussian g
  2. Initialize the parameters mean $\mu_c$, covariance $\Sigma_c$, and fraction_per_class $\pi_c$ per cluster c
  3. Calculate for each datapoint $x_i$ the probability $r_{ic}$ that datapoint $x_i$ belongs to cluster c with:

$$r_{ic} = \frac{\pi_c\, N(x_i \mid \mu_c, \Sigma_c)}{\sum_{k=1}^{K}\pi_k\, N(x_i \mid \mu_k, \Sigma_k)} \tag{2-1}$$

where $N(x \mid \mu, \Sigma)$ describes the multivariate Gaussian with:

$$N(x_i \mid \mu_c, \Sigma_c) = \frac{1}{(2\pi)^{n/2}\,|\Sigma_c|^{1/2}}\exp\left(-\frac{1}{2}(x_i-\mu_c)^\top \Sigma_c^{-1}(x_i-\mu_c)\right) \tag{2-2}$$

$r_{ic}$ gives us, for each datapoint $x_i$, the measure of $\frac{\text{probability that } x_i \text{ belongs to class } c}{\text{probability of } x_i \text{ over all classes}}$; hence if $x_i$ is very close to one Gaussian c, it will get a high $r_{ic}$ value for this Gaussian and relatively low values otherwise.

from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import numpy as np
from scipy.stats import norm
np.random.seed(0)

X = np.linspace(-5,5,num=20)
X0 = X*np.random.rand(len(X))+10 #create data cluster 1
X1 = X*np.random.rand(len(X))-10 # Create data cluster 2
X2 = X*np.random.rand(len(X)) # Create data cluster 3
X_tot = np.stack((X0,X1,X2)).flatten()

"""
E-Step
"""

"""Create the array r with dimensionality nxK"""
r = np.zeros((len(X_tot),3))
print('Dimensionality','=',np.shape(r))

"""Instantiate the random gaussians"""
gauss_1 = norm(loc=-5,scale=5)
gauss_2 = norm(loc=8,scale=3)
gauss_3 = norm(loc=1.5,scale=1)

"""Instantiate the random pi_c,m_c"""
m_c = np.array([1/3,1/3,1/3]) #we expect to have three clusters
pi_c = m_c/np.sum(m_c)

"""Probability for each datapoint x_i to belong to gaussian g """
for c,g,p in zip(range(3),[gauss_1,gauss_2,gauss_3],pi_c):
    r[:,c] = p*g.pdf(X_tot) # Write the probability that x belongs to gaussian c in column c.

"""Normalize the probabilities such that each row of r sums to 1"""
for i in range(len(r)):
    r[i] = r[i]/np.sum(r,axis=1)[i]

print(r)
print(np.sum(r,axis=1)) # As we can see, as result each row sums up to one, just as we want it.
Dimensionality = (60, 3)
[[2.97644006e-02 9.70235407e-01 1.91912550e-07]
 [3.85713024e-02 9.61426220e-01 2.47747304e-06]
 [2.44002651e-02 9.75599713e-01 2.16252823e-08]
 [1.86909096e-02 9.81309090e-01 8.07574590e-10]
 [1.37640773e-02 9.86235923e-01 9.93606589e-12]
 [1.58674083e-02 9.84132592e-01 8.42447356e-11]
 [1.14191259e-02 9.88580874e-01 4.48947365e-13]
 [1.34349421e-02 9.86565058e-01 6.78305927e-12]
 [1.11995848e-02 9.88800415e-01 3.18533028e-13]
 [8.57645259e-03 9.91423547e-01 1.74498648e-15]
 [7.64696969e-03 9.92353030e-01 1.33051021e-16]
 [7.10275112e-03 9.92897249e-01 2.22285146e-17]
 [6.36154765e-03 9.93638452e-01 1.22221112e-18]
 [4.82376290e-03 9.95176237e-01 1.55549544e-22]
 [7.75866904e-03 9.92241331e-01 1.86665135e-16]
 [7.52759691e-03 9.92472403e-01 9.17205413e-17]
 [8.04550643e-03 9.91954494e-01 4.28205323e-16]
 [3.51864573e-03 9.96481354e-01 9.60903037e-30]
 [3.42631418e-03 9.96573686e-01 1.06921949e-30]
 [3.14390460e-03 9.96856095e-01 3.91217273e-35]
 [1.00000000e+00 2.67245688e-12 1.56443629e-57]
 [1.00000000e+00 4.26082753e-11 9.73970426e-49]
 [9.99999999e-01 1.40098281e-09 3.68939866e-38]
 [1.00000000e+00 2.65579518e-10 4.05324196e-43]
 [9.99999977e-01 2.25030673e-08 3.11711096e-30]
 [9.99999997e-01 2.52018974e-09 1.91287930e-36]
 [9.99999974e-01 2.59528826e-08 7.72534540e-30]
 [9.99999996e-01 4.22823192e-09 5.97494463e-35]
 [9.99999980e-01 1.98158593e-08 1.38414545e-30]
 [9.99999966e-01 3.43722391e-08 4.57504394e-29]
 [9.99999953e-01 4.74290492e-08 3.45975850e-28]
 [9.99999876e-01 1.24093364e-07 1.31878573e-25]
 [9.99999878e-01 1.21709730e-07 1.17161878e-25]
 [9.99999735e-01 2.65048706e-07 1.28402556e-23]
 [9.99999955e-01 4.53370639e-08 2.60841891e-28]
 [9.99999067e-01 9.33220139e-07 2.02379180e-20]
 [9.99998448e-01 1.55216175e-06 3.63693167e-19]
 [9.99997285e-01 2.71542629e-06 8.18923788e-18]
 [9.99955648e-01 4.43516655e-05 1.59283752e-11]
 [9.99987200e-01 1.28004505e-05 3.20565446e-14]
 [9.64689131e-01 9.53405294e-03 2.57768163e-02]
 [9.77001731e-01 7.96383733e-03 1.50344317e-02]
 [9.96373670e-01 2.97775078e-03 6.48579562e-04]
 [3.43634425e-01 2.15201653e-02 6.34845409e-01]
 [9.75390877e-01 8.19866977e-03 1.64104537e-02]
 [9.37822997e-01 1.19363656e-02 5.02406373e-02]
 [4.27396946e-01 2.18816340e-02 5.50721420e-01]
 [3.28570544e-01 2.14190231e-02 6.50010433e-01]
 [3.62198108e-01 2.16303800e-02 6.16171512e-01]
 [2.99837196e-01 2.11991858e-02 6.78963618e-01]
 [2.21768797e-01 2.04809383e-02 7.57750265e-01]
 [1.76497129e-01 2.01127714e-02 8.03390100e-01]
 [8.23252013e-02 2.50758227e-02 8.92598976e-01]
 [2.11943183e-01 2.03894641e-02 7.67667353e-01]
 [1.50351209e-01 2.00499057e-02 8.29598885e-01]
 [1.54779991e-01 2.00449518e-02 8.25175057e-01]
 [7.92109803e-02 5.93118654e-02 8.61477154e-01]
 [9.71905134e-02 2.18698473e-02 8.80939639e-01]
 [7.60625670e-02 4.95831879e-02 8.74354245e-01]
 [8.53513721e-02 2.40396004e-02 8.90609028e-01]]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
"""Plot the data"""
fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
for i in range(len(r)):
    ax0.scatter(X_tot[i],0,c=np.array([r[i][0],r[i][1],r[i][2]]),s=100) # We have defined the first column as red, the second as
                                                                        # green and the third as blue
"""plot the gaussians"""
for g,c in zip([gauss_1.pdf(np.linspace(-15,15)),gauss_2.pdf(np.linspace(-15,15)),gauss_3.pdf(np.linspace(-15,15))],['r','g','b']):
    ax0.plot(np.linspace(-15,15),g,c=c,zorder=0)


plt.show()

M-step

How do we use the information obtained in the E-step?

So why did this help us? We now know, for each point, the probability that it belongs to each Gaussian.
What can we do with this information? With it we can calculate a new mean as well as a new variance (in 1D) or covariance matrix (in higher-dimensional datasets). What will be the result of that? It would shift each Gaussian toward the "real" mean and re-shape it using a variance that is closer to the "real" variance. This procedure is called the Maximization step of the EM algorithm. The Maximization step looks as follows:

For each cluster c: calculate the total weight $m_c$ (loosely speaking, the fraction of points allocated to cluster c) and update $\pi_c$, $\mu_c$, and $\Sigma_c$ using $r_{ic}$ with:

$$m_c = \sum_i r_{ic} \tag{2-3}$$

$$\pi_c = \frac{m_c}{m} \tag{2-4}$$

$$\mu_c = \frac{1}{m_c}\sum_i r_{ic}\, x_i \tag{2-5}$$

$$\Sigma_c = \frac{1}{m_c}\sum_i r_{ic}\,(x_i-\mu_c)^\top(x_i-\mu_c) \tag{2-6}$$

Note: the $\mu_c$ used in the last equation should be the $\mu_c$ just updated in (2-5).

So let's look at our plot if we do the above updates, that is, run the first EM iteration.

"""
M-step
"""

"""calculate m_c"""
m_c = []
for c in range(len(r[0])):
    m = np.sum(r[:,c])
    m_c.append(m) #for each cluster c, calculate the m_c and add it to the list m_c

"""calculate pi_c"""
pi_c = []
for m in m_c:
    pi_c.append(m/np.sum(m_c)) #For each cluster c, calculate the fraction of points pi_c which belongs to cluster c

"""calculate mu_c"""
mu_c = np.sum(X_tot.reshape(len(X_tot),1)*r, axis=0)/m_c

"""calculate var_c"""
var_c = []
for c in range(len(r[0])):
    var_c.append((1/m_c[c])*np.dot(((np.array(r[:,c]).reshape(60,1))*(
        X_tot.reshape(len(X_tot),1)-mu_c[c])).T, (X_tot.reshape(len(X_tot),1)-mu_c[c])))

"""update the gaussians"""
gauss_1 = norm(loc=mu_c[0],scale=var_c[0])
gauss_2 = norm(loc=mu_c[1],scale=var_c[1])
gauss_3 = norm(loc=mu_c[2],scale=var_c[2])
#可视化更新后的高斯分布
fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
for i in range(len(r)):
    ax0.scatter(X_tot[i],0,c=np.array([r[i][0],r[i][1],r[i][2]]),s=100)

"""plot the gaussians"""
for g,c in zip([gauss_1.pdf(np.sort(X_tot).reshape(60,1)),gauss_2.pdf(np.sort(X_tot).reshape(60,1)),
                gauss_3.pdf(np.sort(X_tot).reshape(60,1))],['r','g','b']):
    ax0.plot(np.sort(X_tot),g,c=c)

plt.show()

As you can see, the appearance of our Gaussians changed dramatically after the first EM iteration. Let's update r and illustrate the coloring of the points.

"""update r"""
#mind that we use the new pi_c here
for c,g,p in zip(range(3),[gauss_1,gauss_2,gauss_3],pi_c):
    r[:,c] = p*g.pdf(X_tot)

"""
Normalize the probabilities such that each row of r sums to 1 and
weight it by pi_c == the fraction of points belonging to cluster c
"""

for i in range(len(r)):
    r[i] = r[i]/(np.sum(pi_c)*np.sum(r,axis=1)[i])

"""plot the data"""
fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
for i in range(len(r)):
    ax0.scatter(X_tot[i],0,c=np.array([r[i][0],r[i][1],r[i][2]]),s=100) #we have defined the first column as red, the second as
                                                                        #green and the third as blue
"""plot the gaussians"""
for g,c in zip([gauss_1.pdf(np.sort(X_tot).reshape(60,1)),gauss_2.pdf(np.sort(X_tot).reshape(60,1)),
                gauss_3.pdf(np.sort(X_tot).reshape(60,1))],['r','g','b']):
    ax0.plot(np.sort(X_tot),g,c=c)

plt.show()

Running 10 EM iterations

import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
import numpy as np
from scipy.stats import norm
np.random.seed(0)
X = np.linspace(-5,5,num=20)
X0 = X*np.random.rand(len(X))+15 # Create data cluster 1
X1 = X*np.random.rand(len(X))-15 # Create data cluster 2
X2 = X*np.random.rand(len(X)) # Create data cluster 3
X_tot = np.stack((X0,X1,X2)).flatten() # Combine the clusters to get the random datapoints from above
class GM1D:
    def __init__(self,X,iterations):
        self.iterations = iterations
        self.X = X
        self.mu_c = None
        self.pi_c = None
        self.var_c = None

    def run(self):

        """
        Instantiate the random mu, pi and var
        """
        self.mu_c = [-8,8,5]
        self.pi_c = [1/3,1/3,1/3]
        self.var_c = [5,3,1]

        """
        E-Step
        """

        for iter in range(self.iterations):
            """Create the array r with dimensionality nxK"""
            r = np.zeros((len(X_tot),3))

            """
            Probability for each datapoint x_i to belong to gaussian g
            """
            for c,g,p in zip(range(3),[norm(loc=self.mu_c[0],scale=self.var_c[0]),
                                       norm(loc=self.mu_c[1],scale=self.var_c[1]),
                                       norm(loc=self.mu_c[2],scale=self.var_c[2])],self.pi_c):
                r[:,c] = p*g.pdf(X_tot) # Write the probability that x belongs to gaussian c in column c.
                # Therewith we get a 60x3 array filled with the probability that each x_i belongs to one of the gaussians
            """
            Normalize the probabilities such that each row of r sums to 1 and weight it by pi_c == the fraction of points
            belonging to cluster c
            """
            for i in range(len(r)):
                r[i] = r[i]/(np.sum(self.pi_c)*np.sum(r,axis=1)[i])  # use the model's own pi_c rather than the global one
            """Plot the data"""
            fig = plt.figure(figsize=(5,5))
            ax0 = fig.add_subplot(111)
            for i in range(len(r)):
                ax0.scatter(self.X[i],0,c=np.array([r[i][0],r[i][1],r[i][2]]),s=100)
            """Plot the gaussians"""
            for g,c in zip([norm(loc=self.mu_c[0],scale=self.var_c[0]).pdf(np.linspace(-20,20,num=60)),
                            norm(loc=self.mu_c[1],scale=self.var_c[1]).pdf(np.linspace(-20,20,num=60)),
                            norm(loc=self.mu_c[2],scale=self.var_c[2]).pdf(np.linspace(-20,20,num=60))],['r','g','b']):
                ax0.plot(np.linspace(-20,20,num=60),g,c=c)

            """M-Step"""

            """calculate m_c"""
            m_c = []
            for c in range(len(r[0])):
                m = np.sum(r[:,c])
                m_c.append(m) # For each cluster c, calculate the m_c and add it to the list m_c
            """calculate pi_c"""
            for k in range(len(m_c)):
                self.pi_c[k] = (m_c[k]/np.sum(m_c)) # For each cluster c, calculate the fraction of points pi_c which belongs to cluster c
            """calculate mu_c"""
            self.mu_c = np.sum(self.X.reshape(len(self.X),1)*r,axis=0)/m_c
            """calculate var_c"""
            var_c = []
            for c in range(len(r[0])):
                var_c.append((1/m_c[c])*np.dot(((np.array(r[:,c]).reshape(60,1))*(self.X.reshape(len(self.X),1)-self.mu_c[c])).T,
                                               (self.X.reshape(len(self.X),1)-self.mu_c[c])))
            # note: var_c is computed here but never written back to self.var_c,
            # so the Gaussians keep their initial widths across iterations
            plt.show()
GM1D = GM1D(X_tot,10)
GM1D.run()

Multi-dimensional case

import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
from sklearn.datasets import make_blobs  # formerly sklearn.datasets.samples_generator, removed in newer scikit-learn
import numpy as np
from scipy.stats import multivariate_normal
#0. Create dataset
X,Y = make_blobs(cluster_std=1.5, random_state=20,n_samples=500,centers=3)
#stretch dataset to get ellipsoid data
X = np.dot(X, np.random.RandomState(0).randn(2,2))
class GMM(object):
    def __init__(self, X, number_of_sources, iterations):
        self.iterations = iterations
        self.number_of_sources = number_of_sources
        self.X = X
        self.mu = None
        self.pi = None
        self.cov = None
        self.XY = None

    def run(self):
        '''define a function which runs for iterations, iterations'''
        self.reg_cov = 1e-6*np.identity(len(self.X[0]))
        x,y = np.meshgrid(np.sort(self.X[:,0]),np.sort(self.X[:,1]))
        self.XY = np.array([x.flatten(),y.flatten()]).T

        """
        1. set the initial mu, covariance and pi values
        this is a nxm matrix, since we assume n sources (n Gaussians) where each has m dimensions
        """
        self.mu = np.random.randint(min(self.X[:,0]), max(self.X[:,0]), size=(self.number_of_sources, len(self.X[0])))
        """
        we need a nxmxm covariance matrix for each source since we have m features --> we create symmetric
        covariance matrices with ones on the diagonal
        """
        self.cov = np.zeros((self.number_of_sources, len(X[0]),len(X[0])))

        for dim in range(len(self.cov)):
            np.fill_diagonal(self.cov[dim],5)

        self.pi = np.ones(self.number_of_sources)/self.number_of_sources #Are 'fractions'
        log_likelihoods = [] #In this list we store the log likelihoods per iteration and plot them in the end to check if we have converged

        """plot the initial state"""
        fig = plt.figure(figsize=(10,10))
        ax0 = fig.add_subplot(111)
        ax0.scatter(self.X[:,0],self.X[:,1])
        ax0.set_title('Initial state')
        for m,c in zip(self.mu, self.cov):
            c+=self.reg_cov
            multi_normal = multivariate_normal(mean=m,cov=c)
            ax0.contour(np.sort(self.X[:,0]),np.sort(self.X[:,1]),multi_normal.pdf(self.XY).reshape(len(self.X),len(self.X)),
                        colors='black',alpha=0.3)
            ax0.scatter(m[0],m[1],c='grey',zorder=10,s=100)

        for i in range(self.iterations):
            '''E step'''
            r_ic = np.zeros((len(self.X),len(self.cov)))
            for m,co,p,r in zip(self.mu, self.cov, self.pi, range(len(r_ic[0]))):
                co+=self.reg_cov
                mn = multivariate_normal(mean=m, cov=co)
                r_ic[:,r] = p*mn.pdf(self.X)/np.sum([pi_c*multivariate_normal(mean=mu_c,cov=cov_c).pdf(self.X)
                                                     for pi_c,mu_c,cov_c in zip(self.pi,self.mu,self.cov+self.reg_cov)],axis=0)
            """
            The above calculation of r_ic is not that obvious, which is why I want to quickly derive what we have done.

            First of all the numerator:
            for each source c, which is defined by m, co and p, we calculate the multivariate_normal.pdf() value
            for every instance x_i.

            For each loop this gives us a 500x1 array (this value divided by the denominator is then assigned to
            r_ic[:,r], which is in the end a 500x3 array).

            Second the denominator:

            Here we calculate the multivariate_normal.pdf() for every instance x_i for every source c, which is defined
            by pi_c, mu_c and cov_c, and write this into a list. This gives us a 3x500 array where we have 500 entries
            per source c.

            Now the formula wants us to add up the pdf() values given by the 3 sources for each x_i. Hence we sum up
            this list over axis=0.

            This gives us a list with 500 entries.

            What we have now, for each loop, is a list with 500 entries in the numerator and a list with 500 entries in
            the denominator, where each element is the pdf per class c for each instance x_i (numerator), respectively
            the summed pdfs over the classes c for each instance x_i (denominator).

            Consequently we can divide the numerator by the denominator and get as result a list with 500 elements,
            which we can then assign to r_ic[:,r] --> one column per source c. In the end, after we have done this for
            all three sources (three loops) and run from r==0 to r==2, we get a matrix with dimensionality 500x3, which
            is exactly what we want.

            If we check the entries of r_ic we see that there is mostly one element which is much larger than the other
            two. This is because every instance x_i is much closer to one of the three gaussians (that is, much more
            likely to come from this gaussian) than it is to the other two. Practically speaking, r_ic gives us the
            fraction of the probability that x_i belongs to class c over the probability that x_i belongs to any of the
            classes c (the probability that x_i occurs given the 3 Gaussians).
            """

            """
            M-Step

            Calculate the new mean vector and new covariance matrices, based on the probable membership of the single x_i to
            classes c --> r_ic
            """
            self.mu = []
            self.cov = []
            self.pi = []
            log_likelihood = []
            for c in range(len(r_ic[0])):
                m_c = np.sum(r_ic[:,c],axis=0)
                mu_c = (1/m_c)*np.sum(self.X*r_ic[:,c].reshape(len(self.X),1),axis=0)
                self.mu.append(mu_c)
                #Calculate the covariance matrix per source based on the new mean
                self.cov.append(((1/m_c)*np.dot((np.array(r_ic[:,c]).reshape(len(self.X),1)*(self.X-mu_c)).T,(self.X-mu_c)))+self.reg_cov)

                #Calculate pi_new, which is the 'fraction of points', respectively the fraction of the probability assigned to each source
                """
                Here np.sum(r_ic) gives as result the number of instances. This is logical since we know that the columns of each row of r_ic add up to 1.
                Since we add up all elements, we sum up all columns per row (which gives 1) and then all rows, which gives the number of instances (rows)
                in X --> Since pi_new contains the fractions of datapoints assigned to the sources c, the elements in pi_new must add up to 1
                """
                self.pi.append(m_c/np.sum(r_ic))

            """Log likelihood"""
            # note: this sums the weighted densities over all datapoints and components before taking a single log,
            # so it is a rough convergence indicator rather than the exact log-likelihood of equation (2-7)
            log_likelihoods.append(np.log(np.sum([k*multivariate_normal(self.mu[i],self.cov[j]).pdf(self.X) for k,i,j in
                                                  zip(self.pi,range(len(self.mu)),range(len(self.cov)))])))

        """
        This process of an E step followed by an M step is now iterated a number of n times. In the second iteration, for instance,
        we use the calculated pi_new, mu_new and cov_new to calculate the new r_ic, which are then used in the second M step
        to calculate mu_new2 and cov_new2, and so on....
        """
        fig2 = plt.figure(figsize=(10,10))
        ax1 = fig2.add_subplot(111)
        ax1.set_title('Log-Likelihood')
        ax1.plot(range(0,self.iterations,1),log_likelihoods)
        #plt.show()

    def predict(self,Y):
        '''predict the membership of an unseen, new datapoint'''
        #Plot the point onto the fitted gaussians
        fig3 = plt.figure(figsize=(10,10))
        ax2 = fig3.add_subplot(111)
        ax2.scatter(self.X[:,0],self.X[:,1])
        for m,c in zip(self.mu, self.cov):
            multi_normal = multivariate_normal(mean=m,cov=c)
            ax2.contour(np.sort(self.X[:,0]),np.sort(self.X[:,1]),multi_normal.pdf(self.XY).reshape(len(self.X),len(self.X)),colors='black',alpha=0.3)
            ax2.scatter(m[0],m[1],c='grey',zorder=10,s=100)
        ax2.set_title('Final state')
        for y in Y:
            ax2.scatter(y[0],y[1],c='orange',zorder=10,s=100)

        prediction = []
        for m,c in zip(self.mu, self.cov):
            # note: the mixing weights self.pi are not applied here, so this is an unweighted posterior over the components
            prediction.append(multivariate_normal(mean=m,cov=c).pdf(Y)/np.sum([multivariate_normal(mean=mean,cov=cov).pdf(Y) for mean,cov in zip(self.mu,self.cov)]))

        plt.show()
        return prediction
GMM = GMM(X,3,50)
GMM.run()
GMM.predict([[0.5,0.5]])
[0.8429320897452095, 0.15706464234352321, 3.2679112672282063e-06]
