Section 06: mixed feelings

Verena Volf 10/19/2018, adapted from Joseph Sedlak's notes from the previous year.

You can also download this page in Jupyter notebook format.

In [57]:
import numpy as np
import math
import pandas as pd
import scipy.spatial.distance as dis
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy.random as rand
import scipy.stats as stats
import scipy.misc as misc
from collections import Counter

Overview of k-means clustering algorithm

The k-means clustering algorithm aims to partition observations into $k$ clusters based on feature similarity, in a way that minimizes the total distance between the points and the centroid of their assigned cluster. The number of clusters $k$ has to be specified in advance.

After initialization, in which the initial centroids are created, the algorithm repeats two steps iteratively until all centroids converge to fixed points.

1.) Assignment step:

$k$ clusters are created by assigning each data point $x_{i}$ to the nearest centroid $\mu_{k}$, here based on their Euclidean distance $\sqrt{(x_{i}-\mu_{k})^{2}}$.

2.) Update step:

The centroid of each cluster is updated by calculating the mean of all points assigned to that cluster:

$\mu_{k} = \frac{\Sigma_{i \in C_{k}}x_{i}}{|C_{k}|}$
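To make the two steps concrete, here is a compact NumPy sketch of a single iteration on a few made-up 1D points (`x` and `mu` below are illustrative placeholders; the full looped implementation used in the rest of this section follows).

In [ ]:
# A minimal sketch of one k-means iteration on 1D data (illustrative values only)
x = np.array([1.0, 1.5, 8.0, 8.5])   # data points
mu = np.array([2.0, 9.0])            # current centers

# Assignment step: each point is assigned to its nearest center
labels = np.argmin(np.abs(x[:, None] - mu[None, :]), axis=1)

# Update step: each center becomes the mean of the points assigned to it
mu_new = np.array([x[labels == k].mean() for k in range(len(mu))])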

In [88]:
coords = []
means = [3,7,10,15]
# generate 4 * 40 data points, 40 around each of the 4 means
for x in means:
    coords.extend(list(np.random.normal(x,1,40)))
In [89]:
# Quick dot plot to visualize the points
# Give the data points a random y value just for aesthetics
f = plt.figure(figsize = (20,1))
y = rand.rand(len(coords))
plt.scatter(coords,y,marker = 'o')
plt.xlim(0,20)
plt.ylim(-0.5,1.5)
plt.show()
In [98]:
def assignment(coords,centers):  
    
    '''
    Return current cluster assigned to each point, as well as the 
    sum of the distances between each point and its assigned cluster center
    
    Parameters
    ----------
    coords   : list or 1D ndarray
               list X-coordinates of all points
    centers  : list or 1D array
               list of current estimates for cluster centers
       
    Returns
    -------
    clusters : list or 1D array
               list of current cluster assignment for each point
    distance : float
               sum of distances between each point and its assigned cluster center
    '''
    
    # Find the distance between each point and its nearest center
    distances = [np.min([math.sqrt((point-i)**2) for i in centers]) for point in coords]
    
    # Assign each point to the closest center
    clusters =  [np.argmin([math.sqrt((point-i)**2) for i in centers]) for point in coords]
    
    # Count how many points in each cluster
    c = Counter(clusters)
    
    # Look for empty clusters
    for x in set(range(len(centers))).difference(set(c.keys())):
        clusters,distances =  assign_empty(x,clusters,distances)
    distance = sum(distances)
    
    return clusters, distance


def assign_empty(c,clusters,distances):
    
    '''
    Assigns the point furthest from its current cluster center to an empty cluster
    Returns an updated cluster list and an updated distances list
    
    Parameters
    ----------
    c        : int
               index of the empty cluster
    clusters : list or 1D array
               list of current cluster assignment for each point
    distances: list or 1D array
               list of current distances between each point and its assigned center
       
    Returns
    -------
    clusters : list or 1D array
               list of current cluster assignment for each point
    distances: list or 1D array
               list of current distances between each point and its assigned center
    '''
    
    # find index of point furthest from its assigned center
    max_idx = np.argmax(distances)
    
    # set distances at that index to 0
    distances[max_idx] = 0
    
    # set clusters at that index to empty cluster
    clusters[max_idx] = c
    
    return clusters,distances


def update(coords, clusters, num_clust):
    
    '''
    Calculates the new cluster centers as an average of the positions of all 
    points included in the cluster.
    
    Parameters
    ----------
    coords   : list or 1D ndarray
               list X-coordinates of all points   
    clusters : list or 1D array
               list of current cluster assignment for each point
    num_clust: int
               number of clusters
       
    Returns
    -------
    centers  : list or 1D array
               list of current estimated center for each cluster
    '''
    coords = np.asarray(coords)
    
    # Find indices of each point assigned to each cluster
    clusters = ([np.where(np.asarray(clusters) == x) for x in range(num_clust)])
    
    # Average position of each point assigned to each cluster
    centers = [np.mean(coords[clusters[x]],0) for x in range(len(clusters))]    
    
    return centers


def plot_centers(centers,c,lw):
    
    '''
    Plots the current estimates for cluster centers as vertical bars
    
    Parameters
    ----------
    centers  : list or 1D ndarray
               list X-coordinates of all centers   
    c        : str
               color to plot centers in
    lw       : int
               weight of line with which to plot centers
       
    Returns
    -------
    None
    '''    
    for center in centers:
        plt.axvline(x = center,color = c,linewidth = lw)
In [129]:
# Initialize figure, plot raw data
plt.close('all')
f = plt.figure(figsize = (20,1))
y = rand.rand(len(coords))
plt.scatter(coords,y,marker = 'o')
plt.xlim(0,20)
plt.ylim(-0.5,1.5)

# Define number of iterations (iters) and number of clusters (num_means)
#iters=1
iters = 20
num_clust = 4

# Initialize the best clusters found so far
min_dist = float('inf')
clusters_best = []

# Loop on iters
for i in range(iters):
    
    # Each iteration starts with a random set of centers
    #centers = [5, 10, 15, 20] #evenly distributed center
    centers = np.random.choice(np.linspace(min(coords),max(coords),1000),num_clust)
    plot_centers(centers,'g',2) 
    
    # Start with placeholder assignments so the first convergence check always fails
    clusters_old = [None]*len(coords)
    clusters = [0]* len(coords)
   
    
    # Iterate until clusters don't change
    while clusters != clusters_old:
        
        # Update clusters
        clusters_old = clusters
        
        # New Assignment 
        clusters,distance = assignment(coords,centers)        
        
        # New centers
        centers = update(coords, clusters, num_clust)
        
    # Plot best centers    
    plot_centers(centers,'r',2) 
    
    # Save clusters and centers if this run has the minimal summed distance to date
    if distance < min_dist:            
        min_dist = distance
        centers_best = centers
        clusters_best = clusters 
        

plt.show()

Avoiding local minima

k-means clustering is prone to local minima, so it's important to consider your choice of initial conditions. One way to address this problem is to run the algorithm multiple times with different initial centroids and select the best solution out of those runs, which is exactly what the loop above does. Initial centroids can be generated by choosing random coordinates within the range of interest, by selecting the coordinates of existing data points, or in other ways, as sketched below.
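For example, here is a minimal sketch of the second option, drawing the initial centers directly from the observed data points (`coords` and `num_clust` as defined above; this is just one of several reasonable initialization schemes):

In [ ]:
# A minimal sketch of initializing centers by sampling existing data points
# (an alternative to sampling random coordinates within the range of interest)
centers_init = np.random.choice(coords, size=num_clust, replace=False)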

In [130]:
# Initialize new figure
plt.close('all')
f = plt.figure(figsize = (20,1))
y = rand.rand(len(coords))
plt.scatter(coords,y,marker = 'o',c = clusters_best) # Plot raw data, colored by cluster

plt.xlim(0,20)
plt.ylim(-0.5,1.5)

# Plot best centroids (means)
plot_centers(centers_best,'k',2)

plt.show()

print(means)
print(np.round(sorted(centers_best),1))
clusters_final,distance_final = assignment(coords,centers_best) 
print(np.round(distance_final))
[3, 7, 10, 15]
[ 2.9  6.9  9.9 14.8]
117.0

Soft K-Means

In Hard K-Means, we assigned each point only to its closest center and then updated each center with the average position of only the points assigned to it. Soft K-Means uses essentially the same strategy, but now the assignment and update steps include every point for each center, weighted by distance and an extra parameter, $\beta$. With $\beta$ we can control the fuzziness of our clusters, i.e. how strongly we weight points distant from a center relative to closer points. (Soft K-Means is also sometimes called Fuzzy K-Means.) Again, in each iteration two things happen:

1) Assignment step:

Measure the distance from each point ($x_{i}$) to each center ($\mu_{k}$) and assign a responsibility ($r_{ik}$) of that center for the point. We define distance here as $\sqrt{(x_{i}-\mu_{k})^{2}}$ and responsibility as $r_{ik} = \frac{e^{-\beta(x_{i}-\mu_{k})^{2}}}{\Sigma_{k'}e^{-\beta(x_{i}-\mu_{k'})^{2}}}$.

2) Update step:

Update the centers of the clusters based on the responsibility weighted mean of all the coordinates defined as: $\frac{\Sigma_{i} r_{ik} x_{i} }{\Sigma_{i} r_{ik}}$.

Higher $\beta$s are stiffer, since they cause the responsibilities of centers close to a point to be much higher than those of centers further away, up to the extreme case where $\beta = \infty$ and all the responsibility for a point comes from its closest center. In the opposite extreme, when $\beta = 0$, each point gets equal responsibility from every center. The plots below were originally made with $\beta = 0.1$; here it has been changed to $\beta = 1$. Notice how the demarcation between neighbouring clusters becomes much clearer.
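As a quick numerical illustration of this stiffness (using the responsibility formula above with made-up values), consider a point at 4 with centers at 3 and 7:

In [ ]:
# Illustration of how beta controls stiffness; values in the comment are approximate
point = 4.0
centers_demo = np.array([3.0, 7.0])
for beta_demo in (0.1, 1.0):
    num = np.exp(-beta_demo * (point - centers_demo)**2)
    print(beta_demo, num / num.sum())   # beta=0.1 -> ~[0.69, 0.31]; beta=1 -> ~[0.9997, 0.0003]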

In [131]:
def soft_assignment(coords,centers,beta):  
    
    '''
    Return the responsibilities of each center for each point
    
    Parameters
    ----------
    coords   : list or 1D ndarray
               list X-coordinates of all points
    centers  : list or 1D array
               list of current estimates for cluster centers
    beta     : float
               parameter that adjusts the "stiffness" of the responsibility.
                      
    Returns
    -------
    resps    : 2D array
               Responsibilities of each point from each center
    '''
    
    # Calculate the responsibilities for each point
    resps = [responsibilities(point,centers,beta) for point in coords]
    
    return resps

def responsibilities(point, means, beta):
    '''
    Return the responsibilities of each center for one point
    
    Parameters
    ----------
    point    : float
               X-coordinate of current point
    means    : list or 1D array
               list of current estimates for cluster centers
    beta     : float
               parameter that adjusts the "stiffness" of the responsibility.
                      
    Returns
    -------
    resps    : list or 1D array
               Responsibilities of current point from each center
    '''
    
    # Calculate the responsibilities from each center to current point
    num = np.exp(np.multiply(-beta,[(point-i)**2 for i in means]))
    # Normalize by the sum of the responsibilities of the point from all centers
    resps = np.divide(num,sum(num))
    
    return resps
    
def soft_update(coords, resps,num_clust,beta):
    
    '''
    Calculates the new cluster centers as a responsibility-weighted average of the 
    positions of all points.
    
    Parameters
    ----------
    coords   : list or 1D array
               list X-coordinates of all points   
    resps    : list or 1D array
               Responsibilities of current point from each center
    num_clust: int
               number of clusters
    beta     : float
               parameter that adjusts the "stiffness" of the responsibility.
               
       
    Returns
    -------
    centers  : list or 1D array
               list of current estimated center for each cluster
    clusters : list or 1D array
               list of current cluster assignment for each point
    distance : float
               sum of the current distances between each point and its assigned center
   
    '''
    
    # Find responsibility weighted mean of point positions for each center
    num = ([sum(np.multiply(coords,[r[i] for r in resps])) for i in range(num_clust)])
    denom = [sum([r[i] for r in resps]) for i in range(num_clust)]
    centers = np.divide(num,denom)
    
    # Assign each point to the cluster whose center takes the most responsibility for it
    clusters = [np.argmax(responsibilities(point,centers,beta)) for point in coords]
    
    # calculate the distance between each point and its assigned center
    distance = sum([np.min([math.sqrt((point-i)**2) for i in centers]) for point in coords])
      
    return centers,clusters,distance
In [132]:
# Initialize figure, plot raw data
plt.close('all')
f = plt.figure(figsize = (20,1))
y = rand.rand(len(coords))
plt.scatter(coords,y,marker = 'o')
plt.xlim(0,20)
plt.ylim(-0.5,1.5)

# Define number of iterations (iters), number of clusters (num_clust), and beta (beta)
iters = 5
num_clust = 4
beta = 1

# define a best distance to center (min_dist)

min_dist = float('inf')
clusters_best = []


# Loop on iters
for _ in range(iters):
    
    # Each iteration starts with a random set of centers
    centers = np.random.choice(np.linspace(min(coords),max(coords),1000),num_clust)
    plot_centers(centers,'g',2) 
    
    # Initialize the convergence criterion so the loop runs at least once
    center_diff = float('inf')    
    centers_old = [0]*num_clust
    
    
    # Loop until centers stop moving significantly    
    while center_diff > 0.1:        
        
        # Assign responsibilities
        resps = soft_assignment(coords,centers,beta) 
        # Update centers
        centers,clusters,distance = soft_update(coords,resps,num_clust,beta)
        
        # Find distance new centers are from previous centers
        center_diff = dis.euclidean(centers,centers_old)        
        centers_old = centers
        
        
    # plot converged centers                                
    plot_centers(centers,'r',2)
    
    # Save "best" centers and clusters
    
    if distance < min_dist:
        
        min_dist = distance
        centers_best = centers
        clusters_best = clusters 
        
plt.show()
In [134]:
plt.close('all')
f = plt.figure(figsize = (20,1))
y = rand.rand(len(coords))
plt.scatter(coords,y, marker = 'o', c = clusters_best,edgecolor = 'None')
plt.xlim(0,20)
plt.ylim(-0.5,1.5)
plot_centers(centers_best,'k',2)
plt.show()

print(means)
print(np.round(sorted(centers_best),1))
[3, 7, 10, 15]
[ 2.8  6.9  9.9 14.8]

Fitting mixture models with expectation maximization

First we want to define our mixture model, before fitting it iteratively with expectation maximization. A mixture model is a generative probability model composed of other probability distributions: it describes the underlying combination of probability distributions that our data were generated from. They don't all have to be the same distribution; they don't even have to be the same type of distribution (you can mix distributions any which way you want). In this case, we are going to assume all our points were generated from one of $Q$ different Gaussian distributions with parameters $\mu$ and $\sigma$. Since we are working with multiple Gaussian distributions, we need to add one more parameter, $\pi$, the mixing coefficient, which is our best guess as to what fraction of the data was generated by each Gaussian.
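To illustrate what "generative" means here, this is a minimal sketch of sampling from a hypothetical two-component Gaussian mixture (the means, sigma, and mixing coefficients below are made up for the example; the data used in this section were generated earlier with equal weights):

In [ ]:
# A minimal sketch of sampling from a hypothetical two-component Gaussian mixture
pi_demo = [0.3, 0.7]      # mixing coefficients, must sum to 1
mu_demo = [3.0, 10.0]     # component means
sigma_demo = 1.0          # shared standard deviation

# For each draw, first pick a component according to pi, then sample from that Gaussian
components = np.random.choice(len(pi_demo), size=200, p=pi_demo)
samples = np.random.normal([mu_demo[q] for q in components], sigma_demo)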

We aim to optimize the parameters $\mu$ and $\pi$, and can choose arbitrary initial values for them. Similar to the p-set (in which we use a negative binomial distribution), our choice of $\sigma$ will stay the same through every iteration.

Sigma behaves much like $\beta$ in Soft K-Means (in the code below they are related by $\sigma = \sqrt{1/(2\beta)}$). Accordingly, low sigmas will make your models "stiffer", with lower probability assigned to points further from a given $\mu$.

There are two steps:

1.) Expectation step:

For each point $x_{i}$, we calculate a posterior probability for each component $q$, given our current model with estimates of $\mu$ and $\pi$ and our fixed $\sigma$: $P(q \mid x_{i},\pi,\mu,\sigma)$.

2.) Maximization step:

Instead of simply updating the centroids, we estimate new maximum likelihood parameters of our mixture model, $\mu_{q}$ and $\pi_{q}$, based on the posterior probabilities calculated in the previous step.

We can follow the total log likelihood $\log P(X \mid \theta)$ through our iterations and stop once the increase falls below a threshold value.
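For Gaussian components with a shared, fixed $\sigma$, this quantity is $\log P(X \mid \theta) = \Sigma_{i} \log \Sigma_{q} \pi_{q} P(x_{i} \mid q,\mu_{q},\sigma)$, i.e. for each point the component densities are summed, weighted by the mixing coefficients, before taking the log.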

The posterior probability that we want to solve is $P(q \mid X,\mu,\sigma,\pi)$. The relative proportions of the components $\pi$ are unknown. The posterior probability we are trying to solve during the Expectation step in this problem set is ultimately derived from the marginalization of hidden variables in a joint probability. In this case: $P(q \mid X,\mu,\sigma,\pi) = \frac{P(X,q \mid \mu,\sigma,\pi)}{\Sigma_{q}P(X,q \mid \mu,\sigma,\pi)}$. Conveniently, this joint probability also factors, since the selection of a component from the group of components is independent of the selection of a data point from the component. Therefore, $P(X,q \mid \mu,\sigma,\pi) = P(X \mid q,\mu,\sigma)P(q \mid \pi)$. Again, these components are easily solvable on their own. The first is simply the likelihood of the data given a particular component. In this example, the components are all Gaussian, so $P(X \mid q,\mu,\sigma) = \frac{1}{\sqrt{2\pi\sigma^{2}}}e^{-\frac{(X-\mu)^{2}}{2\sigma^{2}}}$, i.e. the Gaussian PDF evaluated at $X$ with the $\sigma$ and $\mu$ of the given component $q$ (the $\pi$ in that equation is the number $\pi$, not the mixing coefficient $\pi$). The $P(q \mid \pi)$ is even easier: it's just $\pi_{q}$ (the mixing coefficient, not the number $\pi$). Together, the equation for the posterior probability, generalized for any form of $q$, becomes $P(q \mid X,\mu,\sigma,\pi) = \frac{\pi_{q} P(X \mid q,\mu,\sigma)}{\Sigma_{q}\pi_{q} P(X \mid q,\mu,\sigma)}$.
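Here is a minimal numerical sketch of that posterior formula for a single point and two hypothetical components (the values of $\mu$, $\sigma$, and $\pi$ below are made up; the actual implementation used for the data in this section follows):

In [ ]:
# A minimal sketch of the posterior calculation for one point and two made-up components
x_demo = 6.0
mus_demo = np.array([3.0, 7.0])
sigma_demo = 1.0
pis_demo = np.array([0.4, 0.6])

# Joint: P(x, q | mu, sigma, pi) = pi_q * P(x | q, mu_q, sigma), for each component q
joint = pis_demo * stats.norm.pdf(x_demo, mus_demo, sigma_demo)

# Posterior: normalize the joint over components
posterior = joint / joint.sum()

# This point's contribution to the total log likelihood log P(X | theta) is np.log(joint.sum())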

In [30]:
def expectation(coords, mus, sigma, mix_coeffs):
    '''
    Return the posterior probability of each data point coming from each component
    
    Parameters
    ----------
    coords    : list or 1D array, list X-coordinates of all points
    mus       : list or 1D array, list of current estimates for component means
    sigma     : float, shared standard deviation of the components (held fixed)
    mix_coeffs: list or 1D array, list of current estimates for mixture coefficients of components
                      
    Returns
    -------
    resps    : 2D array, Posterior probabilities of each component being the
               source of each point.
    clusters : list or 1D array, list of current cluster assignment for each point 
    '''
    
    # Initialize P(q|x)
    Pqx = []
    
    # Loop through points
    for point in coords: 
        # Append pi * P(x|q)
        Pqx.append([mix_coeffs[i]*stats.norm.pdf(point,mu,sigma) for i,mu in enumerate(mus)])
    
    # Normalize
    posts = [np.divide(x,sum(x)) for x in Pqx]
    
    # Assign clusters based on max posterior probability
    clusters = [np.argmax(post) for post in posts]
    
    return posts,clusters

def maximization(coords, posts, mix_coeffs):
    '''
    Return the updated mu and mixture coefficient for each component
    
    Parameters
    ----------
    coords    : list or 1D array, list X-coordinates of all points
    posts     : 2D array, posterior probabilities of each point coming from each component
    mix_coeffs: list or 1D array, list of current estimates for mixture coefficients of components
                             
    Returns
    -------
    mus       : list or 1D array, current estimate of component means
    mix_coeffs: list or 1D array, list of current estimates for mixture coefficients of components 
    '''
    
    # calculate the new mean as the posterior weighted average of point positions
    num_mu = np.sum([np.multiply(posts[i],x) for i,x in enumerate(coords)],0)    
    denom_mu = np.sum(posts,0)
    mus = np.divide(num_mu,denom_mu)
    
    # calculate the new mixture coefficients as the mean of the posteriors
    mix_coeffs = np.divide(np.sum(posts,0),len(coords))    
    
    return mus,mix_coeffs

def negll(coords, mus, sigma, mix_coeffs):
    '''
    Return the negative log likelihood of data given the current mixture model
    
    Parameters
    ----------
    coords    : list or 1D array, list X-coordinates of all points
    mus       : list or 1D array, list of current estimates for component means
    sigma     : float, shared standard deviation of the components (held fixed)
    mix_coeffs: list or 1D array, list of current estimates for mixture coefficients of components
                      
    Returns
    -------
    nll       : float, negative log likelihood of data given model
    '''

    # For every point and every component, add the log of the mixture coefficient to the
    # normal logpdf of the point given mu and sigma, then sum over all points and components
    ll = np.sum([[np.log(mix_coeffs[i]) + stats.norm.logpdf(point,mu,sigma) for i,mu in enumerate(mus)] for point in coords])
    
    return -ll
In [36]:
# Initialize figure, plot raw data
plt.close('all')
f = plt.figure(figsize = (20,1))
y = rand.rand(len(coords))
plt.scatter(coords,y,marker = 'o')
plt.xlim(0,20)
plt.ylim(-0.5,1.5)

# Define number of iterations (iters), number of clusters (num_clust), and beta (beta)
iters = 10
num_clust = 4
beta = 0.5

# define a threshold for nll change that will stop iteration (nll_thresh)
nll_thresh = .001

# Sigma calculated from beta
sigma = math.sqrt (1/(2*beta))

# Define best mus (mus_best), best distance to mu (min_dist), best mixture coefficients (mix_coeffs_best),
# and initialize the starting best nll (nll_best)
min_dist = float('inf')
mus_best = []
nll_best = float('inf')
mix_coeffs_best = []

for _ in range(iters):
    
    # Each iteration starts with a random set of mus (mus) with random mixture coeffcients (mix_coeffs)
    mix_coeffs = rand.rand(num_clust)
    mix_coeffs = np.divide(mix_coeffs,sum(mix_coeffs))
    mus = np.random.choice(np.linspace(min(coords),max(coords),1000),num_clust)
    plot_centers(mus,'g',2) 
    
    # We will iterate until the nll stops changing much
    # Here we initialize holder variables for the last nll and the difference between the current and last nll
    nll_diff = float('inf')
    nll_old = float(0)    
    
    # Iterate while the difference between consecutive nlls is above a threshold
    while nll_diff > (nll_thresh* nll_old):
        
        # Calculate posterior probabilities and assign points to clusters
        posts,clusters = expectation(coords, mus, sigma, mix_coeffs)
        
        # Calculate new mus and mixture coefficients given current posterior probabilities
        mus, mix_coeffs = maximization(coords, posts, mix_coeffs)    
        
        # Calculate the nll of the current mixture model
        nll = negll(coords, mus, sigma, mix_coeffs) 
         
        # find difference in consecutive nlls and update the nll_old
        nll_diff = abs(nll-nll_old)
        nll_old = nll
        
                
    # plot converged centers                                
    plot_centers(mus,'r',2)
    
    # Update best estimates for mus, mixture coefficients and cluster assignments
    if nll < nll_best:
        mix_coeffs_best = mix_coeffs
        nll_best = nll
        mus_best = mus
        clusters_best = clusters
        print(nll)  # report each improvement in nll (the values shown below)
       
        
#plt.show()
14481.4614928134
14466.412000648455
14455.32269870985
In [84]:
def Gaussian(X, mu, sigma):
    
    '''
    Return the Gaussian probability density evaluated at each point in X
    
    Parameters
    ----------
    X         : list or 1D array
                list of X-coordinates to plot
    mu        : int or float 
                mean of the Gaussian distribution to plot   
    sigma     : int or float
                standard deviation of the Gaussian distribution to plot
                      
    Returns
    -------
    gauss     : list or 1D array
                list of Y coordinates calculated from X with the Gaussian PDF
    '''
    
    gauss = [stats.norm.pdf(x, mu, sigma) for x in X]
        
    return gauss
In [137]:
# Define X coords to plot over
X = np.linspace(0,20,2000)

# calculate Y coords for each mixture component with the Gaussian function
Y= [np.multiply(mix_coeffs_best[i],Gaussian(X, mu, sigma)) for i,mu in enumerate(mus_best)]

# Sum mixtures
Ysum = np.sum(Y,0)

# y axis scaling variable for plotting aesthetics
ysmall = np.add(np.multiply(y,np.max(Ysum)),np.max(Ysum)*0.25)

# close any open plots
plt.close('all')
f = plt.figure(figsize = (20,1))

# plot each gaussian mixture component
for fit in Y:
    plt.plot(X, fit, linewidth = 1, c = 'k')

# Plot sum of components
plt.plot(X,Ysum, linewidth = 2, c = 'k')

# Plot data points
plt.scatter(coords,ysmall, marker = 'o', c = clusters_best, edgecolor = 'None')

# Plot best mus
plot_centers(mus_best,'k',2)

# Pretty up the graph a bit
plt.xlim(0,20)
plt.ylim(0,max(Ysum)+0.5*(max(Ysum)))

plt.show()