#!/usr/bin/env python
# coding: utf-8

# Optimal Transport with Linear Programming
# =========================================
# 
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{  \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$

# This numerical tours details how to solve the discrete optimal transport
# problem (in the case of measures that are sums of Diracs) using linear
# programming.

# In[1]:


from __future__ import division

import numpy as np
import scipy as scp
import pylab as pyl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# Optimal Transport of Discrete Distribution
# ------------------------------------------
# We consider two dicretes distributions
# $$ \forall k=0,1, \quad \mu_k = \sum_{i=1}^{n_k} p_{k,i} \de_{x_{k,i}} $$
# where $n_0,n_1$ are the number of points, $\de_x$ is the Dirac at
# location $x \in \RR^d$, and $ X_k = ( x_{k,i} )_{i=1}^{n_k} \subset \RR^d$ for $k=0,1$
# are two point clouds.
# 
# 
# We define the set of couplings between $\mu_0,\mu_1$ as
# 
# $$ \Pp = \enscond{ (\ga_{i,j})_{i,j} \in (\RR^+)^{n_0 \times n_1} }{
#      \forall i, \sum_j \ga_{i,j} = p_{0,i}, \:
#      \forall j, \sum_i \ga_{i,j} = p_{1,j} } $$
# 
# 
# The Kantorovitch formulation of the optimal transport reads
# 
# $$ \ga^\star \in \uargmin{\ga \in \Pp} \sum_{i,j} \ga_{i,j} C_{i,j}  $$
# where $C_{i,j} \geq 0$ is the cost of moving some mass from $x_{0,i}$
# to $x_{1,j}$.
# 
# 
# The optimal coupling $\ga^\star$ can be shown to be a sparse matrix
# with less than $n_0+n_1-1$ non zero entries. An entry $\ga_{i,j}^\star \neq 0$
# should be understood as a link between $x_{0,i}$
# and $x_{1,j}$ where an amount of mass equal to $\ga_{i,j}^\star$ is transfered.
# 
# 
# In the following, we concentrate on the $L^2$ Wasserstein distance.
# $$ C_{i,j}=\norm{x_{0,i}-x_{1,j}}^2. $$
# 
# 
# The $L^2$ Wasserstein distance is then defined as
# $$ W_2(\mu_0,\mu_1)^2 = \sum_{i,j} \ga_{i,j}^\star C_{i,j}. $$
# 
# 
# The coupling constraint
# $$
#      \forall i, \sum_j \ga_{i,j} = p_{0,i}, \:
#      \forall j, \sum_i \ga_{i,j} = p_{1,j}
# $$
# can be expressed in matrix form as
# $$ \Sigma(n_0,n_1) \ga = [p_0;p_1] $$
# where $ \Sigma(n_0,n_1) \in \RR^{ (n_0+n_1) \times (n_0 n_1) } $.

# In[2]:


from scipy import sparse

Rows  = lambda n0,n1: sparse.coo_matrix((np.ones(n0*n1), 
                                         (np.ravel(np.tile(np.arange(0,n0),(n1,1)),order='C'),
                                          np.arange(0,n0*n1))))
Cols  = lambda n0,n1: sparse.coo_matrix((np.ones(n0*n1), 
                                        (np.ravel(np.tile(np.arange(0,n1),(n0,1)),order='F'),
                                         np.arange(0,n0*n1))))
Sigma = lambda n0,n1: sparse.vstack((Rows(n0,n1),Cols(n0,n1)))


# In[3]:


# plot the constraint matrix.


# We use a simplex algorithm to compute the optimal transport coupling
# $\ga^\star$.

# In[4]:


from nt_toolbox.perform_linprog import *
maxit = 1e4
tol = 1e-9
otransp = lambda C,p0,p1: np.reshape(perform_linprog(Sigma(len(p0),len(p1)).toarray(),np.vstack((p0,p1)),C,maxit,tol),(n0,n1),order="F")


# Dimensions $n_0, n_1$ of the clouds.

# In[5]:


n0 = 60
n1 = 80


# Compute a first point cloud $X_0$ that is Gaussian, and a second point cloud $X_1$ that is Gaussian mixture.

# In[6]:


from numpy import random
gauss = lambda q,a,c: a*random.randn(2, q) + np.transpose(np.tile(c, (q,1)))
X0 = random.randn(2,n0)*.3
X1 = np.hstack((gauss(int(n1/2),.5,[0,1.6]),np.hstack((gauss(int(n1/4),.3,[-1,-1]),gauss(int(n1/4),.3,[1,-1])))))


# Density weights $p_0, p_1$.

# In[7]:


normalize = lambda a: a/np.sum(a)
p0 = normalize(random.rand(n0, 1))
p1 = normalize(random.rand(n1, 1))


# Shortcut for display.

# In[8]:


myplot = lambda x,y,ms,col: plt.scatter(x,y, s=ms*20, edgecolors="k", c=col, linewidths=2)


# Display the point clouds.
# The size of each dot is proportional to its probability density weight.

# In[9]:


plt.figure(figsize = (10,7))
plt.axis("off")

for i in range(len(p0)):
    myplot(X0[0,i], X0[1,i], p0[i]*len(p0)*10, 'b')
    
for i in range(len(p1)):
    myplot(X1[0,i], X1[1,i], p1[i]*len(p1)*10, 'r')
    
plt.xlim(np.min(X1[0,:])-.1,np.max(X1[0,:])+.1)
plt.ylim(np.min(X1[1,:])-.1,np.max(X1[1,:])+.1)

plt.show()


# Compute the weight matrix $ (C_{i,j})_{i,j}. $

# In[10]:


C = np.transpose(np.tile(np.transpose(np.sum(X0**2,0)),(n1,1))) + np.tile(np.sum(X1**2,0),(n0,1)) - 2*np.dot(np.transpose(X0),X1)


# Compute the optimal transport plan.

# In[11]:


gamma = otransp(C, p0, p1)


# Check that the number of non-zero entries in $\ga^\star$ is $n_0+n_1-1$.

# In[12]:


print("Number of non-zero: %d (n0 + n1-1 = %d)" %(len(gamma[gamma>0]), n0 + n1-1))


# Check that the solution satifies the constraints $\ga \in \Cc$.

# In[13]:


from numpy import linalg
print("Constraints deviation (should be 0): %.2f, %.2f" %(linalg.norm(np.sum(gamma,1)-np.ravel(p0)),linalg.norm(np.transpose(np.sum(gamma, 0))-np.ravel(p1))))


# Displacement Interpolation
# --------------------------
# For any $t \in [0,1]$, one can define a distribution $\mu_t$ such
# that $t \mapsto \mu_t$ defines a geodesic for the Wasserstein metric.
# 
# 
# Since the $W_2$ distance is a geodesic distance, this geodesic path solves the
# following variational problem
# 
# $$ \mu_t = \uargmin{\mu} (1-t)W_2(\mu_0,\mu)^2 + t W_2(\mu_1,\mu)^2. $$
# This can be understood as a generalization of the usual Euclidean
# barycenter to barycenter of distribution. Indeed, in the case that
# $\mu_k = \de_{x_k}$, one has $\mu_t=\de_{x_t}$ where $ x_t =
# (1-t)x_0+t x_1 $.
# 
# 
# Once the optimal coupling $\ga^\star$ has been computed, the
# interpolated distribution is obtained as
# 
# $$ \mu_t = \sum_{i,j} \ga^\star_{i,j} \de_{(1-t)x_{0,i} + t x_{1,j}}. $$
# 
# 
# Find the $i,j$ with non-zero $\ga_{i,j}^\star$.

# In[14]:


I,J = np.nonzero(gamma)
gammaij = gamma[I,J]


# Display the evolution of $\mu_t$ for a varying value of $t \in [0,1]$.

# In[15]:


plt.figure(figsize =(15,10))
tlist = np.linspace(0, 1, 6)

for i in range(len(tlist)):
    t = tlist[i]
    Xt = (1-t)*X0[:,I] + t*X1[:,J]
    plt.subplot(2,3,i+1)
    plt.axis("off")
    for j in range(len(gammaij)):
        myplot(Xt[0,j],Xt[1,j],gammaij[j]*len(gammaij)*6,[t,0,1-t])
    
    plt.title("t = %.1f" %t) 
    plt.xlim(np.min(X1[0,:])-.1,np.max(X1[0,:])+.1)
    plt.ylim(np.min(X1[1,:])-.1,np.max(X1[1,:])+.1)
    
plt.show()


# Optimal Assignement
# -------------------
# In the case where the weights $p_{0,i}=1/n, p_{1,i}=1/n$ (where $n_0=n_1=n$) are
# constants, one can show that the optimal transport coupling is actually a
# permutation matrix. This properties comes from the fact that
# the extremal point of the polytope $\Cc$ are permutation matrices.
# 
# 
# This means that there exists an optimal permutation $ \si^\star \in \Sigma_n $ such
# that
# 
# $$ \ga^\star_{i,j} = \choice{
#       1 \qifq j=\si^\star(i), \\
#       0 \quad\text{otherwise}.
#   } $$
#   
# where $\Si_n$ is the set of permutation (bijections) of
# $\{1,\ldots,n\}$.
# 
# 
# This permutation thus solves the so-called optimal assignement problem
# 
# $$ \si^\star \in \uargmin{\si \in \Sigma_n}
#       \sum_{i} C_{i,\si(j)}. $$
# 
# 
# Same number of points.

# In[16]:


n0 = 40
n1 = n0


# Compute points clouds.

# In[17]:


X0 = random.randn(2,n0)*.3
X1 = np.hstack((gauss(int(n1/2),.5,[0,1.6]),np.hstack((gauss(int(n1/4),.3,[-1,-1]),gauss(int(n1/4),.3,[1,-1])))))


# Constant distributions.

# In[18]:


p0 = np.ones([n0,1])/n0
p1 = np.ones([n1,1])/n1


# Compute the weight matrix $ (C_{i,j})_{i,j}. $

# In[19]:


C = np.transpose(np.tile(np.transpose(np.sum(X0**2,0)),(n1,1))) + np.tile(np.sum(X1**2,0),(n0,1)) - 2*np.dot(np.transpose(X0),X1)


# Display the coulds.

# In[20]:


plt.figure(figsize = (10,7))
plt.axis('off')

myplot(X0[0,:],X0[1,:],10,'b')
myplot(X1[0,:],X1[1,:],10,'r')
    
plt.xlim(np.min(X1[0,:])-.1,np.max(X1[0,:])+.1)
plt.ylim(np.min(X1[1,:])-.1,np.max(X1[1,:])+.1)
plt.show()


# Solve the optimal transport.

# In[21]:


gamma = otransp(C, p0, p1)


# Show that $\ga$ is a binary permutation matrix.

# In[22]:


plt.figure(figsize = (5,5))
plt.imshow(gamma);


# Display the optimal assignement.

# In[23]:


I,J = np.nonzero(gamma)

plt.figure(figsize = (10,7))
plt.axis('off')

for k in range(len(I)):
    h = plt.plot(np.hstack((X0[0,I[k]],X1[0,J[k]])),np.hstack(([X0[1,I[k]], X1[1,J[k]]])),'k', lw = 2)

myplot(X0[0,:], X0[1,:], 10, 'b')
myplot(X1[0,:], X1[1,:], 10, 'r')

plt.xlim(np.min(X1[0,:])-.1,np.max(X1[0,:])+.1)
plt.ylim(np.min(X1[1,:])-.1,np.max(X1[1,:])+.1)
plt.show()

<script>
  $(document).ready(function(){
      $('div.prompt').hide();
  });
</script>