The dual of constrained KL-Divergence is the MLE of the log-linear model
Author
Dingquan Wang
Last Updated
9 years ago
License
Creative Commons CC BY 4.0
Abstract
The dual of constrained KL-Divergence is the MLE of the log-linear model
The dual of constrained KL-Divergence is the MLE of the log-linear model
\documentclass[a4paper]{article}
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage[colorinlistoftodos]{todonotes}
\title{The dual of constrained KL-Divergence is the MLE of the log-linear model}
\author{Dingquan Wang}
\date{\today}
\begin{document}
\maketitle
Given a distribution of interest $p$, we look for an estimate $q$ that approximates $p$ by minimizing the KL-Divergence subject to constraints:
\begin{align}
q^* = \arg\min_{q\in\mathcal{Q}} KL(q\|p) &= \arg\min_{q\in\mathcal{Q}} E_q[\log\frac{q(X)}{p(X)}]\\
\text{s.t.}~~~E_{q}[f(X)]-E_{p}[f(X)] &\leq \xi ; \quad \|\xi\|_\beta \leq \epsilon
\end{align}
$\mathcal{Q}$ is a distribution family, and $f(X)$ is a vector of measurements of $X$ that we are interested in. The general goal is that, while the minimizing $q \in \mathcal{Q}$ approaches the true distribution, we force some quantities to agree with observations in expectation.
Consider the Lagrangian:
\begin{align}
\max_{\lambda\geq 0,\alpha \geq 0}\min_{q(X), \xi} L(q(X),\xi,\lambda,\alpha,\gamma)
\end{align}
where:
\begin{align}
L(q(X),\xi,\lambda,\alpha,\gamma) &= KL(q\|p)+\lambda\cdot(E_{q}[f(X)]-E_{p}[f(X)] - \xi)\\
&+\alpha\cdot(\|\xi\|_\beta - \epsilon) + \gamma \cdot(\int_{X}q(X)-1)
\end{align}
In order to compute the dual of this Lagrangian, we first rewrite the norm term variationally using the dual norm $\beta^*$ of $\beta$:
\begin{align}
\alpha\|\xi\|_\beta=\max_{\eta}\xi\cdot\eta~~\text{s.t.}~\|\eta\|_{\beta^*}\leq \alpha
\end{align}
This results in a variational Lagrangian:
\begin{align}
\max_{\lambda\geq 0,\alpha \geq 0}\max_{\|\eta\|_{\beta^*}\leq \alpha}\min_{q(X), \xi} L(q(X),\xi,\lambda,\alpha,\gamma)
\end{align}
with $L(q(X),\xi,\lambda,\alpha,\gamma)$ defined as:
\begin{align}
L(q(X),\xi,\lambda,\alpha,\gamma) &= E_q[\log\frac{q(X)}{p(X)}]+\lambda\cdot(E_{q}[f(X)]-E_{p}[f(X)] - \xi)\\
&+\xi\cdot\eta-\alpha\epsilon + \gamma \cdot(\int_{X}q(X)-1)
\end{align}
\begin{align}
\frac{\partial L(q(X),\xi,\lambda,\alpha,\gamma)}{\partial q(X)} &= \log q(X)-\log p(X)+ 1+\lambda\cdot f(X) + \gamma = 0
\end{align}
\begin{align}
\frac{\partial L(q(X),\xi,\lambda,\alpha,\gamma)}{\partial \xi_i} &= \eta_i-\lambda_i = 0\rightarrow \eta=\lambda
\end{align}
Plugging $q(X)$ and $\eta = \lambda$ into $L(q(X),\xi,\lambda,\alpha,\gamma)$ and taking the derivative with respect to $\gamma$:
\begin{align}
\frac{\partial L(\lambda,\alpha,\gamma)}{\partial \gamma} &= \int_X \frac{p(X)\exp(-\lambda\cdot f(X))}{e\exp(\gamma)}-1 = 0\\
&\rightarrow \gamma =\log(\frac{\int_Xp(X)\exp(-\lambda\cdot f(X))}{e})
\end{align}
Plug $\gamma$ into (10):
\begin{align}
\log q(X) &= \log p(X) - 1 - \lambda \cdot f(X) - \log(\frac{\int_Xp(X)\exp(-\lambda\cdot f(X))}{e})\\
q(X) &= \exp(\log p(X))\exp(-1)\exp(-\lambda \cdot f(X))\exp(-\log(\frac{\int_Xp(X)\exp(-\lambda\cdot f(X))}{e}))\\
&= \frac{p(X)\exp(-\lambda \cdot f(X))}{\int_Xp(X)\exp(-\lambda\cdot f(X))}\\
&= \frac{p(X)\exp(-\lambda\cdot f(X))}{Z_\lambda}
\end{align}
where $Z_{\lambda}=\int_Xp(X)\exp(-\lambda\cdot f(X))$. Plugging $\gamma$ and $q(X)$ into $L(q(X),\xi,\lambda,\alpha,\gamma)$:
\begin{align}
L(\lambda,\alpha) &= E_q[\log\frac{q(X)}{p(X)}]+\lambda\cdot(E_{q}[f(X)]-E_{p}[f(X)] - \xi)\\
&+\xi\cdot\lambda-\alpha\epsilon + \gamma \cdot(\int_{X}q(X)-1)\\
&= E_q[\log\frac{q(X)}{p(X)}]+\lambda\cdot(E_{q}[f(X)]-E_{p}[f(X)])-\alpha\epsilon\\
&=\int_Xq(X)\log \frac{q(X)}{p(X)} + \lambda\cdot E_{q}[f(X)] - \lambda\cdot E_{p}[f(X)] - \alpha\epsilon\\
&=\int_X\frac{p(X)\exp(-\lambda\cdot f(X))}{Z_\lambda}\log \frac{\exp(-\lambda\cdot f(X))}{Z_\lambda} + \lambda\cdot E_{q}[f(X)] - \lambda\cdot E_{p}[f(X)] - \alpha\epsilon\\
&=\int_X\frac{p(X)\exp(-\lambda\cdot f(X))}{Z_\lambda}\cdot(-\lambda\cdot f(X))-\log Z_\lambda\int_X\frac{p(X)\exp(-\lambda\cdot f(X))}{{Z_\lambda}} \\
&\quad+ \lambda\cdot E_{q}[f(X)] - \lambda\cdot E_{p}[f(X)] - \alpha\epsilon\\
&=-\lambda\cdot E_{q}[f(X)]-\log Z_\lambda+ \lambda\cdot E_{q}[f(X)] - \lambda\cdot E_{p}[f(X)] - \alpha\epsilon\\
&=-\log Z_{\lambda} - \lambda\cdot E_{p}[f(X)]-\alpha\epsilon
\end{align}
The new objective is:
\begin{align}
\max_{\lambda\geq 0,\alpha \geq 0}L(\lambda,\alpha) &= -\log(Z_{\lambda}) - \lambda\cdot E_{p}[f(X)]-\alpha\epsilon~~\text{s.t.}~\|\lambda\|_{\beta^*}\leq \alpha
\end{align}
We can see analytically that the optimum of this objective with respect to $\alpha$ is $\alpha=\|\lambda\|_{\beta^*}$, and substituting this into $L(\lambda,\alpha)$ we get the dual objective:
\begin{align}
\max_{\lambda\geq 0}L(\lambda) &= -\log(Z_{\lambda}) - \lambda\cdot E_{p}[f(X)]-\epsilon \|\lambda\|_{\beta^*}
\end{align}
This is the same objective as the MLE of the log-linear model, as desired.
\end{document}