\documentclass{beamer}
% For more themes, color themes and font themes, see:
% http://deic.uab.es/~iblanes/beamer_gallery/index_by_theme.html
%
\mode<presentation>
{
  \usetheme{Madrid}       % or try default, Darmstadt, Warsaw, ...
  \usecolortheme{default} % or try albatross, beaver, crane, ...
  \usefonttheme{serif}    % or try default, structurebold, ...
  \setbeamertemplate{navigation symbols}{}
  \setbeamertemplate{caption}[numbered]
} 
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage{chemfig}
\usepackage[version=3]{mhchem}
\usepackage{wrapfig}
% On Overleaf, these lines give you sharper preview images.
% You might want to comment them out before you export, though.
\usepackage{pgfpages}
\pgfpagesuselayout{resize to}[%
  physical paper width=8in, physical paper height=6in]
% Here's where the presentation starts, with the info for the title slide
\title[Seminar]{Neural Networks: Basics to Application}
\subtitle{(Painting Style Transfer)}
\author{Kim Woo Hyun}
\date{\today}
\begin{document}
\begin{frame}
  \titlepage
\end{frame}
% These three lines create an automatically generated table of contents.
\begin{frame}{Outline}
  \tableofcontents
\end{frame}
\section{Neural Network}
\subsection{First Generation (ANN, Perceptron)}
\begin{frame}{First Generation}
	\begin{block}{Artificial Neural Network : ANN}
		In 1943, \textbf{\textit{Warren S. McCulloch}} and \textbf{\textit{Walter Pitts}} proposed the artificial neuron model.
	\end{block}
	
    \centering
    \includegraphics[scale=0.5]{ANN.PNG}
    
	\begin{itemize}
		\item Mimics the structure of biological neurons by connecting switch-like units.
	\end{itemize}
\end{frame}
\begin{frame}{First Generation}
	\begin{block}{Perceptron}
		In 1958, \textbf{\textit{Frank Rosenblatt}} proposed the perceptron, a linear classifier.
	\end{block}
    
    \includegraphics[scale=0.2]{1_neuron.png}
	\includegraphics[scale=0.2]{1_neuron_model.jpeg}
	\begin{itemize}
		
		\item At the time, computers were expected to learn the things humans do well.
		\item The basic structure has not changed since then.
        \item Uses the \textbf{sigmoid} as the activation function,
        mapping the output into $[0,1]$ (a small sketch follows on the next slide).
	\end{itemize}
\end{frame}
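\begin{frame}[fragile]{First Generation}
	A minimal NumPy sketch of a single sigmoid neuron; the inputs, weights, and bias below are arbitrary values chosen only for illustration.
\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def neuron(inputs, weights, bias):
    # weighted sum followed by the sigmoid activation
    return sigmoid(np.dot(weights, inputs) + bias)

# the output always lies in (0, 1)
print(neuron(np.array([0.5, 1.0]),
             np.array([0.4, -0.2]), 0.1))
\end{verbatim}
\end{frame}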
\begin{frame}{First Generation}
	
    \begin{block}{Problem}
    	In 1969, \textbf{\textit{Marvin Minsky}} and \textbf{\textit{Seymour Papert}} proved the limitations of the perceptron.
    \end{block}
	
    \includegraphics[scale = 0.3]{1_minsky_book.jpg}
    \includegraphics[scale = 0.6]{1_xor_unsolve.PNG}
    
    A single perceptron cannot even solve the XOR problem.
    
\end{frame}
\subsection{Second Generation (MLP, Back-propagation)}
\begin{frame}{Second Generation}
	\begin{block}{Multi-Layer Perceptron : MLP}
		Make the network deeper by stacking \textbf{hidden layers} of perceptrons.
	\end{block}
	\includegraphics[scale = 0.3]{1_xor_solve.PNG}
	\includegraphics[scale = 0.2]{1_MLP}
    \begin{itemize}
    	\item Solves non-linear problems by combining multiple linear classifiers.
        \item \textbf{Too many parameters!!}
		\item Needs an algorithm to adjust the parameters.
\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
	\begin{block}{Back-propagation}
		A feedback algorithm that adjusts the weights of the neural network.
	\end{block}
    \centering
    \includegraphics[scale = 0.5]{1_BackP.png}
    
	\begin{itemize}
    	\item $i$ : input layer
        \item $h$ : hidden layer
        \item $o$ : output layer
		\item $w_{ij}$ : weight of the connection from neuron $i$ to neuron $j$.
	\end{itemize}
	
    
\end{frame}
\begin{frame}{Second Generation}
	\includegraphics[scale = 0.4]{1_BackP.png}
    \includegraphics[scale = 0.3]{1_sigmoid.png}
    \begin{itemize}
    	\item $out$ : output value of a neuron.
        \item $in$ : weighted sum of the outputs of the connected neurons ($in = \sum w*out$).
        \item $t$ : target value (chosen by you).
        \item \textbf{Sigmoid} activation function, e.g.\ $out_{h3} = \sigma(in_{h3}) = \frac{1}{1+e^{-in_{h3}}}$
    \end{itemize}
\end{frame}
\begin{frame}{Second Generation}
	The error is the sum of squared differences (squared Euclidean distance):
    \[E = \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2\]
    We want to see how much each weight influences $E$ $\Rightarrow$ calculate $\frac{\partial E}{\partial w_{ij}}$.
    Example) Calculate $\frac{\partial E}{\partial w_{35}}$ with the \textbf{chain rule}:
    \[\frac{\partial E}{\partial w_{35}} = \frac{\partial E}{\partial out_{o5}}*\frac{\partial out_{o5}}{\partial in_{o5}}*\frac{\partial in_{o5}}{\partial w_{35}}\]
    
    \centering
    \includegraphics[scale = 0.4]{1_BackP.png}
    
\end{frame}
\begin{frame}{Second Generation}
	First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2  \right ] = out_{o5}-t_5\]
    Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}}\]
\end{frame}
\begin{frame}{Second Generation}
	The sigmoid function $\sigma(x)$ is
	\[\sigma(x) = \frac{1}{1+e^{-ax}}\]
	The derivative of the sigmoid $\sigma(x)$ is
    \begin{align*}
    	\sigma'(x) &= \frac{ae^{-ax}}{(1+e^{-ax})^2} \\
        		  &= a\frac{1}{(1+e^{-ax})}\frac{e^{-ax}}{(1+e^{-ax})} \\
                  &= a\frac{1}{(1+e^{-ax})}\left( 1- \frac{1}{(1+e^{-ax})} \right ) \\
                  &= a\sigma(x)(1-\sigma(x))
    \end{align*}
\end{frame}
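\begin{frame}[fragile]{Second Generation}
	A quick numeric check of the derivative above, with $a=1$ and an arbitrary test point $x=0.7$; the analytic formula and a finite-difference estimate agree.
\begin{verbatim}
import numpy as np

a = 1.0
sigma = lambda x: 1.0 / (1.0 + np.exp(-a * x))

x, eps = 0.7, 1e-6
analytic = a * sigma(x) * (1.0 - sigma(x))
numeric = (sigma(x + eps) - sigma(x - eps)) / (2 * eps)
print(analytic, numeric)   # the two values match
\end{verbatim}
\end{frame}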
\begin{frame}{Second Generation}
	First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2  \right ] = out_{o5}-t_5\]
    Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}} = \sigma(in_{o5})(1-\sigma(in_{o5})) = out_{o5}(1-out_{o5})\]
\end{frame}
\begin{frame}{Second Generation}
	First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2  \right ] = out_{o5}-t_5\]
    Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}} = \sigma(in_{o5})(1-\sigma(in_{o5})) = out_{o5}(1-out_{o5})\]
    Third, 
    \[\frac{\partial in_{o5}}{\partial w_{35}} = \frac{\partial (out_{h3}*w_{35})}{\partial w_{35}} = out_{h3}\]
    Finally, 
    \[\frac{\partial E}{\partial w_{35}} = (out_{o5}-t_5)(1-out_{o5})out_{o5}out_{h3}\]
    \begin{block}{}
    	Conveniently, every quantity involved has already been computed during the forward pass, so what remains is easy arithmetic (a numeric check follows on the next slide).
    \end{block}
    
\end{frame}
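\begin{frame}[fragile]{Second Generation}
	A minimal NumPy sketch of the result above for the single connection $h3 \to o5$; the values of $out_{h3}$, $w_{35}$, and $t_5$ are placeholders for illustration, and the analytic gradient is checked against a finite difference.
\begin{verbatim}
import numpy as np

sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
out_h3, w35, t5 = 0.6, 0.4, 1.0       # illustration values

forward = lambda w: sigmoid(out_h3 * w)      # out_o5
E = lambda w: 0.5 * (t5 - forward(w)) ** 2   # error term for o5

out_o5 = forward(w35)
# analytic gradient from the slide
grad = (out_o5 - t5) * (1 - out_o5) * out_o5 * out_h3

eps = 1e-6                                   # numeric check
grad_num = (E(w35 + eps) - E(w35 - eps)) / (2 * eps)
print(grad, grad_num)                        # the two agree
\end{verbatim}
\end{frame}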
\begin{frame}{Second Generation}
	Then, how do we update the weights?
    \[w := w - r\frac{\partial E}{\partial w}, \quad \text{where } r \text{ is a constant called the learning rate.}\]
    
    So the updated $w_{35}$ is
    \[w_{35} := w_{35} - r(out_{o5}-t_5)(1-out_{o5})out_{o5}out_{h3}\]
    This method is called \textbf{gradient descent} (a small example follows on the next slide).
    
    \centering
    \includegraphics[scale = 0.16]{1_Gradient_descent.png}
    
\end{frame}
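\begin{frame}[fragile]{Second Generation}
	A tiny gradient-descent loop on a made-up convex function $f(w) = (w-3)^2$, just to show the update rule $w := w - r\,\partial E/\partial w$ in code.
\begin{verbatim}
def f_grad(w):
    return 2.0 * (w - 3.0)   # derivative of (w - 3)^2

w, r = 0.0, 0.1              # initial weight, learning rate
for _ in range(100):
    w = w - r * f_grad(w)    # w := w - r * dE/dw
print(w)                     # close to the minimizer w = 3
\end{verbatim}
\end{frame}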
	
\begin{frame}{Second Generation}
	\begin{block}{Gradient descent}
		Simply: move in the direction orthogonal to the contour lines.
	\end{block}
    \textit{Why the orthogonal direction?}
    At a minimum point of $f(x,y)$,
    \[\nabla f(x,y) = \left( \frac{\partial f}{\partial x}, \frac{\partial f}{\partial y} \right ) = 0\]
    Assume the direction of the contour line is $(a,b)$. Then, using a \textbf{Taylor series}, linearize $f$ along the contour line to derive the orthogonal direction:
    \[f(x_1+a,y_1+b) \simeq  f(x_1,y_1) + \frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b + \dots\]
    The condition on $(a,b)$ for $f$ to stay unchanged (to first order) along the contour line is
    \[\frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b = 0 \]    
\end{frame}
\begin{frame}{Second Generation}
	Take $a = \frac{\partial f}{\partial y}$ and $b = -\frac{\partial f}{\partial x}$; then
    \[\frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b = \frac{\partial f}{\partial x}\frac{\partial f}{\partial y} + \frac{\partial f}{\partial y}(-\frac{\partial f}{\partial x}) = 0\]
    In addition, the inner product of the gradient and $(a,b)$ is
    \[(\nabla f(x,y))\cdot (a,b) = \left (\frac{\partial f}{\partial x}  ,\frac{\partial f}{\partial y} \right )\cdot \left ( \frac{\partial f}{\partial y} ,-\frac{\partial f}{\partial x} \right ) = 0\]
    \begin{block}{}
    	So the vector orthogonal to the contour line is the gradient itself. If we follow the gradient until it reaches 0, we can find a minimum point.
    \end{block}
    *Caution: the point found can be a saddle point rather than a minimum; we will not discuss that case here.
\end{frame}
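\begin{frame}[fragile]{Second Generation}
	A small numeric check of the orthogonality argument on an example function $f(x,y) = x^2 + 2y^2$ (chosen only for illustration).
\begin{verbatim}
import numpy as np

grad = lambda x, y: np.array([2 * x, 4 * y])  # gradient of f

x1, y1 = 1.0, 1.0                # a point on some contour line
g = grad(x1, y1)
contour_dir = np.array([g[1], -g[0]])  # (a,b) = (df/dy, -df/dx)
print(np.dot(g, contour_dir))    # 0: gradient is orthogonal
\end{verbatim}
\end{frame}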
\begin{frame}{Second Generation}
	Problems
	\begin{itemize}
		\item Gradient descent struggles with non-convex functions, and the sigmoid is non-convex:
        \[\sigma''(x) = a^{2}\sigma(x)(1-\sigma(x))(1-2\sigma(x))\]
        \[a^{2}\sigma(x)(1-\sigma(x)) \geq 0 \text{ but } -1 \leq 1-2\sigma(x) \leq 1\]
        \item The cost of back-propagation is large.
        \item The vanishing gradient problem.
	\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
	
    \begin{block}{Cost of back-propagation}
    	The cost is larger for weights in the shallow layers.
    \end{block}
    For example, 
    \[\frac{\partial E}{\partial w_{13}} = \frac{\partial E}{\partial out_{h3}}*\frac{\partial out_{h3}}{\partial in_{h3}}*\frac{\partial in_{h3}}{\partial w_{13}}\]
    \[\vdots\]
    \[= \left [(out_{o5}-t_5)\{out_{o5}(1-out_{o5})\}w_{35} + (out_{o6}-t_6)\{out_{o6}(1-out_{o6})\}w_{36}\right ]\]
    \[*(1-out_{h3})*out_{h3}*out_{i1}\]
    Of course, since it is a chain-rule computation, it is easier than it looks. But what if the network is very large?
    
\end{frame}
\begin{frame}{Second Generation}
	\begin{block}{Vanishing Gradient Problem}
		Because of the sigmoid, the gradient shrinks toward 0 as back-propagation is repeated through the layers (a numeric sketch follows on the next slide).
	\end{block}
    
    \centering
	\includegraphics[scale = 0.5]{1_sigmoid.png}
    
\end{frame}
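\begin{frame}[fragile]{Second Generation}
	A small sketch of the vanishing gradient effect: since $\sigma'(x) \leq 0.25$, the back-propagated factor shrinks rapidly as it passes through more sigmoid layers (the weights are fixed to 1 here so that only the activation derivative matters).
\begin{verbatim}
import numpy as np

sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
def sigmoid_prime(x):
    s = sigmoid(x)
    return s * (1.0 - s)          # at most 0.25

x, grad = 0.5, 1.0
for layer in range(10):
    grad *= sigmoid_prime(x)      # one more sigmoid layer
    print(layer + 1, grad)        # shrinks toward 0
\end{verbatim}
\end{frame}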
\subsection{Third Generation (ReLU)}
\begin{frame}{Third Generation}
	\centering
	\includegraphics[scale = 0.25]{1_relu.png}
    
	\begin{block}{Rectified Linear Unit : ReLU}
		\begin{itemize}
			\item Convex: well suited to gradient descent.
        	\item The cost of back-propagation decreases (since $f'(x)$ is always 1 or 0).
            \item Safe from the vanishing gradient problem.
		\end{itemize}    
	\end{block}
All of the earlier problems came from a poor choice of activation function.
    
\end{frame}
\begin{frame}{Third Generation}
	\centering
    \includegraphics[scale = 0.5]{1_Bad_Act.PNG}
    
    Notice the gap between tanh and ReLU.
\end{frame}
\section{Convolutional Neural Network}
\begin{frame}
	Section 2. Convolutional Neural Network
    \begin{itemize}
    \item Convolution layer
    \item ReLU layer
    \item Pooling layer
    \item Fully Connected layer
    \end{itemize}
\end{frame}
\subsection{Convolution layer}
\begin{frame}{Convolution layer}
		
	\begin{block}{2D Convolution}
		Essentially the same as 1D convolution.
	\end{block}
    \begin{columns}
    	\begin{column}{0.4\textwidth}
    \includegraphics[scale = 0.5]{2_simple_conv.JPG}
    	\end{column}
        \begin{column}{0.6\textwidth}
        
        
    		\begin{itemize}
    			\item Input size = 7x7x1
        		\item Filter size = 3x3
        		\item Number of filters = 1
        		\item Stride = 1                 
    		\end{itemize}
    
    	\end{column}
    
    \end{columns}
    
    
    
\end{frame}
\begin{frame}{Convolution layer}
	
    \begin{block}{What does the filter do?}
    	Assume the weights are already trained.
    \end{block}
    \centering
	\includegraphics[scale = 0.5]{2_line_filter.png}
    
    A curve-detection filter and its visualization.
     
\end{frame}
\begin{frame}{Filter}
	\centering
    \includegraphics[scale = 0.5]{2_Mice.png}
    
    \includegraphics[scale = 0.5]{2_mice_hip.png}
    \begin{block}{}
    If part of the original image has a shape similar to the filter, the multiply-and-sum result is a large number.
    \end{block}
    
    
\end{frame}
\begin{frame}{Filter}
	\centering    
    \includegraphics[scale = 0.5]{2_mice_conv.png}
    
    \begin{block}{}
    In contrast, if not, the result is a small number.
    \end{block}
    \begin{block}{}
    A trained filter can \textbf{give a score} for whether a feature exists or not!
    \end{block}
\end{frame}
\begin{frame}{Filter}
	\centering
    \includegraphics[scale = 0.5]{2_ActMap.png}
	The scores are collected together and, through convolution, form a new layer (an activation map).
\end{frame}
\begin{frame}{Padding}
	\centering
    \includegraphics[scale = 0.4]{3_padding.png}
    \begin{block}{}
    	\begin{itemize}
        	\item Attach zeros around the layer. \ \ (Zero-padding)
    		\item Prevents the size from shrinking during convolution.
        	\item Captures features at the edges in more detail.
    	\end{itemize}
    \end{block}
\end{frame}
\begin{frame}{Convolution layer}
	\begin{block}{Convolution}		
        W = width, H = height, D = depth, P = padding, S = stride,
        
        F = filter width and height, N = number of filters.
	\end{block}
  \begin{columns}[onlytextwidth]
    \begin{column}{0.5\textwidth}
      \centering
      \includegraphics[scale=0.3]{2_conv.JPG}% Place your graphic here
    \end{column}
    
    \begin{column}{0.5\textwidth}
    5x5x3 input, zero-padded with $P=1$ to 7x7x3
    
    Two 3x3x3 filters, stride $S=2$
    
    $\Rightarrow$ Output volume of size 3x3x2
    
    \begin{itemize}
    	\item $W_{2} = \frac{W-F+2P}{S}+1 = \frac{5-3+2*1}{2}+1 = 3$
        \item $H_{2} = \frac{H-F+2P}{S}+1 = \frac{5-3+2*1}{2}+1 = 3$
        \item $D_{2} = N = 2$ \ \ \ (the depth equals the number of filters)
    \end{itemize}
    
    
    \end{column}
  \end{columns}
\end{frame}
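\begin{frame}[fragile]{Convolution layer}
	A minimal single-channel 2D convolution in NumPy, sketched to confirm the output-size formula; the input and filter values are random placeholders.
\begin{verbatim}
import numpy as np

def conv2d(x, filt, stride=1, pad=0):
    x = np.pad(x, pad)                 # zero-padding
    F = filt.shape[0]
    W2 = (x.shape[0] - F) // stride + 1
    H2 = (x.shape[1] - F) // stride + 1
    out = np.zeros((W2, H2))
    for i in range(W2):
        for j in range(H2):
            patch = x[i*stride:i*stride+F,
                      j*stride:j*stride+F]
            out[i, j] = np.sum(patch * filt)  # mult and sum
    return out

x = np.random.rand(5, 5)               # W = 5
filt = np.random.rand(3, 3)            # F = 3
print(conv2d(x, filt, stride=2, pad=1).shape)
# (3, 3), matching (W - F + 2P)/S + 1
\end{verbatim}
\end{frame}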
\subsection{ReLU layer}
\begin{frame}{ReLU layer}
	\begin{columns}[onlytextwidth]
    	\begin{column}{0.5\textwidth}
    		\centering
    		\includegraphics[scale = 0.35]{1_relu.png}
    	\end{column}
        
        \begin{column}{0.5\textwidth}
        	\begin{itemize}
        		\item Zero or itself.
                \item Gives non-linearity and acts as a threshold.
                \item No parameters, no size change.
        	\end{itemize}
        \end{column}
	
	\end{columns}	    
\end{frame}
\begin{frame}{ReLU layer}
	\begin{block}{Why do we need non-linearity?}
		An experimental result is shown below.
	\end{block}
	\centering
    \includegraphics[scale=0.35]{2_why_NonL}
    
    From an ImageNet classification test.
    
    
\end{frame}
\subsection{Pooling layer}
\begin{frame}{Pooling layer}
    \includegraphics[scale = 0.25]{2_maxpool.jpeg}
    \includegraphics[scale = 0.25]{2_dawnsampling.jpeg}
	\begin{itemize}		
        \item Usually \textbf{max-pooling} is used (when larger values are more important).
        \item No depth change.
        \item \textbf{\textit{Reduces complexity (down-sampling):}} the size shrinks to $\frac{1}{4}$, i.e.\ a 75\% reduction.
        \item Not necessary (but recommended).
	\end{itemize}
	\[W_{2} = \frac{W-F}{S}+1 = \frac{224-2}{2}+1 = 112\]
    
    
\end{frame}
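\begin{frame}[fragile]{Pooling layer}
	A minimal max-pooling sketch in NumPy (2x2 window, stride 2), reproducing the size computation above on a random 224x224 layer.
\begin{verbatim}
import numpy as np

def max_pool(x, F=2, S=2):
    W2 = (x.shape[0] - F) // S + 1
    H2 = (x.shape[1] - F) // S + 1
    out = np.zeros((W2, H2))
    for i in range(W2):
        for j in range(H2):
            out[i, j] = x[i*S:i*S+F, j*S:j*S+F].max()
    return out

x = np.random.rand(224, 224)
print(max_pool(x).shape)   # (112, 112): (224 - 2)/2 + 1
\end{verbatim}
\end{frame}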
\subsection{Fully Connected layer}
\begin{frame}{Fully Connected layer}
	\centering
    \includegraphics[scale = 0.6]{2_Fully.png}
    
	\begin{itemize}
	\item Flattens the 2D layers into a 1D layer (turns the layer into a vector).
    \item Used to compare the output with the target.
	\item There is more than one way to construct it.
	\end{itemize}
	
	
\end{frame}
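\begin{frame}[fragile]{Fully Connected layer}
	One possible way to flatten a feature volume and apply a dense layer; the 7x7x64 shape and the 10 output scores are arbitrary illustration choices.
\begin{verbatim}
import numpy as np

features = np.random.rand(7, 7, 64)   # a feature volume
vec = features.reshape(-1)            # flatten to a vector
W = np.random.rand(10, vec.size)      # 10 output scores
b = np.zeros(10)
scores = W @ vec + b                  # compare with target
print(vec.shape, scores.shape)        # (3136,) (10,)
\end{verbatim}
\end{frame}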
\section{Painting Style Transfer}
\begin{frame}
	Section 3. Painting Style Transfer
    \begin{itemize}
    \item VGGnet
    \item Algorithm and Loss function
    \item Result
    \end{itemize}
\end{frame}
\subsection{VGGnet}
\begin{frame}{VGGnet}
	\centering
    \includegraphics[scale = 1.8]{3_VGG_19.png}
	
    \begin{block}{}
    	\begin{itemize}
    	\item $F_{conv} = 3 \ (3*3*D), S_{conv} = 1, P = 1$
        \item $F_{pool} = 2 \ (2*2*D), S_{pool} = 2$
    	\end{itemize}
    	
    \end{block}
	
	\[\frac{W-F_{conv}+2P}{S_{conv}}+1 = \frac{224-3+2*1}{1}+1 = 224\]   
    \[\frac{W-F_{pool}}{S_{pool}} + 1 = \frac{224-2}{2} + 1 = 112\]
    
\end{frame}
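\begin{frame}[fragile]{VGGnet}
	A small sketch of the two size formulas above as Python functions, evaluated at $W = 224$.
\begin{verbatim}
def conv_size(W, F=3, S=1, P=1):
    return (W - F + 2 * P) // S + 1

def pool_size(W, F=2, S=2):
    return (W - F) // S + 1

W = 224
print(conv_size(W))   # 224: convolution keeps the size
print(pool_size(W))   # 112: each pooling layer halves it
\end{verbatim}
\end{frame}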
\begin{frame}{Painting style transfer}
	\centering
    \includegraphics[scale = 0.125]{3_Structure.jpg}
    
    \begin{block}{}
    	\begin{itemize}
    		\item The weights must already be trained.
            \item $a = $ style image, $p = $ content image
            \item $x = $ generated image.
    	\end{itemize}		
	\end{block}
    
\end{frame}
\subsection{Algorithm and Loss function}
\begin{frame}{Painting style transfer}
	\begin{block}{}
    	\begin{itemize}
    	\item $N_l = $ number of feature maps in the $l$th layer
        \item $M_l = $ size of each feature map in the $l$th layer
        \item $F^l \in \mathcal{R}^{N_l \times M_l}$
        \item $F^{l}_{ij}$ is the activation of the $i^{th}$ filter at position $j$ in layer $l$
        \item $P^{l}_{ij}$ is the same as $F^{l}_{ij}$, but computed from the content image (layer conv4\_2)
    	\end{itemize}
		
        
        
	\end{block}
	\[\mathcal{L}_{\text{content}}(\vec{p},\vec{x}, l)=\frac{1}{2}\sum_{i, j}(F_{ij}^{l}-P_{ij}^{l})^{2}.\]
    
    \begin{block}{}
    So this loss function minimizes the distance between the values at the same positions in the content features and the generated-image features.
    \end{block}
        
\end{frame}
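\begin{frame}[fragile]{Painting style transfer}
	A minimal sketch of the content loss with random placeholder features of shape $(N_l, M_l)$.
\begin{verbatim}
import numpy as np

N_l, M_l = 4, 9                  # placeholder sizes
F = np.random.rand(N_l, M_l)     # generated-image features
P = np.random.rand(N_l, M_l)     # content-image features

L_content = 0.5 * np.sum((F - P) ** 2)
print(L_content)
\end{verbatim}
\end{frame}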
\begin{frame}
	\begin{block}{}
		\begin{itemize}
			\item $G^l \in \mathcal{R}^{N_l \times N_l}$ 
            \item $G^{l}_{ij}$ is the inner product between the vectorized feature maps $i$ and $j$ in layer $l$ (the Gram matrix of the style representation)
            \item \[G_{ij}^{l}= \sum_{k}F_{ik}^{l}F_{jk}^{l}\]
            \item $A^{l}_{ij}$ is the same as $G^{l}_{ij}$, but computed from the style image.
		\end{itemize}
	\end{block}
    
    
    \[E_{l}= \frac{1}{4N_{l}^{2}M_{l}^{2}}\sum_{i, j}(G_{ij}^{l}-A_{ij}^{l})^{2}\]
    \[\mathcal{L}_{\text{style}}(\vec{a},\vec{x})=\sum_{l=0}^{L}w_{l}E_{l}\]
    \begin{block}{}
    The authors' idea is that the style information is hidden in these correlations, though the intuition is not obvious.
    \end{block}
\end{frame}
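\begin{frame}[fragile]{Painting style transfer}
	A minimal sketch of the Gram matrix and the per-layer style term $E_l$, again with random placeholder features; the weighted sum over layers with $w_l$ is omitted.
\begin{verbatim}
import numpy as np

def gram(F):
    return F @ F.T      # G_ij = sum_k F_ik F_jk

N_l, M_l = 4, 9
F = np.random.rand(N_l, M_l)   # generated-image features
S = np.random.rand(N_l, M_l)   # style-image features

G, A = gram(F), gram(S)
E_l = np.sum((G - A) ** 2) / (4 * N_l**2 * M_l**2)
print(E_l)
\end{verbatim}
\end{frame}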
\begin{frame}{Painting style transfer}
	The differential of each loss function are
    \[\frac{\partial \mathcal{L}_{\text{content}}}{\partial F_{ij}^{l}}=\begin{cases} (F^{l}-P^{l})_{ij} & \text{if}\ F_{ij}^{l} > 0\\ 0 & \text{if}\ F_{ij}^{l} < 0, \end{cases}\]
    \[\frac{\partial E_{l}}{\partial F_{ij}^{l}}=\begin{cases} \frac{1}{N_{l}^{2}M_{l}^{2}}((F^{l})^{\mathrm{T}}(G^{l}-A^{l}))_{ji}& \text{if}\ F_{ij}^{l} > 0\\ 0& \text{if}\ F_{ij}^{l} < 0. \end{cases}\]
    
    And the total loss is
    \[\mathcal{L}_{\text{total}}(\vec{p},\vec{a},\vec{x})=\alpha \mathcal{L}_{\text{content}}(\vec{p},\vec{x})+\beta \mathcal{L}_{\text{style}}(\vec{a},\vec{x})\]
    \begin{itemize}
    	\item $\alpha$ and $\beta$ are the weighting factors for the content and style losses.
    \end{itemize}    
\end{frame}
\begin{frame}
	\centering
    \includegraphics[scale = 0.125]{3_Structure.jpg}
    
    \[\vec{x} := \vec{x} - \lambda \frac{\partial \mathcal{L}_{total}}{\partial \vec{x}}\]
    \begin{itemize}
    	\item $\lambda$ is the learning rate.
        \item Initially, $\vec{x}$ is a white-noise image.
        \item \textbf{We are not learning the weights; we are learning $\vec{x}$!}
    \end{itemize}
    
\end{frame}
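\begin{frame}[fragile]{Painting style transfer}
	A toy sketch of the update rule only: the network is replaced by the identity (the ``features'' of $\vec{x}$ are $\vec{x}$ itself) so the gradients are available in closed form; all images and constants are placeholders for illustration.
\begin{verbatim}
import numpy as np

gram = lambda F: F @ F.T
p = np.random.rand(8, 8)       # content image (placeholder)
a = np.random.rand(8, 8)       # style image (placeholder)
x = np.random.rand(8, 8)       # start from white noise

alpha, beta, lam = 1.0, 1e-3, 0.1
for step in range(200):
    g_content = x - p                      # from L_content
    g_style = (gram(x) - gram(a)) @ x / x.size**2
    x -= lam * (alpha * g_content + beta * g_style)
print(np.abs(x - p).mean())    # x has moved toward p
\end{verbatim}
\end{frame}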
\subsection{Result}
\begin{frame}{Result}
	\centering
    \begin{columns}[onlytextwidth]
    	\begin{column}{0.5\textwidth}
        	\centering
    		\includegraphics[scale = 0.16]{starry_night.jpg}
            \centering
            \[+\]
            \centering
            \includegraphics[scale = 0.16]{in4.JPG}
        \end{column}
        \begin{column}{0.5\textwidth}
        	
            
        	\includegraphics[scale = 0.18]{2800.png}
        \end{column}
    \end{columns}
\end{frame}
\begin{frame}{Bonus}
    \begin{block}{}
    	Thank you!
    \end{block}
        \begin{columns}[onlytextwidth]
    		\begin{column}{0.5\textwidth}
        		\centering
    			\includegraphics[scale = 0.2]{in3.jpg}
        	\end{column}
        \begin{column}{0.5\textwidth}
        	\includegraphics[scale = 0.2]{2100.png}
        \end{column}
    \end{columns}
	
\end{frame}
\end{document}