Self-taught Learning
Author
Prateekshit Pandey
Last Updated
11 years ago
License
Other (as stated in the work)
Abstract
Based on the paper "Self-taught learning: Transfer Learning from Unlabeled Data"
\documentclass{beamer}
%
% Choose how your presentation looks.
%
% For more themes, color themes and font themes, see:
% http://deic.uab.es/~iblanes/beamer_gallery/index_by_theme.html
%
\usepackage{multirow}
\mode<presentation>
{
\usetheme{Darmstadt} % or try Darmstadt, Madrid, Warsaw, ...
\usecolortheme{default} % or try albatross, beaver, crane, ...
\usefonttheme{default} % or try serif, structurebold, ...
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{caption}[numbered]
}
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\title[]{Self-taught Learning}
\subtitle{Based on the paper "Self-taught learning: Transfer Learning from Unlabeled Data"}
\author{Prateekshit Pandey}
\institute{BTech CSE, 2nd year}
\date{January 3, 2014}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
% Uncomment these lines for an automatically generated outline.
% Uncomment them before submitting it
%\begin{frame}{Outline}
%\tableofcontents
%\end{frame}
\section{Introduction: Neural Networks}
\subsection{Neural Nets: Basics}
\begin{frame}{Basics}
\begin{itemize}
\item {The basic study of learning starts from the neuron (the McCulloch-Pitts neuron model).}
\item {A set of neurons operating on the same set of features forms a \textbf{layer of neurons} in a neural network.}
\begin{figure}
\includegraphics[scale=0.5]{image1.jpg}
\caption{\label{fig:your-figure}{A sample neural net}}
\end{figure}
\end{itemize}
%\vskip 1cm
%\begin{block}{Examples}
%Some examples of commonly used commands and features are included, to help you get started.
%\end{block}
\end{frame}
\subsection {Training of a neural net}
\begin{frame}{Feed forward and back-propagation}
\begin{itemize}
\item{Feed forward: compute the activations of each layer and pass them on to the next layer, thereby computing the output layer.}
\item{Back Propagation: Starting from the output layer, calculate the error for each layer.
\begin{itemize}
\item{$\delta^{(L)} = -(y-a^{(L)}).*f'(z^{(L)})$}
\item{$\delta^{(k)} = ((W^{(k)})^T\delta^{(k+1)}).*f'(z^{(k)})$ for $k = L-1, L-2, \ldots, 3, 2$}
\end{itemize}}
\item{The cost function for a neural network is given by a one-half squared-error function.
\begin{itemize}
\item{$J(W, b) = \frac{1}{m}\sum_{i=1}^m \frac{1}{2}||h_{W,b}(x^{(i)}) - y^{(i)}||^2$}
\end{itemize}}
\item{The gradient can be computed from the per-layer errors and can then be fed to any optimization algorithm, such as gradient descent or L-BFGS (a NumPy sketch follows on the next slide).
\begin{itemize}
\item{$\nabla_{W^{(l)}}J(W, b) = \delta^{(l+1)}(a^{(l)})^T$}
\item{$\nabla_{b^{(l)}}J(W, b) = \delta^{(l+1)}$}
\end{itemize}}
\end{itemize}
\end{frame}
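\begin{frame}[fragile]{Back-propagation: a NumPy sketch}
A minimal sketch of the equations above for a single training example. The three-layer network, the sigmoid activation and all variable names are illustrative assumptions, not taken from the paper.
\footnotesize
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    s = sigmoid(z)
    return s * (1.0 - s)

def backprop(x, y, W1, b1, W2, b2):
    # Feed forward: a1 -> a2 -> a3 (output layer, L = 3)
    a1 = x
    z2 = W1 @ a1 + b1; a2 = sigmoid(z2)
    z3 = W2 @ a2 + b2; a3 = sigmoid(z3)
    # delta^(L) = -(y - a^(L)) .* f'(z^(L))
    delta3 = -(y - a3) * sigmoid_prime(z3)
    # delta^(k) = ((W^(k))^T delta^(k+1)) .* f'(z^(k))
    delta2 = (W2.T @ delta3) * sigmoid_prime(z2)
    # grad_W^(l) = delta^(l+1) (a^(l))^T, grad_b^(l) = delta^(l+1)
    gW2, gb2 = np.outer(delta3, a2), delta3
    gW1, gb1 = np.outer(delta2, a1), delta2
    return gW1, gb1, gW2, gb2
\end{verbatim}
\end{frame}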
\section{Need for a novel method}
\subsection{Real world learning problems}
\begin{frame}{Real world problems}
\begin{itemize}
\item {Perform speaker identification, provided unlimited access to natural sounds}
\item {Perform classification of elephants and rhinos, provided unlimited access to natural images}
\item {Perform email foldering of ICML reviewing emails and NIPS reviewing emails, provided unlimited access to news articles (text).}
\item {Conclusion: in each of these tasks, labeled data is scarce while unlabeled data from a related (but not identically distributed) domain is abundant.}
\end{itemize}
\end{frame}
\begin {frame}{Problems faced}
\begin{itemize}
\item {\textbf{Labeled data}: difficult and expensive to obtain. Learning from a small labeled set may fail to generalize to a larger data set.}
\item {\textbf{Unlabeled data}: it is still expensive to find unlabeled data that carries the same class labels as the classification task.}
\item{\textbf{Motivation}: exploit the abundance of unlabeled data to generalize over a larger scale of data.}
\end{itemize}
\end {frame}
\section{Self Taught Learning}
\subsection{Motivation}
\begin {frame} {Previous algorithms and their shortcomings}
\begin {itemize}
\item{\textbf{Supervised learning}: works perfectly well if large amount of labeled data is provided, but fails to generalize well in case of scarcity of labeled data.}
\item {\textbf{Semi-supervised learning}: needs labeled as well as unlabeled data for learning; assumes that the unlabeled data can be labeled with the same labels as the classification task.}
\item {\textbf{Transfer learning}: typically requires transfer of knowledge from one supervised task to another, thus it requires additional labeled data.}
\item {\textbf{Idea}: transfer knowledge from unlabeled data.}
\end {itemize}
\end {frame}
\begin {frame} {Advantages and further motivations}
\begin {itemize}
\item {Use unlabeled data (from the same domain) without any restrictions.}
\item {More accurately reflects how humans may learn, since much of human learning is believed to be from unlabeled data.}
\end {itemize}
\end {frame}
\subsection {Problem Formalism}
\begin {frame}{Problem formalisation}
\begin {itemize}
\item {Number of classes: $C$}
\item {A set of $m$ labeled examples: $\{(x_l^{(1)}, y^{(1)}), (x_l^{(2)}, y^{(2)}), ... , (x_l^{(m)}, y^{(m)})\}$ where $x_l^{(i)} \in R^n$ and $y^{(i)} \in \{ 1, 2, ... , C\}$}
\item {A set of $k$ unlabeled examples: $\{x_u^{(1)}, x_u^{(2)}, ... , x_u^{(k)}\}$ where $x_u^{(i)} \in R^n$}
\item {The learning algorithm outputs a hypothesis $h : R^n \rightarrow \{ 1,2, ..., C\}$}
\item {The hypothesis function tries to mimic the input-output relationship represented by the labeled training data.}
\item {This hypothesis is tested under the same distribution from which the labeled data was drawn (a toy sketch of this setup follows on the next slide).}
\end {itemize}
\end {frame}
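\begin{frame}[fragile]{Problem formalism: a toy sketch}
A toy NumPy sketch of the data layout implied by the formalism. The sizes, the random data and the placeholder hypothesis are illustrative assumptions only.
\footnotesize
\begin{verbatim}
import numpy as np

n, C = 196, 26        # input dimension, number of classes (assumed)
m, k = 100, 10000     # labeled / unlabeled set sizes (assumed)

X_l = np.random.rand(m, n)                 # labeled inputs x_l^(i) in R^n
y   = np.random.randint(1, C + 1, size=m)  # labels y^(i) in {1, ..., C}
X_u = np.random.rand(k, n)                 # unlabeled inputs x_u^(i) in R^n

# A hypothesis is any function h : R^n -> {1, ..., C}; a placeholder:
h = lambda x: int(np.argmax(x[:C])) + 1
\end{verbatim}
\end{frame}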
\subsection{A sample approach}
\begin {frame} {Learning high level features - I}
\begin {itemize}
\item {We start by using the large amount of unlabeled data to learn a higher-level, more succinct representation of the inputs.}
\item {In case of images:
\begin {itemize}
\item {The inputs $x_u^{(i)}$ are vectors of pixel intensities of the images; the algorithm will try to learn 'basic elements' of the image.}
\item {These 'basic elements' capture strong correlations between rows of pixels; in this way the algorithm is able to learn \textit{edges}.}
\item{Thus, we will be able to represent an image in terms of its edges, rather than raw pixel intensities.}
\end {itemize}}
\item {By applying the learned representation to the labeled data, we obtain a higher-level representation of that data. This makes the task of supervised learning much easier.}
\end {itemize}
\end {frame}
\begin {frame} {Learning high level features - II}
\begin {itemize}
\item {Following a modified version of sparse coding by Olshausen \& Field (1996).}
\item {Optimization objective: $\min_{b,a} \sum_{i=1}^{k} ||x_u^{(i)} - \sum_j a_j^{(i)}b_j||_2^2 + \beta||a^{(i)}||_1$
\begin {itemize}
\item {Number of bases: $s$}
\item {Bases: $b = \{b_1, b_2, ... , b_s \}$; $b_j \in R^n$}
\item {Activations: $a = \{a^{(1)}, a^{(2)}, ... , a^{(k)}\}$; $a^{(i)} \in R^s$}
\item {The number of bases $s$ can be much larger than the input dimension $n$.}
\end {itemize}}
\item {The optimization objective balances two terms:
\begin {itemize}
\item {The first quadratic term pushes each $x^{(i)}_u$ to be reconstructed well as a weighted linear combination of the bases.}
\item {The second term encourages each activation vector to have a small $L_1$ norm, thus encouraging the activations to be \textbf{\textit{sparse}} (a short sketch evaluating this objective follows on the next slide).}
\end {itemize}}
\end {itemize}
\end {frame}
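\begin{frame}[fragile]{Sparse coding objective: a NumPy sketch}
A short sketch that simply evaluates the two terms of the objective for given bases and activations; the array shapes and names are illustrative assumptions.
\footnotesize
\begin{verbatim}
import numpy as np

def sparse_coding_objective(X_u, B, A, beta):
    # X_u: (k, n) unlabeled inputs
    # B:   (n, s) matrix whose columns are the bases b_j
    # A:   (k, s) activations, one row a^(i) per input
    reconstruction = A @ B.T                    # rows: sum_j a_j^(i) b_j
    quad = np.sum((X_u - reconstruction) ** 2)  # reconstruction term
    l1   = beta * np.sum(np.abs(A))             # sparsity (L1) term
    return quad + l1
\end{verbatim}
\end{frame}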
\begin {frame} {Unsupervised Feature Construction}
\begin {itemize}
\item {It is often quite easy to obtain large amounts of unlabeled data that shares several salient features with the classification task of interest.}
\item {After learning a set of bases $b$ from the unlabeled data, we compute the features $\hat{a}(x_l^{(i)})$ for the labeled data.}
\item {We do this by solving the following optimization problem: $\hat{a}(x_l^{(i)}) = \arg\min_{a^{(i)}} ||x_l^{(i)} - \sum_j a_j^{(i)}b_j||_2^2 + \beta ||a^{(i)}||_1$}
\item {Since this is an $L_1$-regularized optimization, we obtain a sparse representation of the labeled data (a solver sketch follows on the next slide).}
\end {itemize}
\end {frame}
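\begin{frame}[fragile]{Computing the features: a solver sketch}
One simple way to solve the per-example $L_1$-regularized problem is iterative soft-thresholding (ISTA). This is only a sketch of that idea, not the paper's own solver; the step size and iteration count are assumptions.
\footnotesize
\begin{verbatim}
import numpy as np

def soft_threshold(v, t):
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

def sparse_features(x, B, beta, n_iter=200):
    # x: (n,) input; B: (n, s) matrix whose columns are the bases b_j
    a = np.zeros(B.shape[1])
    eta = 1.0 / (2.0 * np.linalg.norm(B, 2) ** 2)  # 1 / Lipschitz constant
    for _ in range(n_iter):
        grad = 2.0 * B.T @ (B @ a - x)   # gradient of ||x - B a||_2^2
        a = soft_threshold(a - eta * grad, eta * beta)
    return a                             # sparse representation a_hat(x)
\end{verbatim}
\end{frame}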
\begin {frame} {Algorithm: Self-taught learning via Sparse Coding}
\textbf{input} Labeled training set\newline
$T = \{ (x_l^{(1)}, y^{(1)}), (x_l^{(2)}, y^{(2)}), ... , (x_l^{(m)}, y^{(m)})\}$\newline
Unlabeled data $\{ x_u^{(1)}, x_u^{(2)}, ... , x_u^{(k)}\}$\newline
\textbf{output} Learned classifier for the classification task.\newline
\textbf{algorithm} Using the unlabeled data $\{ x_u^{(i)}\}$, solve the sparse coding optimization problem to obtain bases $b$.
Compute features for the classification task to obtain a new labeled training set $\hat{T} = \{ (\hat{a}(x_l^{(i)}), y^{(i)})\}_{i=1}^m$, where\newline
$\hat{a}(x_l^{(i)}) = \arg\min_{a^{(i)}} ||x_l^{(i)} - \sum_j a_j^{(i)}b_j||_2^2 + \beta ||a^{(i)}||_1$\newline
Learn a classifier by applying a supervised learning algorithm (e.g. an SVM) to the labeled training set $\hat{T}$.\newline
\textbf{result} The learned classifier for the classification task (a library-based sketch of this pipeline follows on the next slide).
\end {frame}
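\begin{frame}[fragile]{Pipeline sketch with off-the-shelf tools}
A compact end-to-end sketch of the algorithm using scikit-learn's generic dictionary learner, sparse encoder and a linear SVM in place of the paper's own solvers; all hyper-parameters are illustrative assumptions.
\footnotesize
\begin{verbatim}
from sklearn.decomposition import DictionaryLearning, sparse_encode
from sklearn.svm import LinearSVC

def self_taught_classifier(X_u, X_l, y_l, n_bases=512, beta=1.0):
    # Step 1: learn bases b from the unlabeled data X_u (k x n)
    dico = DictionaryLearning(n_components=n_bases, alpha=beta,
                              max_iter=50)
    dico.fit(X_u)
    B = dico.components_                 # (n_bases, n)
    # Step 2: compute sparse features a_hat(x_l) for the labeled data
    A_l = sparse_encode(X_l, B, algorithm='lasso_lars', alpha=beta)
    # Step 3: train a standard supervised classifier on (a_hat(x_l), y)
    clf = LinearSVC().fit(A_l, y_l)
    return dico, clf
\end{verbatim}
\end{frame}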
\begin {frame}{Comparison with other methods}
\begin {itemize}
\item {Every self-taught learning algorithm must be able to detect some structure using the unlabeled data.}
\item {\textbf{Principal Component Analysis (PCA)}: identifies a lower dimensional subspace of maximal variation within the unlabeled data. The top $T \leq n$ principal components $b_1, b_2, ... , b_T$ are the solution to the optimization problem: $\min_{b, a} \sum_i ||x_u^{(i)} - \sum_ja_j^{(i)}b_j||_2^2$\newline such that $b_1, b_2, ... , b_T$ are orthogonal.}
\item {PCA seems convenient because it can be solved easily using standard numerical software. But, as compared to the sparse encoder, it has some limitations:
\begin {itemize}
\item {PCA results in a linear feature extraction; the features $a_j^{(i)}$ are simply a linear function of the input, whereas the sparse coding features $\hat{a}(x)$ are inherently non-linear (a brief PCA sketch follows on the next slide).}
\item {PCA assumes bases to be orthogonal, hence the number of PCA features cannot exceed n. This is not a limitation in sparse coding.}
\end {itemize}}
\end {itemize}
\end {frame}
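\begin{frame}[fragile]{PCA features: a brief sketch}
For contrast, PCA features are a purely linear function of the input. A brief scikit-learn sketch; the data and the number of components are illustrative assumptions.
\footnotesize
\begin{verbatim}
import numpy as np
from sklearn.decomposition import PCA

X_u = np.random.rand(10000, 196)   # stand-in unlabeled data
X_l = np.random.rand(100, 196)     # stand-in labeled inputs

pca = PCA(n_components=100)        # T <= n orthogonal components
pca.fit(X_u)                       # learned from the unlabeled data
A_pca = pca.transform(X_l)         # linear in x (up to mean subtraction)
\end{verbatim}
\end{frame}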
\section{Experiments}
\subsection{Experiment data}
\begin {frame} {Experiments: procedure followed}
\begin {itemize}
\item {For computational reasons, the unlabeled data was preprocessed by applying PCA to reduce its dimensionality.}
\item {The sparse coding based algorithm was then applied in the resulting principal component space.}
\item {The learned bases were then used to construct features for each input in the supervised classification task.}
\end {itemize}
\end {frame}
\begin {frame} {Experiment data - I}
\begin{table}
\centering
\begin{tabular}{| p{2cm} | p{2cm} | p{2cm}| p{0.7cm} | p{2.5cm} |}
\hline
Domain & Unlabeled data & Labeled data & Classes & Raw features \\\hline
Image Classification & 10 images of outdoor scenes & Caltech101 image classification dataset & 101 & Intensities in 14x14 pixel patch\\
Handwritten character recognition & Handwritten digits (0-9) & Handwritten English characters (a-z) & 26 & Intensities in 28x28 character/digit image\\
\hline
\end{tabular}
\caption{\label{tab:data1}Details of self-taught learning applications evaluated in the experiments conducted by Raina et al. (2007)}
\end{table}
\end {frame}
\begin {frame} {Experiment data - II}
\begin{table}
\centering
\begin{tabular}{| p{2cm} | p{2cm} | p{2cm}| p{0.7cm} | p{2.5cm} |}
\hline
Domain & Unlabeled data & Labeled data & Classes & Raw features \\\hline
Font character recognition & Handwritten English characters (a-z) & Font characters (a/A - z/Z) & 26 & Intensities in 28x28 character image\\
Song genre classification & Song snippets from 10 genres & Song snippets from 7 diff. genres & 7 & Log-frequency spectrogram over 50ms time windows\\
\hline
\end{tabular}
\caption{\label{tab:data2}Details of self-taught learning applications evaluated in the experiments conducted by Raina et al. (2007)}
\end{table}
\end {frame}
\begin {frame} {Experiment data - III}
\begin{table}
\centering
\begin{tabular}{| p{2cm} | p{2cm} | p{2cm}| p{0.7cm} | p{2.5cm} |}
\hline
Domain & Unlabeled data & Labeled data & Classes & Raw features \\\hline
Webpage classification & 100,000 news articles (Reuters newswire) & Categorized webpages (from DMOZ hierarchy) & 2 & Bag-of-words with 500 words\\
UseNet article classification & 100,000 news articles (Reuters newswire) & Categorized UseNet posts (from SRAA dataset) & 2 & Bag-of-words with 377 words\\
\hline
\end{tabular}
\caption{\label{tab:data3}Details of self-taught learning applications evaluated in the experiments conducted by Raina et al. (2007)}
\end{table}
\end {frame}
\begin {frame} {Results: Accuracy on the self-taught learning tasks}
\begin{table}
\centering
\begin {tabular}{| p{1.7cm} | p{1.7cm} | p{1.7cm}| p{1.7cm}| p{1.7cm}|}
\hline
Domain & Training set size & Unlabeled SC & Labeled PCA & Labeled SC\\
\hline
\multirow{3}{*}{Handwritten} & 100 & 39.7\% & 36.2\% & 31.4\% \\
& 500 & 58.5\% & 50.4\% & 50.8\% \\
& 5000 & 73.1\% & 73.5\% & 73.0\% \\ \hline
\multirow{3}{*}{Font char} & 100 & 7.0\% & 5.2\% & 5.1\% \\
& 500 & 16.6\% & 11.7\% & 14.7\% \\
& 1000 & 23.2\% & 19.0\% & 22.3\% \\ \hline
\multirow{3}{*}{Webpages} & 4 & 64.3\% & 55.9\% & 53.6\% \\
& 10 & 75.9\% & 57.0\% & 54.8\% \\
& 20 & 80.4\% & 62.9\% & 60.5\% \\ \hline
\multirow{2}{*} {UseNet} & 4 & 63.8\% & 60.5\% & 50.9\% \\
& 10 & 68.7\% & 67.9\% & 60.8\% \\ \hline
\end{tabular}
\caption{\label{tab:results}Accuracy on the self-taught learning tasks observed in the experiments conducted by Raina et al. (2007)}
\end{table}
\end {frame}
\section{Autoencoders}
\subsection{Introduction}
\begin {frame}{Autoencoders: introduction}
{ An \textbf{autoencoder} neural network is an unsupervised learning algorithm that applies backpropagation, setting the target values to be equal to the inputs.}
\begin{figure}
\includegraphics[scale=0.4]{autoencoder.jpg}
\caption{\label{fig:your-figure}{Autoencoder}}
\end{figure}
\end {frame}
\begin {frame}{Autoencoders: introduction}
\begin {itemize}
\item {The autoencoder tries to learn a function $h_{W,b}(x) \approx x$. In other words, it is trying to learn an approximation to the identity function, so as to output $\hat{x}$ that is similar to $x$.}
\item {By placing constraints on the network, such as limiting the number of hidden units, we can discover interesting structure in the data; a minimal forward-pass sketch follows on the next slide.
\begin {itemize}
\item {If the number of units in the middle layer is smaller than the number of inputs, the network learns a compressed representation of the input.}
\item {If the number of units in the middle layer is larger than the number of inputs, then together with a sparsity constraint it can learn a sparse representation of the input.}
\end {itemize}}
\end{itemize}
\end {frame}
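\begin{frame}[fragile]{Autoencoder forward pass: a NumPy sketch}
A minimal sketch of an autoencoder forward pass and its reconstruction error; the sigmoid non-linearity and all names are illustrative assumptions.
\footnotesize
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def autoencoder_forward(x, W1, b1, W2, b2):
    a2 = sigmoid(W1 @ x + b1)      # hidden ("encoding") layer
    x_hat = sigmoid(W2 @ a2 + b2)  # reconstruction, trained so x_hat ~ x
    loss = 0.5 * np.sum((x_hat - x) ** 2)
    return a2, x_hat, loss
\end{verbatim}
\end{frame}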
\subsection{Algorithm}
\begin {frame}{How it works}
\begin {itemize}
\item {By backpropagation, the error for the middle layer, or encoding layer, will be given by: $\delta^{(2)}_i = \left( \sum_{j=1}^{s_{2}} W^{(2)}_{ji} \delta^{(3)}_j \right) f'(z^{(2)}_i)$}
\item {We introduce the quantity $\hat\rho_j = \frac{1}{m} \sum_{i=1}^m \left[ a^{(2)}_j(x^{(i)})\right]$ and (approximately) enforce $\hat\rho_j = \rho$, where $\rho$ is the sparsity parameter and is kept close to zero (e.g. 0.05). This constraint keeps the activations near zero and hence sparse.}
\item {Because of the sparsity constraint, a penalty term is added to the delta calculation for the encoding layer: $\delta^{(2)}_i =
\left( \left( \sum_{j=1}^{s_{2}} W^{(2)}_{ji} \delta^{(3)}_j \right)
+ \beta \left( - \frac{\rho}{\hat\rho_i} + \frac{1-\rho}{1-\hat\rho_i} \right) \right) f'(z^{(2)}_i)$ (a NumPy sketch of this computation follows on the next slide)}
\end {itemize}
\end {frame}
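\begin{frame}[fragile]{Sparsity penalty: a NumPy sketch}
A sketch of the modified delta for the encoding layer, following the formula on the previous slide. The value of $\beta$ and all variable names are assumptions; $\hat\rho$ is assumed to have been averaged over the training batch beforehand.
\footnotesize
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sparse_encoding_delta(W2, delta3, z2, rho_hat, rho=0.05, beta=3.0):
    # delta3: output-layer error; z2: pre-activations of the encoding
    # layer; rho_hat: mean activation of each hidden unit over the batch
    penalty = beta * (-rho / rho_hat + (1.0 - rho) / (1.0 - rho_hat))
    s = sigmoid(z2)
    f_prime = s * (1.0 - s)
    # delta^(2) = ((W^(2))^T delta^(3) + penalty) .* f'(z^(2))
    return (W2.T @ delta3 + penalty) * f_prime
\end{verbatim}
\end{frame}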
\end{document}