\documentclass[11pt]{article}
\input{lecture}
\usepackage{alg}
\begin{document}
\lecture{15}{10/27/2011}{Expanders}{Li-Hsiang Kuo}
%\draft % put this here only if you want to indicate this is a draft
In the last lecture we introduced randomized computation in terms of
machines that have access to a source of random bits and that return
correct answers somewhat more than half of the time. We showed that
$\BPP$ has polynomial-size circuits and the conjecture in the community
is that $\BPP = \P$.
Today we'll introduce expanders, a type of graph that is useful in
improving (amplifying) randomized algorithms with little or no
additional random bit overhead. First we'll prove a theorem
relating $\BPP$ to the polynomial time hierarchy.
\section{BPP and the Polynomial Time Hierarchy}
Although we don't know if $\BPP = \P$, or even if $\BPP \subseteq
\NP$, we do know that $\BPP \subseteq \PH$:
\begin{theorem}
$\BPP \subseteq \Sigma^{p}_{2} \cap \Pi^{p}_{2}$
\label{thm:BPP-PH}
\end{theorem}
\begin{proof}
Fix $M$ a randomized polytime machine that accepts $L \in \BPP$, and
let $r$ be the number of random bits $M$ uses when running on an
input $x$ of size $n = |x|$. We will now show that $\BPP \subseteq
\Sigma^p_2$. Since $\BPP = co\BPP$ it follows that $\BPP
\subseteq co\Sigma^p_2 = \Pi^p_2$, completing the proof.
We need to remove randomness from the computation. For a given
input $x$, the space $\{0,1\}^r$ of $r$-bit strings gets partitioned
into two pieces: $Acc(x)$, the set of random strings on which $M$
accepts $x$, and $Rej(x)$, the set of strings on which $M$ rejects
$x$. If the error rate $\varepsilon$ of $M$ is small, then $Acc(x)$
will be much larger than $Rej(x)$ when $x \in L$, and $Acc(x)$ will
be much smaller than $Rej(x)$ when $x \notin L$.
See Figure \ref{fig:acceptReject}. If our random computation has
a small error rate then it will be correct on most random bit
strings. We'll turn ``most'' into ``all.'' The idea is that when $x
\in L$ a few ``shifts'' of $Acc(x)$ will cover the whole space
$\{0,1\}^r$ of $r$-bit random sequences, while the same few shifts
of $Acc(x)$ will fail to cover the whole space when $x \notin L$.
\begin{figure}[htbp]
\centering
$x \in L$
\input{rej-acc-ellipse.pstex_t}
\quad
\input{acc-rej-ellipse.pstex_t}
$x \notin L$
\caption{If the error rate $\varepsilon$ of $M$ is small, then
$Acc(x)$ will be much larger than $Rej(x)$ when $x \in L$, and
$Acc(x)$ will be much smaller than $Rej(x)$ when $x \not\in
L$. }
\label{fig:acceptReject}
\end{figure}
If $S \subseteq \{0,1\}^r$ and $\sigma \in \{0,1\}^r$, then $S \oplus
\sigma = \{ s \oplus \sigma \mid s \in S \}$, the \emph{shift} of $S$ by
$\sigma$. Since shifting is invertible we see that $|S \oplus \sigma| = |S|$.
We will use shifts to give a $\Sigma_2$-predicate using the
intuition discussed above. Namely, consider
\begin{equation}
\label{eqn:AccXShiftCoversWholeSpace}
x \in L \iff \exists \sigma_1, \dots, \sigma_t \forall \rho \in
\{0,1\}^r [ \rho \in \bigcup_{i=1}^t Acc(x) \oplus \sigma_i] .
\end{equation}
Now, $r$ is poly in $n$, and so if we can pick $t$ poly in $n$ as
well, then the above will be a $\Sigma^p_2$-predicate, provided that
we can verify the membership of $\rho$ in $\bigcup_{i=1}^t Acc(x)
\oplus \sigma_i$ in time poly in $n$. The membership check is no
problem since we can equivalently check that $\rho \oplus \sigma_i
\in Acc(x)$ for some $i$, which is polytime since it corresponds to
running $M$ on $x$ with the random bit string $\rho \oplus
\sigma_i$, for poly-many $\sigma_i$.
We'll show we can pick $t$ a suitable poly to make
\eqref{eqn:AccXShiftCoversWholeSpace} true by showing that we can
choose the $\sigma_i$s randomly with a high rate of success. There
are two cases to consider:
\begin{enumerate}
\item $x \in L$: For $A \subseteq \{0,1\}^r$, define
\[
\mu(A) = \frac{|A|}{2^r}.
\]
For $\rho$ fixed,
\[
\Pr_{\sigma_1,\dots,\sigma_t}[ \rho \notin
\bigcup_{i=1}^t Acc(x) \oplus \sigma_i] \le \mu(Rej(x))^t
\le \varepsilon^t,
\]
where $\mu(Rej(x))$ is the probability that $M$ rejects when it
should accept, and is hence not larger than $\varepsilon$. So, by
a union bound,
\[
\Pr[ \{0,1\}^r \not\subseteq \bigcup_{i=1}^t Acc(x) \oplus
\sigma_i] \le |\{0,1\}^r|\varepsilon^t = 2^r\varepsilon^t ,
\]
and hence if $2^r\varepsilon^t < 1$ then some choice of the
$\sigma_i$s must work.
\item $x \notin L$: We need to be sure that, for any choice of the $\sigma_i$s,
  some $\rho$ remains on which none of the shifts makes $M$ accept $x$. But
\[
\mu(\bigcup_{i=1}^t Acc(x) \oplus \sigma_i) \le \sum_{i=1}^t \mu(Acc(x)
\oplus \sigma_i) = t\mu(Acc(x)) \le t\varepsilon,
\]
since $\mu(Acc(x)) \le \varepsilon$ when $x \not\in L$, and hence
we need $t\varepsilon < 1$.
\end{enumerate}
So, we need to choose $t$ so that both $t\varepsilon$ and
$2^r\varepsilon^t$ are less than $1$. Then $t = r$ works, provided
that $\varepsilon < \frac{1}{r}$. From the definition of $\BPP$ we
only know that $\varepsilon < \frac{1}{3}$, but, using the
``majority vote'' trick introduced during the last lecture, we can
in $k$ runs reduce the error to $e^{-2k\delta^2}$, where $\delta =
\frac{1}{2} - \varepsilon > 0$. Since $k$ runs will need $kr$
random bits, we see that it suffices to choose $k$ poly and large
enough that $e^{-2k\delta^2} < \frac{1}{rk}$. In fact, $k = O(\log
r)$ works, and so an examination of the above shows that the time
complexity is $O(T^2\log T)$ if the original run-time was $T$.
\end{proof}
\section{Expanders}
The proof of Theorem \ref{thm:BPP-PH} used the ``majority vote'' trick
to get an exponential in $k$ increase in accuracy using a linear in $k$
increase in random bit usage. Expander graphs, which are introduced
here, lead to an ``amplification'' technique that gives an exponential
in $k$ accuracy improvement using only a constant increase in random
bit usage ($rk$ versus $r + k$). Expanders have many other uses, some
of which we mention in this lecture, and others we may see in later lectures.
\subsection{Definitions}
Here we introduce two definitions of expander graphs, namely the combinatorial
definition and the algebraic definition.
\begin{definition} [Combinatorial definition]
A graph $G = (V,E)$ is a \emph{$(k,c)$-expander}
if every $S \subseteq V$ with $|S| \le k$
satisfies $|\Gamma(S)| \ge c|S|$, where
$\Gamma(S) = \{ v \in V \mid \exists s \in S, (s,v) \in E \}$ is the
\emph{neighborhood} of $S$ in $G$.
\end{definition}
Notice that it is easy to construct a graph which is $(k,c)$-expanding
for all $k$ with $c \le 1$. Since $\Gamma(V) = V$ one can see it is
important not to let $|S|$ be too large. In other words, for
$k \ge |V|$ and $c > 1$, no graph is $(k,c)$-expanding. A typical setting
is that $c$ is a constant greater than $1$ and $k = \frac{N}{2}$, where $N = |V|$.
\begin{definition} [Algebraic definition]
Given a regular $G = (V,E)$ with degree $d$, its $N \times N$
\emph{normalized adjacency matrix} $A$ is defined by
\[
A_{ij} =
\begin{cases}
\frac{1}{d}, & (i,j) \in E\\
0, & \text{otherwise}
\end{cases}
\]
\end{definition}
The normalized adjacency matrix describes the Markov chain of a random
walk on $G$:
\[
A_{ij} = \Pr[\text{go to state $i$ from state $j$}],
\]
and if $p$ is a column vector with $p_i$ the probability of being at
vertex $i$, then $(Ap)_i$ gives the probability of being at vertex $i$
after one step of the random walk on $G$:
\[
(Ap)_i = \sum_{j=1}^N A_{ij}p_j .
\]
\subsection{Properties of the normalized adjacency matrix of a regular graph}
The normalized adjacency matrix has a number of properties that can be
proved using basic linear algebra. We will use the following properties, but
we do not prove them here.
\begin{proposition}
Let $A$ be a normalized adjacency matrix of a regular graph $G$ on $N$ vertices.
\begin{enumerate}
\item $A$ is real and symmetric: all eigenvalues of $A$ are real, and there exists
a full orthonormal basis of eigenvectors.
\item All eigenvalues $\lambda$ of $A$ satisfy $|\lambda| \le 1$.
\item 1 is an eigenvalue of $A$, with corresponding eigenvector
$u = (\frac{1}{N},\frac{1}{N},\dots,\frac{1}{N})$.
The multiplicity of 1 as an eigenvalue equals the number of components of $G$.
\item -1 is an eigenvalue of $A$ iff $G$ is bipartite.
\end{enumerate}
\label{prop:normalized-adjacency-matrix}
\end{proposition}
As the uniform distribution is always an eigenvector with eigenvalue 1,
we look only at the remaining eigenvector/eigenvalues. It turns out that the
second largest eigenvalue in absolute value is often useful to work with.
\begin{definition}
$\lambda(G) = \max\{ |\lambda| \mid \lambda$ is an eigenvalue of $G$
with eigenvector $e_\lambda$ orthogonal to $u\}$.
\end{definition}
From Proposition \ref{prop:normalized-adjacency-matrix} we have $\lambda(G) \le 1$, and
\[
\lambda(G) < 1 \iff G \text{ is connected and not bipartite}.
\]
So, for expanders, the uniform distribution $u$ is the only fixed
point. The following theorem tells us that the larger the
difference between 1 and $\lambda(G)$ the faster an arbitrary probability
distribution converges to the uniform distribution via a random walk on $G$.
\begin{theorem}
For any probability distribution vector $p$,
\[
\| A^t p - u \|_1 \le \sqrt{N}\lambda^t,
\]
where $\| v \|_q = \big[\sum_i |v_i|^q \big] ^{1/q}$ is the $q$-norm
of $v$ and $\lambda = \lambda(G)$.
\label{thm:spectralConvergenceRelationship}
\end{theorem}
One way to understand this theorem is: given an initial distribution for a
random walk $p$, after $t$ steps of the random walk $A^tp$, we will be
exponentially closer to the uniform distribution $u$, provided
$\lambda < 1$, that is $G$ is connected and not bipartite.
\begin{proof}
Since $Au = u$ we have $A^t p - u = A^t (p - u)$. Now, $(p - u)
\bot u$ because
\[
\langle p - u, u \rangle = \langle p, u \rangle - \langle u, u \rangle
= \sum_{i=1}^N p_i/N - \sum_{i=1}^N 1/N^2 = 1/N - 1/N = 0,
\]
which follows since $p$ is a probability distribution.
Write $p - u = \sum_i a_i e_i$ for some real $a_i$,
where the $\{e_i\}$ form an
orthogonal basis of eigenvectors for $A$ with $A e_i = \lambda_i e_i$. Then,
since the $u$-component of $p - u$ is $0$, we have the following inequality
\begin{align*}
\| A^t (p - u) \|^2_2
&= \left\|A^t \sum_i a_i e_i \right\|^2_2
= \left\| \sum_i \lambda^t_i a_i e_i \right\|^2_2
= \sum_i \left\| \lambda^t_i a_i e_i \right\|^2_2
= \sum_i |\lambda_i|^{2t} \left\| a_i e_i \right\|^2_2 \\
&\le \sum_i \lambda^{2t} \left\| a_i e_i \right\|^2_2
= \lambda^{2t} \sum_i \left\| a_i e_i \right\|^2_2
= \lambda^{2t} \| p - u \|^2_2 .
\end{align*}
Now
\[
\| p - u \|^2_2 + \| u \|^2_2 = \| p \|^2_2 \le \| p \|^2_1 = 1,
\]
where $(p - u) \bot u$ implies the first equality, and
$\| v \|^2_2 \le \| v \|^2_1$ for all vectors $v$ implies the inequality.
By the Cauchy--Schwarz inequality, we have
\begin{align*}
\|A^t (p - u)\|_1 &= | \langle
((-1)^{\sigma_1},\dots,(-1)^{\sigma_N}),
A^t (p - u) \rangle |\\
&\le \|((-1)^{\sigma_1},\dots,(-1)^{\sigma_N})\|_2 \| A^t (p - u) \|_2\\
&= \sqrt{N} \|A^t (p - u) \|_2 ,
\end{align*}
where $\sigma_k =
\begin{cases}
0, & (A^t (p - u))_k \ge 0\\
1, & \text{otherwise}
\end{cases}$. Combining all of the above we get
\begin{align*}
\|A^t (p - u) \|_1 &\le \sqrt{N} \|A^t (p - u) \|_2\\
&\le \sqrt{N} \lambda^t \| (p - u) \|_2\\
&\le \sqrt{N} \lambda^t,
\end{align*}
completing the proof.
\end{proof}
This can be used to prove that the random walk algorithm for the
undirected path problem needs only polynomially many steps, i.e. that
PATH $\in\BPL$. The proof uses Theorem \ref{thm:spectralConvergenceRelationship}
and the following exercise.
\begin{exercise}
If $G$ is connected and not bipartite then $\lambda(G) < 1 - \frac{1}{dN^2}$.
\end{exercise}
\section{Next Time}
In the next lecture we will prove the expander mixing lemma.
\begin{lemma}[Expander Mixing Lemma]
For every pair of subsets $S,T \subseteq V$,
\[
\left|\frac{|E(S,T)|}{dN} -
\mu(S)\mu(T)\right| \le \lambda \sqrt{\mu(S)(1-\mu(S))\mu(T)(1-\mu(T))}.
\]
\end{lemma}
This lemma can be used in the following applications:
\begin{enumerate}
\item
\emph{Deterministic error reduction}: Given a randomized algorithm $R$ that uses $r$
random bits we reduce the error to be less than some arbitrary
$\varepsilon$, without using any additional random bits. This
requires running $R$ $\poly(1/\varepsilon)$ times.
\item
\emph{Randomness efficient error reduction}: With $R$ and $r$ as
above we reduce the error to be less than some arbitrary
$\varepsilon$ using $r + O(\log(1/\varepsilon))$ additional random
bits and running $R$ $O(\log(1/\varepsilon))$ times.
\end{enumerate}
We will discuss these applications and prove their correctness in the next lecture.
\section*{Acknowledgements}
In writing the notes for this lecture, I perused the notes by Beth Skubak
and Amanda Hittson for lecture 12 and 13 from the Spring 2010 offering
of CS~710 and the notes by Nathan Collins for lecture 12 from the Spring
2007 offering of CS~810.
\end{document}