\documentclass[11pt]{article}
\usepackage[usenames,dvipsnames]{pstricks}
\usepackage{epsfig}
\usepackage{alg}
\include{lecture}
\newcommand{\set}[1]{\left\{#1\right\}}
\newcommand{\warn}[1]{{\color{red} #1}}
\begin{document}
\lecture{22}{11/22/2011}{Randomness Extraction}{Prathmesh Prabhu}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%Introduction%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In the previous lecture, we saw a pseudorandom generator that fools time-bounded
computations under the plausible hypothesis that there exists a language in $\E$
requiring linear exponential size circuits. We left off by proving the result
under a somewhat stronger hypothesis that there is a language in $\E$ with high
average-case hardness. In the first part of this lecture, we wrap up this
discussion by showing that worst-case hardness at length $m$, $C_L(m)$, can be
substituted for average-case hardness, $H_L(m)$. Towards this, we make use of
Error Correcting Codes described in previous lectures.
In the latter part of this lecture, we turn to another major pursuit in the
study of randomness and pseudorandomness. Most randomized algorithms assume
access to a perfect source of unbiased and, more importantly, uncorrelated
(independent) random bits. But in practice, we must run these algorithms given
access to only an imperfect random source. The goal of randomness extraction is
to take samples from a weak random source --- one where samples may not be
uniformly distributed, but do have some inherent randomness --- and generate
samples that are close to being uniformly distributed. Such weak random sources
will be our models for physical sources of randomness, such as keystrokes or
delays over networks.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%New Section%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Worst-Case to Average-Case reduction}
In the previous lecture, we enumerated some properties needed in an ECC so that
it can be employed for this reduction. Further, we hinted at a construction of a
local list decoder with these properties by concatenating the Hadamard decoder
and the decoder for Reed-Muller code. The following theorem, stated without
proof, asserts the existence of the required code.
\begin{theorem}
Given length of the information word $K$, and $\epsilon >0$, we can
construct an error correcting code $E\colon \set{0,1}^K \to \set{0,1}^N$ such that
\begin{enumerate}
\item
$E$ is computable in time $\poly(K,\frac{1}{\epsilon})$.
\item
\underbar{Local list decodability:} There exists a randomized oracle
Turing Machine $M$ such that $M^r$ runs in time
$\poly(\log(K),\frac{1}{\epsilon})$ and outputs a list of randomized
oracle TMs $D_1, D_2, \ldots$, such that for all received
words $r$ and information words $x$, if $r$ agrees with the encoding of
$x$ in at least $\frac{1}{2} + \epsilon$ positions, then there is some
machine $D_i$ that computes $x$. Formally,
\[
\forall x \in \set{0,1}^K \quad \left[\Delta(E(x),r) \le \frac{1}{2} -
\epsilon\right] \Rightarrow \exists i \quad \forall j\in\set{1,\ldots,K} \quad
D_i^r(j) = x_j
\]
where $\Delta$ is the relative Hamming distance between the two words.
Each of the machines $D_i$ also runs in time
$\poly(\log(K),\frac{1}{\epsilon})$.
\end{enumerate}
\label{22:thm:goodecc}
\end{theorem}
This Error Correcting Code can now be used to obtain worst-case to average-case
reductions.
\begin{theorem}
For every $L\in\E$ there exists a language $L'\in\E$ such that
\[H_{L'}(m)\geq\frac{C_L(m)^{\Omega(1)}}{m^{O(1)}}.\]
\label{22:thm:red}
\end{theorem}
\begin{proof}
Let $L$ be a language in $\E$. Let $\chi_{L|_m}$ be the characteristic
function of $L$. We construct an ECC guaranteed by Theorem
\ref{22:thm:goodecc} with the following parameters: $K = 2^{m}$; $\epsilon =
\frac{1}{H_L(m)}$; and $N = 2^{m'}$ where $m' = O(m)$. We
denote the obtained string of length $N$ by $\chi_{L|_{m'}}$.
We can solve $L'$ in $\E$ by taking an input of length $m'$, computing
the corresponding length-$m$ characteristic string $\chi_{L|_m}$ (in time
$2^{O(m)}$ by deciding $L$ on every input of length $m$), encoding it to
obtain $\chi_{L|_{m'}}$, and outputting the bit of $\chi_{L|_{m'}}$ indexed
by the input.
% NOTE(review): the source file was garbled/truncated at this point. The
% remainder of this proof, the section introducing weak random sources and
% min-entropy $H_\infty$, and the beginning of the itemized list of example
% sources (including the description of bit fixing sources illustrated in the
% figure below) were lost. The surrounding structure is reconstructed
% minimally so the document compiles; the missing prose should be restored
% from the original notes.
\end{proof}
\section{Weak Random Sources}
\begin{itemize}
\item \emph{Bit fixing sources.} In these sources an adversary fixes the
values of some of the output bits, while the remaining bits are uniformly
distributed and independent.
\begin{figure}[htbp]
\centering
\scalebox{1}
{
\begin{pspicture}(0,-1.6)(5.0,1.6)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(1.78,1.1907812)(0.24,0.49078125)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(1.92,1.1907812)(1.14,0.49078125)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(2.14,1.1707813)(2.2,0.49078125)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(2.5,1.1707813)(3.04,0.47078124)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(3.08,1.2107812)(3.72,0.45078126)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(3.64,1.2107812)(4.04,0.45078126)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(4.2,1.1707813)(4.44,0.47078124)
\usefont{T1}{ptm}{m}{n}
\rput(2.8423438,1.3607812){Uniformly distributed}
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(1.26,-1.1892188)(0.68,-0.24921875)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(1.72,-1.1292187)(1.9,-0.22921875)
\psline[linewidth=0.04cm,arrowsize=0.05291667cm 2.0,arrowlength=1.4,arrowinset=0.4]{->}(1.94,-1.1092187)(3.36,-0.28921875)
\usefont{T1}{ptm}{m}{n}
\rput(1.5214063,-1.3392187){set by adversary}
\end{pspicture}
}
\caption{A bit fixing source}
\end{figure}
\item \emph{Unpredictable or ``Somewhat random'' sources.}
These are sources where each bit is
unpredictable given previous bits, namely for each $i$,
$\Pr[X_{i+1}=0 | X_0,X_1,...,X_i]\in[1/2-\delta,1/2+\delta]$ for some
$\delta<1/2$. That is, given any prefix of bits, the probability that
the next bit is 0, say, is not too strongly biased.
For $\delta\leq 1/2-1/r^{1-\epsilon}$ with any constant $\epsilon<1$, this
gives a weak random source.
The min-entropy of this source over $r$ bits satisfies $H_{\infty} \geq
\log\left[\frac{1}{\left(1/2+\delta\right)^r}\right] =
r\log\frac{1}{1/2+\delta}$.
Therefore, the min-entropy for this source is also linear in $r$ for
constant $\delta < 1/2$.
\item \emph{Flat sources.} These sources are a generalization of Bit fixing
sources. The output is a uniform distribution over some subset $S$ of the
range $\{0,1\}^n$. It follows that $H_\infty(X)=\log|S|$, and such a
source is a weak random source if $|S|=2^{n^{\Omega(1)}}$. These flat
sources are generic sources to look at when we want to model a weak random
source with a given min-entropy $k$. This is the case because all weak sources
of the given min-entropy can be obtained as a convex combination of flat
sources. \begin{exercise}
Let $X$ be a random source with $H_\infty(X)\geq k$. Show that $X$ is a
convex combination of flat sources each on a subset of size at least $2^k$.
\end{exercise}
\end{itemize}
\subsection{Randomness Extractors}
Our goal is to construct extractors that can extract the maximum amount of
randomness from a given source with min entropy at least $k$. Although the distribution
of the extracted strings can not be expected to be perfectly uniform,
we want it to be statistically close to uniform. Also, we want to
extract a large number of random bits from the source string. The maximum
number of bits that can be extracted equals the min-entropy, because that
captures the total amount of randomness in the source sequence. Finally, we
would hope that the extractor is a deterministic procedure that outputs the
nearly uniformly distributed strings given input from the weak source. This last
requirement is too strong to be feasible. Indeed it is impossible to extract
even a single truly uniform bit deterministically.
\begin{proposition}
Let $E:\{0,1\}^n\rightarrow\{0,1\}^m$ be a function taking input from a
weak random source. There is a weak random source $X$ with
$H_\infty(X)=n-1$ so that when $m=1$, $E(X)$ is a constant function.
\end{proposition}
\begin{proof}
In this setting, $E$ outputs a single bit and so must output either 0 or 1
with probability $\geq 1/2$; suppose 0.
Define $X$ to be the flat distribution on $S=\{x | E(x)=0\}$. Then $X$ has
min-entropy at least $n-1$, yet $\Pr[E(X)=0]=1$ meaning that the
output distribution of $E$ is a constant.
\end{proof}
We think of $E$ in the above as taking a weak random source as input and
attempting to output bits that are close to uniform. The proposition shows
this cannot be done so simply. We therefore
augment $E$ with an additional input that comes from a perfect random source.
\begin{definition}[Extractor]
The function $E:\{0,1\}^n\times \{0,1\}^\ell\rightarrow\{0,1\}^m$ is a
$(k,\epsilon)$ extractor with seed length $\ell$ if for
all $X$ on $\{0,1\}^n$ with $H_\infty(X)\geq k$,
\begin{equation}
\left\| E(X,U_\ell)-U_m\right\|_1 < 2\epsilon
\label{eqn:22:extractor}
\end{equation}
where $U_\ell$ is a uniform variable on $\ell$ bits and $U_m$ is uniform
on $m$ bits.
\label{def:22:extractor}
\end{definition}
Note that (\ref{eqn:22:extractor}) is equivalent to the following: for every
event $A\subseteq\{0,1\}^m$,
\begin{equation}
|\Pr[E(X,U_\ell)\in A] - \Pr[U_m\in A]| < \epsilon.
\label{eqn:22:extractor2}
\end{equation}
Note that this definition of an extractor places a very strong requirement that
the extractor algorithm, which is independent of the weak source of randomness
under consideration, should work for all sources with the stated min-entropy.
Although we need some additional true randomness to define the extractor, we
often will be in the setting where $\ell=O(\log n)$, meaning the amount of true
randomness needed is very small. In fact, we will see uses of extractors in the
next section that eliminate the need for any true randomness in an algorithm by
cycling over all of the possible additional random strings in $\{0,1\}^\ell$
that could be input to the extractor.
There are two parameters of Definition \ref{def:22:extractor} that we
wish to optimize: we want to use as few true random bits as possible, and
we want to have as many near-uniform bits as output. That is, we want to
minimize $\ell$ and maximize $m$. The limits within which we can do this are
described by the following bounds, which we state without proof.
\begin{itemize}
\item $\ell\geq \log n + 2\log\frac{1}{\epsilon} - O(1)$
\item $m\leq k + \ell-2\log\frac{1}{\epsilon} + O(1)$
\end{itemize}
The intuition behind the first bound is that we need at least enough perfect
random bits to specify an index into the weak random source. The lower bound on
$\ell$ once again implies that it is not possible to have a deterministic extractor
with the given properties. Indeed, the proof of this bound is a generalization
of the construction we saw earlier as an argument for the need of a seed. The
second bound intuitively means that we can
extract out at most as many random bits as contained in the combination of the
weak source and perfect random source; And as $\epsilon$ decreases, the output
distribution gets statistically closer to a uniform source, at the cost of a
reduction in the number of bits extracted.
It can be shown that picking a function
at random with $\ell=\log n + 2\log\frac{1}{\epsilon}+O(1)$ and
$m=k+\ell-2\log\frac{1}{\epsilon}-O(1)$ with high probability satisfies
Definition \ref{def:22:extractor}. However,
this does not help us in the application we are interested in as the act of
picking an extractor at random requires a large amount of perfect randomness.
For our application, we want to develop extractors that are computable in
deterministic polynomial time. We will see some constructions in the next
lecture. In the rest of this lecture, we discuss a couple applications of
extractors.
\subsection{Applications}
We give two applications of extractors: to achieve our original goal of
simulating randomized algorithms with weak random sources, and to give another
alternate proof that $\BPP\subseteq\Sigma_2^p$.
In each application, we eliminate the
need for the perfect randomness by using an extractor where $\ell=O(\log n)$
and cycling over all possible seeds. There are other areas where this is
not feasible. For example, in many cryptographic settings, we do not have
this luxury. There do exist extractors, called seedless extractors,
that can be used in these settings. For these, the extractor takes input
from two independent weak random sources and outputs a distribution close
to uniform.
We do not discuss seedless extractors but only mention their existence.
\subsubsection{Simulating Randomized Algorithms}
We always assumed that a source of perfectly random sequences was available in
our discussion of randomized algorithms. We now show how extractors can be used
to run these algorithms using weak random sources. If we had an extractor that
did not need the seed of length $\ell$ from a perfect random source, simulating
a randomized algorithm with the extractor would be trivial; as we have shown,
however, such an extractor does not exist. So instead, we describe a simulation
that removes the need for the perfect random source while giving a simulation
that is correct with high probability. The main idea is to choose a sample from
the weak source and run the extractor with this sample on all possible strings
of the truly random input. We then test the algorithm with each output of
the extractor, and take the majority vote.
Suppose we have a randomized algorithm $M$ that needs $r$ random bits. Because
our simulation should run in polynomial time, we can use only a polynomial
number of bits from the weak random source in the simulation. This means that we
should be able to extract $r$ truly random bits from $poly(r)$ bits from the
weak random source. This is only possible if the min-entropy of the source,
$H_{\infty} \ge n^{\gamma}, \gamma > 0$. This bounds the minimum randomness
needed in the source for this simulation to work. For instance, if the source
had $H_{\infty} = n^{o(1)}$, the number of source bits needed to extract $r$
truly random bits would be super-polynomial in $r$, and the simulation would
not run in polynomial time.
We next describe a simulation for which a source with $H_{\infty} =
n^{\gamma}, \gamma > 0$ suffices.
Given a randomized algorithm M that needs $r$ random bits and an
extractor $E:\{0,1\}^n\times\{0,1\}^\ell\rightarrow\{0,1\}^r$. Given an
input $z$, we simulate $M(z)$ as follows:
\begin{algtab}
Set $count=0$.\\
Let $x$ be a sample from a random source $X$ on $\{0,1\}^n$.\\
\algforeach{$y\in\{0,1\}^\ell$}
Let $\rho_y=E(x,y)$.\\
\algifthen{$M(z,\rho_y)=1$}{$count = count + 1$}
\algend
\algifthenelse{$count\geq 2^{\ell}/2$}{Output 1}{Output 0}
\end{algtab}
Let us consider the probability that this simulation errs. Let $B_z$ be the bad
set for $z$ on algorithm $M$,
\begin{align*}
B_z&=\{\rho | M(z,\rho)\ \text{errs}\}
\\&=\{\rho | M(z,\rho)\neq \textrm{maj}_r(M(z,r))\}
\end{align*}
Then the bad set for
our simulation, given we have chosen the fixed source $x$, is
\begin{align*}
B_z'&= \{x | \textrm{maj}_y(M(z,E(x,y))\ \text{errs}\}
\\&= \{x | \Pr_y[E(x,y)\in B_z]\geq 1/2\}
\end{align*}
\begin{claim}
If $E$ is a $(k,1/6)$ extractor, then $|B_z'|< 2^k$.
\label{clm:18:extractor}
\end{claim}
\begin{proof}
Suppose $|B_z'|\geq 2^k$, and let $X$ be the flat source on $B_z'$. Notice
that $X$ has min-entropy at least $k$. Also, by our assumption on $|B_z'|$,
$\Pr[E(X,U_\ell)\in B_z]\geq 1/2$, while $\Pr[U_m\in B_z]\leq 1/3$ since $M$ decides a
language with bounded error. So we
have a set $B_z$ where the difference in probability assigned between
the extractor and uniform is at least 1/6, contradicting $E$ being a
$(k,1/6)$ extractor.
\end{proof}
Given this claim, we compute the probability our simulation errs
(because of our choice of $x$), assuming that
$E$ is a $(k,1/6)$ extractor:
$$\Pr[\text{Simulation errs}] = \Pr_{x\leftarrow X}[x\in B_z']
\leq |B_z'|\cdot 2^{-H_\infty(X)} < 2^{k-H_\infty(X)}.$$
If we use a source $X$ with $H_\infty(X)$ slightly larger than $k$,
this probability will be at most $1/3$
(for example, $H_\infty(X)\geq k+2$ suffices).
\medskip
Now consider the efficiency of the simulation.
The time to complete the
simulation is the time to compute $E$, plus the product of $2^\ell$ and the
time of the original algorithm. Given a $\poly(n)$ computable extractor,
the time to compute $E$ is $\poly(m)$ because
$n=\poly(m)$. The $2^\ell$ term is $\poly(m)$ if $\ell=O(\log m)$, or
equivalently $\ell=O(\log n)$ since $n=\poly(m)$. Thus, the overhead is
only polynomial in the number of random bits $m$ the algorithm requires.
\subsubsection{Alternate Proof of $\BPP\subseteq\Sigma_2^p$}
For this application, we assume the existence of an $(n/2,1/6)$ extractor
$E:\{0,1\}^n\times\{0,1\}^\ell\rightarrow\{0,1\}^m$
computable in polynomial time and with $\ell=O(\log n)$. The existence of
such an extractor is proven in the next section.
Given a BPP machine $M$ requiring $m$ random bits and input $z$,
we wish to give a $\Sigma_2^p$ formula
equivalent to the acceptance of $M(z)$. We start by considering the simulation
given in the previous section using $E$ on a perfectly random source (one with
$H_\infty(X)=n$). We view a sample $x$ from this source as two
components of equal length: $x= (x_1,x_2)$ where $|x_1|=|x_2|=n/2$.
The number of $x$ on which the simulation fails on a sample from $X$ is
$< 2^{n/2}$ by Claim \ref{clm:18:extractor}. By a counting argument, there
is a choice of $x_1$ so that the simulation when given $(x_1,x_2)$ results in
the correct answer for all $x_2$. Stated formally, for an input $z$,
we have the following
$$z\in L(M) \Rightarrow \exists x_1 \forall x_2
(\Pr_y[M(z;E(x_1,x_2,y))=1] \geq 1/2).$$
Because $|y|=O(\log n)$ and assuming $m=n^{\Omega(1)}$,
the inside predicate is computable in polynomial
time.
If we can show that $z\notin L(M)$ implies the negation of the RHS, we will
be done, for then the above implication is in fact an equivalence,
and so the $\Sigma_2^p$ predicate
does exactly decide the language of $M$.
With this goal in mind, we switch the roles of $x_1$ and $x_2$, and
note that the simulation outputs 0 only when the appropriate probability
is less than 1/2, and get the following:
$$
\begin{array}{rl}
z\notin L(M) \Rightarrow & \exists x_2 \forall x_1
(\Pr_y[M(z;E(x_1,x_2,y))=1] < 1/2) \\
\Rightarrow & \forall x_1 \exists x_2
\neg(\Pr_y[M(z;E(x_1,x_2,y))=1]\geq 1/2).
\end{array}
$$
The first line implies the second because $\exists x \forall y$ always
implies $\forall y \exists x$ and
$(\Pr_y[M(z;E(x_1,x_2,y))=1] < 1/2) = \neg
(\Pr_y[M(z;E(x_1,x_2,y))=1]\geq 1/2)$.
\section*{Acknowledgements}
In writing the notes for this lecture, I perused the notes by Matt Elder
and Beth Skubak from the Spring 2007 offering of
CS~810, and the notes by Brian Rice and Jeff Kinne for lectures 19 and 21
from the Spring 2010 offering of CS~710.
\end{document}