% To produce pdf under linux, run
% pdflatex hw2.tex 

\documentclass{article}
\usepackage{amsmath,amssymb}
\usepackage{bbm}
\usepackage{graphicx}
\usepackage{url}

\def\bfx{\mathbf x}
\def\R{\mathbb R}
\def\E{\mathbb E}
\def\argmax{\mathrm{argmax}}

\title{CS761 Spring 2015 Homework 2}
\author{Assigned Mar. 13, due Mar. 27 before class}
\date{}
\begin{document}
\maketitle

Instructions: 
\begin{itemize}
\item Homeworks are to be done individually.
\item Typeset your homework in \LaTeX{} using this file as a template (e.g., use pdflatex).  Show your derivations.
\item Hand in the compiled PDF (not the \LaTeX{} file) online.  Instructions will be provided.  We do not accept hand-written homeworks.  
\item Homework will no longer be accepted once the lecture starts.
\item Fill in your name and email below.  
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Insert your name and email here:

Name:                      

Email: 

\newpage % Please keep this page-break
% Do not include any identifying information below this line.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\begin{enumerate}

\item
Let $X_0, X_1, \ldots, X_{M-1}$ denote a random sample of $N$-dimensional random vectors $X_n$, each of which has mean value $m$ and covariance matrix $R$.  Show that the sample mean
$$\hat m_t = \frac{1}{t+1} \sum_{n=0}^t  X_n$$
and the sample covariance 
$$S_t(\hat m_t) = \frac{1}{t+1} \sum_{n=0}^t (X_n - \hat m_t)(X_n - \hat m_t)^\top$$
may be written recursively as
$$\hat m_t = \frac{t}{t+1} \hat m_{t-1} + \frac{1}{t+1} X_t, \quad \hat m_0 = X_0,$$
and
$$S_t(\hat m_t) = Q_t - \hat m_t \hat m_t^\top,$$
where
$$ Q_t  = \frac{t}{t+1} Q_{t-1} + \frac{1}{t+1} X_t X_t^\top, \quad Q_0 = X_0 X_0^\top.$$

\item
Suppose we roll a fair 6-sided die 100 times. Let $X$ be the sum of the outcomes.  Bound $P(|X-350| \ge 100)$ using Chebyshev and Hoeffding, respectively.

\item
Let $\mathcal X$ be the vector space of \emph{finitely} nonzero sequences $X=(x_1, x_2, \ldots, x_n, 0, 0, \ldots)$.
Define the norm on $\mathcal X$ as $\|X\|=\max_i |x_i|$.
Let $X_n$ be a point in $\mathcal X$ (a sequence) defined by
$$X_n = \left(1, \frac{1}{2}, \frac{1}{3}, \ldots, \frac{1}{n}, 0, 0, \ldots \right).$$
\begin{itemize}
\item
Show that the sequence $X_n$ is a Cauchy sequence.
\item
Show that $\mathcal X$ is not complete.
\end{itemize}

\item
Determine the range and nullspace of the following linear operators (matrices):
$$A=
\begin{bmatrix}
1 & 0 \\
5 & 4 \\
2 & 4
\end{bmatrix}
\quad
B=
\begin{bmatrix}
1 & 0 & 1 \\
5 & 4 & 9 \\
2 & 4 & 6
\end{bmatrix}
$$

\item
Let 
$$A=\begin{bmatrix}
1 & 4 & 5 & 6 \\
6 & 7 & 2 & 1
\end{bmatrix}
\quad
b=\begin{bmatrix}
48\\
30
\end{bmatrix}.$$
One solution to $Ax=b$ is $x=[1,2,3,4]^\top$.  Compute the least-squares solution using the SVD (explain how), and compare it with the solution above.  Why does the SVD pick out this particular solution among all solutions of $Ax=b$?

\item
Consider the following process.  A probability vector $p=(p_1, \ldots, p_d)$ is drawn from a Dirichlet distribution with parameter vector $\alpha$.
Then, a vector of category counts $x=(x_1, \ldots, x_d)$ is drawn from a multinomial distribution with probability vector $p$ and number of trials $N$. Give an analytic form of $P(x \mid \alpha)$.  

\item
Let $X_1, X_2, \ldots, X_m$ be a random sample, where each $X_i \sim U(0,\theta)$, the uniform distribution on $[0,\theta]$.
\begin{itemize}
\item Show that $\hat\theta_{\mathrm{ML}} = \max_i X_i$.
\item Show that the density of $\hat\theta_{\mathrm{ML}}$ is $f_\theta(x) = \frac{m}{\theta^m} x^{m-1}$ for $0 \le x \le \theta$ (and zero otherwise).
\item Find the expected value of $\hat\theta_{ML}$.
\item Find the variance of $\hat\theta_{ML}$.
\end{itemize}

\item
Let $X_1, \ldots, X_n$ be a sample from $N(\mu, \sigma^2)$.
\begin{itemize}
\item Show that the MLE of $\sigma^2$ is
$${\hat\sigma}^2 = n^{-1} \sum_{i=1}^n (X_i - \bar X)^2.$$
\item Show that ${\hat\sigma}^2$ has a smaller mean squared error than 
$$(n-1)^{-1} \sum_{i=1}^n (X_i - \bar X)^2.$$
\end{itemize}

\item
Consider the following directed graphical model, in which none of the variables is observed.  
$$
\begin{array}{cc}
a \searrow &  \\
& c \rightarrow d \\
b \nearrow &
\end{array}
$$
Show that $a \perp b \mid \emptyset$ by using a probability argument.
Suppose we now observe the variable $d$.  Show that in general $a \not\perp b \mid d$ (you can use a counterexample).

\item
Consider two discrete random variables $x,y \in \{A,B,C\}$.
Construct a joint distribution $p(x,y)$ with the following properties:
\begin{itemize}
\item $\hat x$ is the maximizer of the marginal $p(x)$
\item $\hat y$ is the maximizer of the marginal $p(y)$
\item $p(\hat x, \hat y)=0$.
\end{itemize}

\item
Logistic regression for $y\in \{-1,1\}$ is defined by
$$p(y \mid x; w,b) = \frac{1}{1+e^{-y (x^\top w + b)}}.$$
Show that logistic regression is in the exponential family, that is, the probability distribution can be written in the form
$$p(y \mid x; \tilde w) = \frac{1}{Z(x,\tilde w)} e^{\phi(y,x)^\top \tilde w}.$$
Note the mapping $\phi$ depends only on $y, x$, but not on $w$ or $b$.
\end{enumerate}
\end{document}