### R Code for Lecture 3 ###

# Sampling Distribution if Population is Normally Distributed #

# Parameters of the population distribution
mu = 10 #Population mean
sigma = 2 #Population standard deviation

# Parameters of the sampling scheme
n = 30 #sample size
nExperiments = 1000 #number of times we are sampling 30 units from the population
sampleMeans = rep(0,nExperiments) #this vector is created to store the sample mean for each experiment

for (i in 1:nExperiments) {
  # For each experiment...
  X = rnorm(n,mu,sigma) #Draw n i.i.d. samples from the population
  sampleMeans[i] = mean(X)
}
h <- hist(sampleMeans,breaks=50,main=paste("Sample mean for n=",n,"after",nExperiments,"experiments"))

# Generate the sampling distribution expected from theory
x = seq(7,12,length = nExperiments)
y = dnorm(x,mu,sigma/sqrt(n))
y = y * max(h$counts) / max(y)
lines(x,y,col="blue")
abline(v=mu,col="red")
legend("topleft",legend=c("(Theory-Based) Sampling Distribution","Mean (Mu)"),lty=1,col=c("blue","red"),bty="n")

# Effect on the sampling distribution as n changes
# Parameters of the sampling scheme
possibleN = c(30,50,100,250) # possible sample size
nExperiments = 5000 #number of times we are sampling 30 units from the population

plot(c(8,12),c(0,3.2),type="n",main=paste("Effect on the sampling distribution as n changes after",nExperiments,"experiments"),ylab="Density",xlab="Sample mean values")
color = c("red","blue","purple","black")
for (i in 1:length(possibleN)) {
  n = possibleN[i] # First choose a sample size n
  sampleMeans = rep(0,nExperiments) # Reset this value to zero every time n changes
  for (j in 1:nExperiments) {
    # For each experiment...
    X = rnorm(n,mu,sigma) #Draw n i.i.d. samples from the population
    sampleMeans[j] = mean(X)
  }
  d <- density(sampleMeans)
  lines(d,col=color[i],lwd=2)
}
legend("topright",paste("N =",possibleN),lty=1,lwd=2,col=color)

# Probability that sample mean is within epsilon of the population mean
sigma = 1
epsilon = 1
maxN = 1000

nSeq = seq(1,maxN,1)
prob = pnorm(epsilon *sqrt(nSeq)/sigma) - pnorm(epsilon*sqrt(nSeq)/sigma * -1)
plot(c(1,maxN),c(0,1),type="n",main=paste("Prob. that sample mean is within",epsilon,"of the pop. mean"),xlab="Sample size",ylab="Probability")
lines(nSeq,prob,col="red",lwd=2)

sigma = 2
prob = pnorm(epsilon *sqrt(nSeq)/sigma) - pnorm(epsilon*sqrt(nSeq)/sigma * -1)
lines(nSeq,prob,col="blue",lwd=2)

sigma = 10
prob = pnorm(epsilon *sqrt(nSeq)/sigma) - pnorm(epsilon*sqrt(nSeq)/sigma * -1)
lines(nSeq,prob,col="purple",lwd=2)

legend("bottomright",legend=c("Sigma=1","Sigma=2","Sigma=5"),lwd=2,col=c("red","blue","purple"))

# Chi Square Distribution of Sample Variance
mu = 10 #population mean
sigma = 4 #population sigma
nSample = 40 # Number of samples

nExperiments = 1000 # Number of resampling/experiment (i.e. number of simulation)
sampleVar = rep(0,nExperiments)
for (i in 1:nExperiments) {
  # For each resampling, we draw nSample from the population
  data = rnorm(nSample,mu,sigma)
  sampleVar[i] = (nSample - 1)*var(data)/sigma^2
}
h = hist(sampleVar,main="Sampling Distribution of (n-1)S^2/sigma^2",xlab="Value",breaks=50)
x = seq(0,100,length = nExperiments)
y = dchisq(x,nSample-1)
y = y * max(h$counts) / max(y) 
lines(x,y,col="blue")
legend("topright",legend=c("(Theory-Based) Sampling Distribution"),lty=1,col=c("blue"),bty="n")


# Central Limit Theorem and Convergence to Standard Normal
lambda = 4
nSample = 40 #number of samples

nExperiments = 1000 # Number of resampling/experiment (i.e. number of simulation)
sampleMean = rep(0,nExperiments)
for (i in 1:nExperiments) {
  # For each resampling, we draw nSample from the population
  data = rexp(nSample,lambda)
  sampleMean[i] = mean(data)
}
h = hist(sampleMean,main=paste("Sampling Distribution of the Sample Mean from Exponential Distribution (Lambda =",lambda,")"),xlab="Value",breaks=50)
x = seq(0,0.5,length = nExperiments)
y = dnorm(x,1/lambda,sqrt(1/(lambda^2 *nSample)))
y = y * max(h$counts) / max(y) 
lines(x,y,col="blue")
legend("topright",legend=c("(Theory-Based) Sampling Distribution"),lty=1,col=c("blue"),bty="n")

# Effect on the sampling distribution as n changes
# Parameters of the sampling scheme
lambda = 4
possibleN = c(5,15,30,50,100) # possible sample size
nExperiments = 5000 #number of times we are sampling 30 units from the population

plot(c(0,1),c(0,20),type="n",main=paste("Effect on the sampling distribution as n changes after",nExperiments,"experiments from Exp(",lambda,")"),ylab="Density",xlab="Sample mean values")
color = c("red","blue","purple","orange","black")
for (i in 1:length(possibleN)) {
  n = possibleN[i] # First choose a sample size n
  sampleMeans = rep(0,nExperiments) # Reset this value to zero every time n changes
  for (j in 1:nExperiments) {
    # For each experiment...
    X = rexp(n,lambda) #Draw n i.i.d. samples from the population
    sampleMeans[j] = mean(X)
  }
  d <- density(sampleMeans)
  lines(d,col=color[i],lwd=2)
}
legend("topright",paste("N =",possibleN),lty=1,lwd=2,col=color)

# An Experiment
data = rnorm(100)

par(mfrow = c(1,1))
resampleSize = round(length(data)*0.8)
nExperiments = 5000
results = rep(0,nExperiments)
for (i in 1:nExperiments) {
   data.resampled = sample(data,size=resampleSize,replace=TRUE)
   results[i] = mean(data.resampled)
   if (i > 30) {
     d <- density(results[1:i])
     plot(d,col="black",lwd=2,main="Sampling Distribution of the Mean")
   }
}