### R Code for Lecture 3 ### # Sampling Distribution if Population is Normally Distributed # # Parameters of the population distribution mu = 10 #Population mean sigma = 2 #Population standard deviation # Parameters of the sampling scheme n = 30 #sample size nExperiments = 1000 #number of times we are sampling 30 units from the population sampleMeans = rep(0,nExperiments) #this vector is created to store the sample mean for each experiment for (i in 1:nExperiments) { # For each experiment... X = rnorm(n,mu,sigma) #Draw n i.i.d. samples from the population sampleMeans[i] = mean(X) } h <- hist(sampleMeans,breaks=50,main=paste("Sample mean for n=",n,"after",nExperiments,"experiments")) # Generate the sampling distribution expected from theory x = seq(7,12,length = nExperiments) y = dnorm(x,mu,sigma/sqrt(n)) y = y * max(h$counts) / max(y) lines(x,y,col="blue") abline(v=mu,col="red") legend("topleft",legend=c("(Theory-Based) Sampling Distribution","Mean (Mu)"),lty=1,col=c("blue","red"),bty="n") # Effect on the sampling distribution as n changes # Parameters of the sampling scheme possibleN = c(30,50,100,250) # possible sample size nExperiments = 5000 #number of times we are sampling 30 units from the population plot(c(8,12),c(0,3.2),type="n",main=paste("Effect on the sampling distribution as n changes after",nExperiments,"experiments"),ylab="Density",xlab="Sample mean values") color = c("red","blue","purple","black") for (i in 1:length(possibleN)) { n = possibleN[i] # First choose a sample size n sampleMeans = rep(0,nExperiments) # Reset this value to zero every time n changes for (j in 1:nExperiments) { # For each experiment... X = rnorm(n,mu,sigma) #Draw n i.i.d. samples from the population sampleMeans[j] = mean(X) } d <- density(sampleMeans) lines(d,col=color[i],lwd=2) } legend("topright",paste("N =",possibleN),lty=1,lwd=2,col=color) # Probability that sample mean is within epsilon of the population mean sigma = 1 epsilon = 1 maxN = 1000 nSeq = seq(1,maxN,1) prob = pnorm(epsilon *sqrt(nSeq)/sigma) - pnorm(epsilon*sqrt(nSeq)/sigma * -1) plot(c(1,maxN),c(0,1),type="n",main=paste("Prob. that sample mean is within",epsilon,"of the pop. mean"),xlab="Sample size",ylab="Probability") lines(nSeq,prob,col="red",lwd=2) sigma = 2 prob = pnorm(epsilon *sqrt(nSeq)/sigma) - pnorm(epsilon*sqrt(nSeq)/sigma * -1) lines(nSeq,prob,col="blue",lwd=2) sigma = 10 prob = pnorm(epsilon *sqrt(nSeq)/sigma) - pnorm(epsilon*sqrt(nSeq)/sigma * -1) lines(nSeq,prob,col="purple",lwd=2) legend("bottomright",legend=c("Sigma=1","Sigma=2","Sigma=5"),lwd=2,col=c("red","blue","purple")) # Chi Square Distribution of Sample Variance mu = 10 #population mean sigma = 4 #population sigma nSample = 40 # Number of samples nExperiments = 1000 # Number of resampling/experiment (i.e. number of simulation) sampleVar = rep(0,nExperiments) for (i in 1:nExperiments) { # For each resampling, we draw nSample from the population data = rnorm(nSample,mu,sigma) sampleVar[i] = (nSample - 1)*var(data)/sigma^2 } h = hist(sampleVar,main="Sampling Distribution of (n-1)S^2/sigma^2",xlab="Value",breaks=50) x = seq(0,100,length = nExperiments) y = dchisq(x,nSample-1) y = y * max(h$counts) / max(y) lines(x,y,col="blue") legend("topright",legend=c("(Theory-Based) Sampling Distribution"),lty=1,col=c("blue"),bty="n") # Central Limit Theorem and Convergence to Standard Normal lambda = 4 nSample = 40 #number of samples nExperiments = 1000 # Number of resampling/experiment (i.e. number of simulation) sampleMean = rep(0,nExperiments) for (i in 1:nExperiments) { # For each resampling, we draw nSample from the population data = rexp(nSample,lambda) sampleMean[i] = mean(data) } h = hist(sampleMean,main=paste("Sampling Distribution of the Sample Mean from Exponential Distribution (Lambda =",lambda,")"),xlab="Value",breaks=50) x = seq(0,0.5,length = nExperiments) y = dnorm(x,1/lambda,sqrt(1/(lambda^2 *nSample))) y = y * max(h$counts) / max(y) lines(x,y,col="blue") legend("topright",legend=c("(Theory-Based) Sampling Distribution"),lty=1,col=c("blue"),bty="n") # Effect on the sampling distribution as n changes # Parameters of the sampling scheme lambda = 4 possibleN = c(5,15,30,50,100) # possible sample size nExperiments = 5000 #number of times we are sampling 30 units from the population plot(c(0,1),c(0,20),type="n",main=paste("Effect on the sampling distribution as n changes after",nExperiments,"experiments from Exp(",lambda,")"),ylab="Density",xlab="Sample mean values") color = c("red","blue","purple","orange","black") for (i in 1:length(possibleN)) { n = possibleN[i] # First choose a sample size n sampleMeans = rep(0,nExperiments) # Reset this value to zero every time n changes for (j in 1:nExperiments) { # For each experiment... X = rexp(n,lambda) #Draw n i.i.d. samples from the population sampleMeans[j] = mean(X) } d <- density(sampleMeans) lines(d,col=color[i],lwd=2) } legend("topright",paste("N =",possibleN),lty=1,lwd=2,col=color) # An Experiment data = rnorm(100) par(mfrow = c(1,1)) resampleSize = round(length(data)*0.8) nExperiments = 5000 results = rep(0,nExperiments) for (i in 1:nExperiments) { data.resampled = sample(data,size=resampleSize,replace=TRUE) results[i] = mean(data.resampled) if (i > 30) { d <- density(results[1:i]) plot(d,col="black",lwd=2,main="Sampling Distribution of the Mean") } }