### R Code for Lecture 2 ### # Data # data <- c(23,24,24,25,24,24,24,24,24,24,24,25) # OR if your data is in a file # data <- read.table("guessAge.txt") # attach(data) ### Numerical Anaylsis of Data ### # Summary Statistics # Center/location mean(guessAge) #gives you the mean median(guessAge) #gives you the median # Dispersion sd(guessAge) #gives you the standard deviation var(guessAge) #gives you the variance IQR(guessAge) #gives you the IQR # Rank min(guessAge) #gives you the minimum value max(guessAge) #gives you the maximum value quantile(guessAge,0.25) #gives you the 25% quantile quantile(guessAge,c(0.25,0.75)) # gives you the 25% and the 75% quantile simultaneously # Others summary(guessAge) #gives you a quick summary of the collected sample ### Visual Analysis of Data ### # Histograms par(mfrow = c(2,3)) # Makes a 2 by 2 plot window hist(runif(5000),main="Uniform and Symmetric",xlab="Value") hist(rnorm(5000),main="Unimodal and Symmetric",xlab="Value") hist(c(rnorm(2500),rnorm(2500,5)),main="Bimodal and Symmetric",breaks=50,xlab="Value") hist(rchisq(5000,5),main="Right-Skewed",xlab="Value") hist(rbeta(5000,5,2),main="Left-Skewed",xlab="Value") hist(c(rnorm(95),rep(10,5)),breaks=10,main="Possible Outlier",xlab="Value") par(mfrow = c(1,1)) d.skinny <- density(rnorm(10000)) d.fat <- density(rt(10000,5)) plot(d.skinny,col="blue",main="Skinny and Fat Tailed Distributions",xlab="Value") lines(d.fat,col="red") legend("topright",legend=c("Fat-Tailed","Skinny-Tailed"),col=c("red","blue"),lty=1) # Boxplots data.boxplot <- matrix(c(rnorm(250,3),rnorm(250,3,10),rchisq(250,3),10*rbeta(250,5,2),rnorm(245,3),rep(10,5)),250,5) colnames(data.boxplot) <- c("Symmetric","Fat-tailed (vs Symmetric)","Right/Positive Skewed","Left/Negative-Skewed","Outlier") boxplot(data.boxplot) # Scatterplots par(mfrow = c(3,2)) #Makes 3 by 2 plot window x1 = runif(1000) x2 = 2 + 5*x1 + rnorm(1000) plot(x1,x2,main=paste("Scatterplot, Sample Correlation",cor(x1,x2))) x1 = runif(1000) x2 = 2 - 5*x1 + rnorm(1000) plot(x1,x2,main=paste("Scatterplot, Sample Correlation",cor(x1,x2))) x1 = runif(1000) x2 = runif(1000) plot(x1,x2,main=paste("Scatterplot, Sample Correlation",cor(x1,x2))) x1 = runif(1000,-1,1) x2 = 2 -10* x1^2 + rnorm(1000) plot(x1,x2,main=paste("Scatterplot, Sample Correlation",cor(x1,x2))) x1 = runif(1000,-1,1) x2 = rep(c(-1,1),500)*sqrt(1 - x1^2) plot(x1,x2,main=paste("Scatterplot, Sample Correlation",cor(x1,x2))) x1 = runif(1000) x2 = rep(0,1000) plot(x1,x2,main=paste("Scatterplot, Sample Correlation",cor(x1,x2))) # 3D Tour of scatterplots (thanks to Andreas Buja) # Don't worry too much about the code below. Remember, R treats <- and = as identical operations smooth2d <- function(xxy, bw1=1/3, bw2=bw1, nloc1=21, nloc2=nloc1) { x1 <- xxy[,1]; x2 <- xxy[,2]; y <- xxy[,3] # unpack x1min <- min(x1); x1max <- max(x1) # for grid x2min <- min(x2); x2max <- max(x2) # dito x1s <- seq(x1min, x1max, length=nloc1) # x1 grid x2s <- seq(x2min, x2max, length=nloc2) # x2 grid fxx <- matrix(NA, nrow=nloc1, ncol=nloc2) # fits: nloc1*nloc2 act.bw1 <- sd(x1) * bw1 # x1 bandwidth act.bw2 <- sd(x2) * bw2 # x2 bandwidth for(i in 1:nloc1) { # loop over all combinations.. for(j in 1:nloc2) { # of x1 and x2 grid values w <- dnorm(x=x1-x1s[i],sd=act.bw1) * # weights: product of dnorm(x=x2-x2s[j],sd=act.bw2) # Gaussian kernels fxx[i,j] <- sum(y*w) / sum(w) } } return(list(x=x1s, y=x2s, z=fxx)) # return the two grids and the fits } # note: length(z)=length(x)*length(y) xxy <- matrix(0,100,3) xxy[,1] <- runif(100,-3,3) xxy[,2] <- runif(100,-3,3) xxy[,3] <- xxy[,1]^2 + xxy[,2]^2 + rnorm(100,0,0.5) sm2d <- smooth2d(xxy) #Perspective plots, first with hidden lines shown: not very good... persp(sm2d) #With hidden lines removed and the surface colored and shaded: persp(sm2d, ltheta=0, lphi=45, shade=0.9, col="orange") #Animate to search for informative views: for(iplt in 0:10000) { persp(sm2d, theta=0.1*iplt, phi=15+iplt*0.02, r=sqrt(3), shade=0.5, col="orange", main=iplt, xlab="X1", ylab="X2", zlab="X3",axes=TRUE,ticktype="detailed") -> res points(trans3d(xxy[,1], xxy[,2], xxy[,3], pmat = res), col=2, pch=16, cex=.5) } # Quantile-Quantile Plots par(mfrow =c(2,2)) x <- rchisq(100,2) hist(x,main="Right-Skewed Data",breaks=10,xlab="Values") qqnorm(scale(x),main="Normal QQ Plot") # Normal QQ plot. You must use scale(x) to obtain the z-scores abline(a=0,b=1) #Draws a line with intercept zero and slope 1 hist(sqrt(x),main="Square Root Transformed Data",breaks=10,xlab="Sqrt(Values)") qqnorm(scale(sqrt(x)),main="Square Root Transformed QQ Plot") abline(a=0,b=1)