##### Hyunseung Kang ####
##### Stat 431, Summer 2012 #####
##### Last Update: July 19, 2012 #####

##### General Advice about R #####
# If you have questions about R, Google is the best place to search for the
# answers. Google will search and return the R mailing list as your top
# results. The R mailing list is where R users post questions about R and
# R developers or the R community members answer them.
# Also, http://www.statmethods.net/ gives you an excellent overview of how to
# make plots in R.
# Also, www.stanford.edu/class/stat191 has R examples that are relevant to
# this course. Each topic on the Stanford website gives you R examples along
# with their output for all the stuff we're doing in class.

##### Reading Data #####
# Replace "filename" with the path to your own file.
data = read.csv("filename")   # For CSV or `,' delimited data
# For importing tab delimited and other forms of data. Pass header = TRUE if
# the first line of the file holds the column names.
data = read.table("filename")

# Attaches the column variables into R so you don't have to do
# data$something. Instead, you can call something directly in R.
# Caution: attached columns are copies. If you change `data` afterwards, or
# create a variable with the same name as a column, the bare name no longer
# refers to the current column; data$something is always unambiguous.
attach(data)

##### Creating Your Own Vectors in R #####
1:55          # Creates a vector 1,2,3,4,5,...,55
seq(1, 55, 2) # Creates a vector 1,3,5,7,...,55, skipping every 2
              # (the last parameter in seq)
rep(1, 100)   # Creates a vector 1,1,1,...,1 repeated 100 times

##### Taking a subset of the data
# Suppose your data looks like this
#   x1   x2      x3
#   1    21.23   "A"
#   1    89.23   "B"
#   0    NA      "B"
#   .    .       .

# Returns a subset of the data where x3 equals "A"
data[x3 == "A", ]
# Returns a subset of the data where x3 equals "A" and x2 < 40
data[x3 == "A" & x2 < 40, ]
# Returns a subset of the data where there are no (denoted by !) missing
# measurements for x2
data[!is.na(x2), ]
# Returns all the choices in the categorical variable x3. x3 must be a
# factor: for a plain character column levels() returns NULL, so use
# levels(factor(x3)) or unique(x3) in that case.
levels(x3)

##### Basic Plots #####
# Creates a 2 by 3 plotting window. So you can put 6 plots in a single window!
par(mfrow = c(2, 3))
# Creates a scatterplot between x and y
plot(x, y, main = "Title", xlab = "X Label", ylab = "Y Label")
hist(x)    # Creates a histogram
boxplot(x) # Creates a boxplot

##### Summary statistics #####
nrow(data) # Number of rows in the data
ncol(data) # Number of columns in the data

x = c(13, 12, 14, 10, 9, 10, 12, NA)
is.na(x)                         # A vector that gives you whether each index is missing or not
mean(x, na.rm = TRUE)            # Sample mean with missing values removed
sd(x, na.rm = TRUE)              # Sample standard deviation with missing values removed
var(x, na.rm = TRUE)             # Sample variance with missing values removed
median(x, na.rm = TRUE)          # Sample median with missing values removed
IQR(x, na.rm = TRUE)             # Interquartile range with missing values removed
quantile(x, 0.75, na.rm = TRUE)  # Sample 75% quantile with missing values removed
table(x)                         # Creates a frequency table

##### Regression #####
model = lm(y ~ x)             # Simple linear regression
model = lm(y ~ x1 + x2 + x3)  # Multiple linear regression (overwrites `model`)

summary(model)        # Gives you a summary of the regression model
predict(model)        # Gives you the predicted values for the Xs
residuals(model)      # Gives you the residuals for the Xs
hatvalues(model)      # Gives you the hat values
cooks.distance(model) # Gives you the Cook's distance

plot(x, y, main = "Scatterplot of X and Y", xlab = "X", ylab = "Y")
# Draws the fitted line in the scatter plot. abline() only uses an intercept
# and one slope, so it is given the simple regression of y on x rather than
# `model`, which at this point holds the three-predictor fit.
abline(lm(y ~ x))

# Gives prediction and confidence intervals for a new value of X
predict(model, list(x1 = 100, x2 = "male", x3 = 3),
        interval = "confidence", level = 0.95)
predict(model, list(x1 = 100, x2 = "male", x3 = 3),
        interval = "prediction", level = 0.95)

##### For Loops in R #####
x = 1
# For loops in R are very similar to those in Python. For each i, from 1 to
# 100, the loop will perform x = x + i. In each iteration of the loop, i will
# take on a value between 1 and 100.
for (i in 1:100) {
  x = x + i
}

# Computes the mean for each column (denoted as 2)
apply(dataMatrix, 2, mean)
# Computes the mean for each row (denoted as 1)
apply(dataMatrix, 1, mean)