##### Hyunseung Kang ####
##### Stat 431, Summer 2012 #####
##### Last Update: July 19, 2012 #####

##### General Advice about R #####
# If you have questions about R, Google is the best place to search for answers.
# Google will usually return the R mailing list among its top results. The R mailing list is
# where R users post questions about R and R developers or community members answer
# them.

# Also, http://www.statmethods.net/ gives you an excellent overview of how to make plots in R.
# Also, www.stanford.edu/class/stat191 has R examples that are relevant to this course.
# Each topic on the Stanford website gives you R examples, along with their output, for all
# the stuff we're doing in class.

##### Reading Data #####
# Import a CSV (comma-delimited) file
data <- read.csv(file = "filename")
# Import tab-delimited and other forms of tabular text (this overwrites `data`)
data <- read.table(file = "filename")

# Attaches the columns of `data` so that a column can be referenced as
# `something` directly instead of `data$something`
attach(data)

##### Creating Your Own Vectors in R #####
# The vector 1, 2, 3, 4, 5, ..., 55
1:55
# The vector 1, 3, 5, 7, ..., 55: from 1 up to 55 in steps of 2
# (the step is the last argument of seq, named `by`)
seq(from = 1, to = 55, by = 2)
# The vector 1, 1, 1, ..., 1: the value 1 repeated 100 times
rep(1, times = 100)

##### Taking a subset of the data
# Suppose your data looks like this
# x1 x2      x3
# 1  21.23  "A"
# 1  89.23  "B"
# 0  NA     "B"
# .   .      .
# Columns are referenced through `data$` so that these lines work whether or
# not `data` has been attached.
# NOTE: a row whose condition evaluates to NA comes back as a row of NAs;
# wrap the condition in which() if such rows should be dropped instead.
data[data$x3 == "A", ] # Returns the rows of the data where x3 equals "A"
data[data$x3 == "A" & data$x2 < 40, ] # Returns the rows where x3 equals "A" and x2 < 40
data[!is.na(data$x2), ] # Returns the rows where x2 is not (denoted by !) missing
# Returns all the choices in the categorical variable x3. as.factor() is applied
# first because levels() returns NULL for a plain character column, which is
# what read.csv() produces by default since R 4.0.
levels(as.factor(data$x3))

##### Basic Plots #####
# Split the plotting window into a 2 by 3 grid, so 6 plots fit in one window
par(mfrow = c(2, 3))
# Scatterplot of y against x, with a title and axis labels
plot(x, y, main = "Title", xlab = "X Label", ylab = "Y Label")
# Histogram of x
hist(x)
# Boxplot of x
boxplot(x)

##### Summary statistics #####
nrow(data) # Number of rows in the data
ncol(data) # Number of columns in the data

# A small example vector whose last entry is missing
x <- c(13, 12, 14, 10, 9, 10, 12, NA)
is.na(x) # Logical vector: TRUE wherever the entry is missing

# Each statistic below removes the missing values first (na.rm = TRUE)
mean(x, na.rm = TRUE) # Sample mean
sd(x, na.rm = TRUE) # Sample standard deviation
var(x, na.rm = TRUE) # Sample variance
median(x, na.rm = TRUE) # Sample median
IQR(x, na.rm = TRUE) # Interquartile range
quantile(x, probs = 0.75, na.rm = TRUE) # Sample 75% quantile
table(x) # Frequency table (missing values are not counted by default)

##### Regression #####
# Simple linear regression. It gets its own name so that the multiple
# regression fitted on the next line does not overwrite it before the fitted
# line is drawn further down.
simple_model <- lm(y ~ x)
model <- lm(y ~ x1 + x2 + x3) # Multiple linear regression
summary(model) # Gives you a summary of the regression model
predict(model) # Gives you the predicted (fitted) values for the observed Xs
residuals(model) # Gives you the residuals for the observed Xs
hatvalues(model) # Gives you the hat values
cooks.distance(model) # Gives you the Cook's distance

plot(x, y, main = "Scatterplot of X and Y", xlab = "X", ylab = "Y")
# Draws the fitted line in the scatter plot. abline() uses only an intercept
# and a single slope, so it is given the simple regression of y on x rather
# than the multiple regression stored in `model`.
abline(simple_model)

# Gives confidence and prediction intervals for a new value of X
new_x <- data.frame(x1 = 100, x2 = "male", x3 = 3)
predict(model, newdata = new_x, interval = "confidence", level = 0.95)
predict(model, newdata = new_x, interval = "prediction", level = 0.95)


##### For Loops in R #####
# For loops in R are very similar to those in Python. The loop variable i takes
# each value of the sequence in turn (here 1, 2, ..., 100), and the body
# x <- x + i runs once per value. x starts at 1 before the loop.
x <- 1
for (i in seq_len(100)) {
  x <- x + i
}

# Computes the mean for each column (MARGIN = 2 denotes columns)
apply(X = dataMatrix, MARGIN = 2, FUN = mean)

# Computes the mean for each row (MARGIN = 1 denotes rows)
apply(X = dataMatrix, MARGIN = 1, FUN = mean)