I encourage you to review the cor(), lm(), and coef() functions
in the R Guide for Chapter 2.
The 8.1 notes start by reviewing Chapter 2 and modeling auditory response time (\(y\)) as a linear function of visual response time (\(x\)). Here are those calculations (along with a few Chapter 1 summary statistics we'll need later).
x = c(161, 203, 235, 176, 201, 188, 228, 211, 191, 178)
y = c(159, 206, 241, 163, 197, 193, 209, 189, 169, 201)
(model = lm(y ~ x))
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 22.360 0.864
mean(x)
## [1] 197.2
sd(x)
## [1] 23.26
sd(y)
## [1] 24.62
cor(x, y)
## [1] 0.8159
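As an aside (not in the notes), these summary statistics already determine the fitted line, since \(\hat{\beta}_1 = r \, s_y / s_x\) and \(\hat{\beta}_0 = \bar{y} - \hat{\beta}_1 \bar{x}\). A quick check against the lm() output:
cor(x, y) * sd(y)/sd(x) # slope: about 0.864, matching coef(model)
mean(y) - cor(x, y) * (sd(y)/sd(x)) * mean(x) # intercept: about 22.36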
We can see more detail on the model via summary():
summary(model)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.344 -11.117 -0.707 8.279 24.885
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.360 42.948 0.52 0.617
## x 0.864 0.216 3.99 0.004 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.1 on 8 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.624
## F-statistic: 15.9 on 1 and 8 DF, p-value: 0.004
Notice under “Coefficients” that \(\hat{\beta}_0\) and \(\hat{\beta}_1\)
are shown, as are \(s_{\hat{\beta}_0}\) and \(s_{\hat{\beta}_1}\). Also
given are the test statistic \(t\) and P-value for each of the tests
“\(H_0: \beta_0 = 0\)” and “\(H_0: \beta_1 = 0\).” (We did the
\(\beta_1\) test in the 8.1 notes.)
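e.g. each \(t\) value is just the estimate divided by its standard error, and each P-value is a two-tailed \(t_{n-2}\) probability. A quick check for the slope, using the rounded values from the table:
0.864/0.216 # t value for the slope: about 4, which summary() showed (unrounded) as 3.99
2 * pt(-0.864/0.216, df = 8) # two-sided P-value: about 0.004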
Notice after “Residual standard error” that \(s\) is given as
15.1. We can access this number by saving the output of
summary(), which is a list whose components are accessed with the
$ operator and whose structure can be seen via str(). (You don't
need to study the output of str() below.)
out = summary(model)
str(out)
## List of 11
## $ call : language lm(formula = y ~ x)
## $ terms :Classes 'terms', 'formula' length 3 y ~ x
## .. ..- attr(*, "variables")= language list(y, x)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "y" "x"
## .. .. .. ..$ : chr "x"
## .. ..- attr(*, "term.labels")= chr "x"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: 0x417bfc8>
## .. ..- attr(*, "predvars")= language list(y, x)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "y" "x"
## $ residuals : Named num [1:10] -2.43 8.29 15.65 -11.39 1.02 ...
## ..- attr(*, "names")= chr [1:10] "1" "2" "3" "4" ...
## $ coefficients : num [1:2, 1:4] 22.36 0.864 42.948 0.216 0.521 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "x"
## .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
## $ aliased : Named logi [1:2] FALSE FALSE
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "x"
## $ sigma : num 15.1
## $ df : int [1:3] 2 8 2
## $ r.squared : num 0.666
## $ adj.r.squared: num 0.624
## $ fstatistic : Named num [1:3] 15.9 1 8
## ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
## $ cov.unscaled : num [1:2, 1:2] 8.08912 -0.040513 -0.040513 0.000205
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "x"
## .. ..$ : chr [1:2] "(Intercept)" "x"
## - attr(*, "class")= chr "summary.lm"
It looks like our 15.1 is the component called $sigma (which might
better have been named $s, since it's an estimate of \(\sigma\), not
\(\sigma\) itself).
(s = out$sigma)
## [1] 15.1
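Similarly, the “Coefficients” table is the component called $coefficients (a matrix, as the str() output above shows), so we can pull out a single entry such as \(s_{\hat{\beta}_1}\):
out$coefficients["x", "Std. Error"] # about 0.216, as in the summary table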
Use confint() to get confidence intervals on the coefficients:
confint(model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -76.6779 121.398
## x 0.3647 1.363
Notice that the interval for \(\beta_1\) matches the one we found by hand (under “e.g. (#6c)” in the notes).
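As a check (an aside, not in the notes), confint() is computing \(\hat{\beta}_1 \pm t_{\alpha/2, n-2} \, s_{\hat{\beta}_1}\), which we can reproduce by hand:
coef(model)["x"] + c(-1, 1) * qt(0.975, df = 8) * out$coefficients["x", "Std. Error"]
# about (0.365, 1.363), matching confint()'s row for x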
We can handle this without any new R. e.g. Here's the confidence interval from #6d in the notes, for the mean response at \(x = 200\):
(n = length(x))
## [1] 10
(s.y.hat = s * sqrt(1/n + (200 - mean(x))^2/((n - 1) * var(x)))) # estimated std. dev. of y-hat at x = 200
## [1] 4.813
(beta.0 = coef(model)[1])
## (Intercept)
## 22.36
(beta.1 = coef(model)[2])
## x
## 0.8638
(y.hat = beta.0 + beta.1 * 200)
## (Intercept)
## 195.1
alpha = 1 - 0.95
(t = -qt(alpha/2, n - 2)) # critical value: upper alpha/2 point of the t distribution with n-2 df
## [1] 2.306
(margin = t * s.y.hat)
## [1] 11.1
(interval = c(y.hat - margin, y.hat + margin))
## (Intercept) (Intercept)
## 184.0 206.2
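(An aside: R's predict() computes this same interval, so it makes a handy check; its interval = "confidence" argument asks for a confidence interval on the mean response at the new \(x\).)
predict(model, newdata = data.frame(x = 200), interval = "confidence", level = 0.95)
# fit about 195.1, lwr about 184.0, upr about 206.2, matching the hand computation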
The fitted values \(\hat{y}_i\) and residuals \(e_i\) are components of
the (list) output of lm() (use str() to see other components, if
you wish). Here's the residual plot of the points \(\{(\hat{y}_i, e_i)\}\) for the visual/auditory reaction times in the 8.2 notes.
y.hat = model$fitted.values
e = model$residuals
plot(y.hat, e, main = "Residual plot", xlab = "Fitted value", ylab = "Residual")
abline(a = 0, b = 0) # add horizontal line with intercept 0 and slope 0
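(An aside: R can draw a similar residuals-vs-fitted plot itself via plot(model, which = 1), though the version above matches the notes' axes and labels.)
plot(model, which = 1) # built-in "Residuals vs Fitted" diagnostic plot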
We don't need any new R to handle transformations. Here's the Hudson Bay rivers example, #6c in the 8.2 notes.
x = c(94.24, 66.57, 59.79, 48.52, 40, 32.3, 31.2, 30.69, 26.65, 22.75, 21.2,
20.57, 19.77, 18.62, 17.96, 17.84, 16.06, 14.69, 11.63, 11.19, 11.08, 10.92,
9.94, 7.86, 6.92, 6.17, 4.88, 4.49, 3.98, 3.74, 3.25, 3.15, 2.76, 2.64,
2.59, 2.25, 2.23, 0.99, 0.84, 0.64, 0.52, 0.3) # annual discharge
y = c(4110.3, 4961.7, 10275.5, 6616.9, 7459.5, 2784.4, 3266.7, 4368.7, 1328.5,
4437.6, 1983, 1320.1, 1735.7, 1944.1, 3420.2, 2655.3, 3470.3, 1561.6, 869.8,
936.8, 1315.7, 1727.1, 768.1, 483.3, 334.5, 1049.9, 485.1, 289.6, 551.8,
288.9, 295.2, 500.1, 611, 1311.5, 413.8, 263.2, 490.7, 204.2, 491.7, 74.2,
240.6, 56.6) # peak flow
(model = lm(log(y) ~ log(x)))
##
## Call:
## lm(formula = log(y) ~ log(x))
##
## Coefficients:
## (Intercept) log(x)
## 5.258 0.799
plot(log(x), log(y), main = "Model", xlab = "ln(Annual discharge)", ylab = "ln(Peak flow)")
abline(a = coef(model)[1], b = coef(model)[2]) # add regression line
plot(model$fitted.values, model$residuals, main = "Residual plot", xlab = "Fitted value",
ylab = "Residual")
abline(a = 0, b = 0)
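As an aside, the fitted model \(\widehat{\ln y} = 5.258 + 0.799 \ln x\) corresponds to a power law on the original scale, \(\hat{y} = e^{5.258} x^{0.799}\); the multiplicative constant is
exp(coef(model)[1]) # e^5.258, about 192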
We don't need much new R to handle 8.3. To get a model from lm()
with y depending on several independent variables, separate those
variables with “+”. e.g. For a linear model of \(y\) depending linearly
on \(x_1\), \(x_2\), and \(x_3\), we can use lm(y ~ x.1 + x.2 + x.3).
e.g. Here's example #16 from the 8.3 notes. Notice that it uses
“polynomial regression,” in which we trick the regression computation
into treating \(x^2\) as a second independent variable. A more typical
example would set x.2 (below) from raw data, not from the square of
x.
x = c(1, 1.5, 2, 2.5, 3)
y = c(99, 96, 88, 76, 66)
x.1 = x
(x.2 = x^2)
## [1] 1.00 2.25 4.00 6.25 9.00
(model = lm(y ~ x.1 + x.2))
##
## Call:
## lm(formula = y ~ x.1 + x.2)
##
## Coefficients:
## (Intercept) x.1 x.2
## 101.40 3.37 -5.14
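(An aside: R can form the square inside the formula itself using I(), which protects x^2 from being interpreted as formula syntax, so the same model can be fit without building x.2 by hand.)
lm(y ~ x + I(x^2)) # same coefficients as above: 101.40, 3.37, -5.14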
(y.hat = model$fitted.values)
## 1 2 3 4 5
## 99.63 94.89 87.57 77.69 65.23
(e = model$residuals)
## 1 2 3 4 5
## -0.6286 1.1143 0.4286 -1.6857 0.7714
summary(model)
##
## Call:
## lm(formula = y ~ x.1 + x.2)
##
## Residuals:
## 1 2 3 4 5
## -0.629 1.114 0.429 -1.686 0.771
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.40 6.44 15.73 0.004 **
## x.1 3.37 7.01 0.48 0.678
## x.2 -5.14 1.73 -2.97 0.097 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.62 on 2 degrees of freedom
## Multiple R-squared: 0.993, Adjusted R-squared: 0.986
## F-statistic: 145 on 2 and 2 DF, p-value: 0.00685
(SSE = sum(e^2))
## [1] 5.257
n = length(y)
(SST = (n - 1) * var(y))
## [1] 768
(SSR = sum((y.hat - mean(y))^2))
## [1] 762.7
SST - SSE # check that this equals SSR, above
## [1] 762.7
(R.squared = SSR/SST)
## [1] 0.9932
p = 2 # number of independent variables
(F = (SSR/p)/(SSE/(n - (p + 1))))
## [1] 145.1
(p.value = 1 - pf(F, p, n - (p + 1)))
## [1] 0.006845
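(As with $sigma earlier, the \(F\) statistic is a component of summary()'s output: $fstatistic holds the value and its two degrees of freedom, as str() showed for the first model. Here out.2 is just a name chosen for this second model's summary.)
out.2 = summary(model)
out.2$fstatistic # value, numdf, dendf: about 145.1, 2, 2
1 - pf(out.2$fstatistic[1], out.2$fstatistic[2], out.2$fstatistic[3]) # about 0.006845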
Note that the last line of the last summary() call, above, shows
“F-statistic: 145 ...”, while the “(F = (SSR/p) / (SSE/(n-(p+1))))”
line gives “145.1”. R made different decisions on rounding \(F\) in
these two cases, and neither gives very many digits. We can suggest
(but not control) the number of digits to print via the options()
function:
options(digits = 15)
Now when we re-run the previous two lines, we see more digits of \(F\):
summary(model)
##
## Call:
## lm(formula = y ~ x.1 + x.2)
##
## Residuals:
## 1 2 3 4
## -0.628571428571 1.114285714286 0.428571428571 -1.685714285714
## 5
## 0.771428571429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.40000000000 6.44448823193 15.73438 0.0040149 **
## x.1 3.37142857143 7.00833323222 0.48106 0.6779608
## x.2 -5.14285714286 1.73322867293 -2.96721 0.0972878 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.62128696676 on 2 degrees of freedom
## Multiple R-squared: 0.993154761905, Adjusted R-squared: 0.98630952381
## F-statistic: 145.086956522 on 2 and 2 DF, p-value: 0.00684523809524
(F = (SSR/p)/(SSE/(n - (p + 1))))
## [1] 145.086956521739
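To go back to R's default of 7 significant digits afterward:
options(digits = 7) # restore the default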