The following data (bloodpress.txt) on 20 individuals with high blood pressure

The researchers were interested in determining if a relationship exists between blood pressure and age, weight, body surface area, duration, pulse rate and/or stress level.

rm(list=ls())
bloodpress <- read.table("./bloodpress.txt", header = T)
( cor.bp <- round ( cor(bloodpress[,-1]) ,3))
##           BP   Age Weight   BSA   Dur Pulse Stress
## BP     1.000 0.659  0.950 0.866 0.293 0.721  0.164
## Age    0.659 1.000  0.407 0.378 0.344 0.619  0.368
## Weight 0.950 0.407  1.000 0.875 0.201 0.659  0.034
## BSA    0.866 0.378  0.875 1.000 0.131 0.465  0.018
## Dur    0.293 0.344  0.201 0.131 1.000 0.402  0.312
## Pulse  0.721 0.619  0.659 0.465 0.402 1.000  0.506
## Stress 0.164 0.368  0.034 0.018 0.312 0.506  1.000
cor.bp *(cor.bp>0.8)
##           BP Age Weight   BSA Dur Pulse Stress
## BP     1.000   0  0.950 0.866   0     0      0
## Age    0.000   1  0.000 0.000   0     0      0
## Weight 0.950   0  1.000 0.875   0     0      0
## BSA    0.866   0  0.875 1.000   0     0      0
## Dur    0.000   0  0.000 0.000   1     0      0
## Pulse  0.000   0  0.000 0.000   0     1      0
## Stress 0.000   0  0.000 0.000   0     0      1

Topic 1 Effects of multicollinearity:

regress BP on Weight and BSA, check out the changes when a new variable which is correlated to the existing variable is added to the model.

## 
## Call:
## lm(formula = BP ~ Weight, data = bloodpress)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6933 -0.9318 -0.4935  0.7703  4.8656 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.20531    8.66333   0.255    0.802    
## Weight       1.20093    0.09297  12.917 1.53e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.74 on 18 degrees of freedom
## Multiple R-squared:  0.9026, Adjusted R-squared:  0.8972 
## F-statistic: 166.9 on 1 and 18 DF,  p-value: 1.528e-10
## 
## Call:
## lm(formula = BP ~ BSA, data = bloodpress)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.314 -1.963 -0.197  1.934  4.831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   45.183      9.392   4.811  0.00014 ***
## BSA           34.443      4.690   7.343 8.11e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.79 on 18 degrees of freedom
## Multiple R-squared:  0.7497, Adjusted R-squared:  0.7358 
## F-statistic: 53.93 on 1 and 18 DF,  p-value: 8.114e-07
## 
## Call:
## lm(formula = BP ~ Weight + BSA, data = bloodpress)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8932 -1.1961 -0.4061  1.0764  4.7524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.6534     9.3925   0.602    0.555    
## Weight        1.0387     0.1927   5.392 4.87e-05 ***
## BSA           5.8313     6.0627   0.962    0.350    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.744 on 17 degrees of freedom
## Multiple R-squared:  0.9077, Adjusted R-squared:  0.8968 
## F-statistic: 83.54 on 2 and 17 DF,  p-value: 1.607e-09
## 
## Call:
## lm(formula = BP ~ BSA + Weight, data = bloodpress)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8932 -1.1961 -0.4061  1.0764  4.7524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.6534     9.3925   0.602    0.555    
## BSA           5.8313     6.0627   0.962    0.350    
## Weight        1.0387     0.1927   5.392 4.87e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.744 on 17 degrees of freedom
## Multiple R-squared:  0.9077, Adjusted R-squared:  0.8968 
## F-statistic: 83.54 on 2 and 17 DF,  p-value: 1.607e-09
## Analysis of Variance Table
## 
## Response: BP
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Weight     1 505.47  505.47  166.86 1.528e-10 ***
## Residuals 18  54.53    3.03                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Variance Table
## 
## Response: BP
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## BSA        1 419.86  419.86  53.927 8.114e-07 ***
## Residuals 18 140.14    7.79                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Variance Table
## 
## Response: BP
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## Weight     1 505.47  505.47 166.1648 3.341e-10 ***
## BSA        1   2.81    2.81   0.9251    0.3496    
## Residuals 17  51.71    3.04                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Variance Table
## 
## Response: BP
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## BSA        1 419.86  419.86 138.021 1.391e-09 ***
## Weight     1  88.43   88.43  29.069 4.871e-05 ***
## Residuals 17  51.71    3.04                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Topic 2, how to detect the multicollinearity – Variation Inflation Factor.

VIF definition

model1 <- lm(BP~., data = bloodpress[,-1])
summary(model1)
## 
## Call:
## lm(formula = BP ~ ., data = bloodpress[, -1])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93213 -0.11314  0.03064  0.21834  0.48454 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12.870476   2.556650  -5.034 0.000229 ***
## Age           0.703259   0.049606  14.177 2.76e-09 ***
## Weight        0.969920   0.063108  15.369 1.02e-09 ***
## BSA           3.776491   1.580151   2.390 0.032694 *  
## Dur           0.068383   0.048441   1.412 0.181534    
## Pulse        -0.084485   0.051609  -1.637 0.125594    
## Stress        0.005572   0.003412   1.633 0.126491    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4072 on 13 degrees of freedom
## Multiple R-squared:  0.9962, Adjusted R-squared:  0.9944 
## F-statistic: 560.6 on 6 and 13 DF,  p-value: 6.395e-15
#install.packages("lme4")
#install.packages("car")
library(lme4)
## Loading required package: Matrix
library(car)
car::vif(model1)
##      Age   Weight      BSA      Dur    Pulse   Stress 
## 1.762807 8.417035 5.328751 1.237309 4.413575 1.834845

Topic 3, regress on categorial and continuous variables – ANCOVA (analysis of covariance)

crates <- data.frame(list(
    Day = 1:9,
    x1 = c(1,1,1,0,0,1,0,1,1),
    x2 = c(0,1,0,1,1,1,1,0,1),
    y  = c(48,51,39,24,24,27,12,27,24)
))

#a) 
Model.1 <- lm(y~ x1+x2 -1, crates)
summary(Model.1)
## 
## Call:
## lm(formula = y ~ x1 + x2 - 1, data = crates)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##    -18     -3      9     12     18 
## 
## Coefficients:
##    Estimate Std. Error t value Pr(>|t|)   
## x1   30.000      6.503   4.613  0.00245 **
## x2   12.000      6.503   1.845  0.10749   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.79 on 7 degrees of freedom
## Multiple R-squared:  0.8635, Adjusted R-squared:  0.8245 
## F-statistic: 22.14 on 2 and 7 DF,  p-value: 0.0009404
res <- crates$y - Model.1$fitted.values
sum(crates$x1*res)
## [1] 7.105427e-15
sum(crates$x2*res)
## [1] 5.329071e-15
## test sum of slope =30?
X <- model.matrix((Model.1))
var.12 <- sum(solve(t(X)%*%X)) * sum(res^2)/(9-2)
t = (sum(Model.1$coefficients)-30)/sqrt(var.12)
(1- pt(q = t, df = 7))*2
## [1] 0.1074942
## a new factor variable  U
crates$U <- as.factor(c(1,3,1,2,2,3,2,1,3))
Model.2 <- lm(y~U, crates)
anova(Model.2)
## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq F value Pr(>F)
## U          2    536     268   2.127 0.2003
## Residuals  6    756     126
#(f)
newdata <- crates
newdata $y.hat <- mean(crates$y)
newdata$'y.bar- y.hat' <- Model.2$fitted.values -newdata$y.hat
newdata$'y - y.bar' <- newdata$y - Model.2$fitted.values
newdata
##   Day x1 x2  y U    y.hat y.bar- y.hat y - y.bar
## 1   1  1  0 48 1 30.66667     7.333333        10
## 2   2  1  1 51 3 30.66667     3.333333        17
## 3   3  1  0 39 1 30.66667     7.333333         1
## 4   4  0  1 24 2 30.66667   -10.666667         4
## 5   5  0  1 24 2 30.66667   -10.666667         4
## 6   6  1  1 27 3 30.66667     3.333333        -7
## 7   7  0  1 12 2 30.66667   -10.666667        -8
## 8   8  1  0 27 1 30.66667     7.333333       -11
## 9   9  1  1 24 3 30.66667     3.333333       -10
#(g)
Model.3 <- lm(y~ x1 + x2+ x1*x2 +Day -1 ,crates)
summary(Model.3)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x1 * x2 + Day - 1, data = crates)
## 
## Residuals:
##       1       2       3       4       5       6       7       8       9 
## -0.4639  4.2108 -2.4880 -0.6506  2.8373 -5.8373 -2.1867  2.9518  1.6265 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## x1     51.9518     3.2112  16.178 1.64e-05 ***
## x2     38.6024     3.7466  10.303 0.000148 ***
## Day    -3.4880     0.5471  -6.375 0.001405 ** 
## x1:x2 -36.7892     4.5376  -8.108 0.000463 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.07 on 5 degrees of freedom
## Multiple R-squared:  0.9915, Adjusted R-squared:  0.9847 
## F-statistic:   146 on 4 and 5 DF,  p-value: 2.31e-05
anova(Model.3)
## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## x1         1 7776.0  7776.0 469.4218 3.886e-06 ***
## x2         1  648.0   648.0  39.1185  0.001532 ** 
## Day        1  160.3   160.3   9.6769  0.026527 *  
## x1:x2      1 1088.9  1088.9  65.7333  0.000463 ***
## Residuals  5   82.8    16.6                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfcol=c(2,2))
plot(Model.3, which = 1)
plot(Model.3, which = 2)
plot(1:9, Model.3$residuals,type = "b")

#(h)
par(mfcol=c(2,2))

plot(Model.1, which =1)
plot(Model.1, which =2)
plot(1:9, Model.1$residuals,type = "b")

par(mfcol=c(2,2))

plot(Model.2, which =1)
plot(Model.2, which =2)
plot(1:9, Model.2$residuals,type = "b")

## time trend is very clear


## catergorical and continuous variables
Model.4 <- lm(y ~ U+Day+Day*U, data = crates)
summary(Model.4)
## 
## Call:
## lm(formula = y ~ U + Day + Day * U, data = crates)
## 
## Residuals:
##       1       2       3       4       5       6       7       8       9 
##  1.3462  2.4324 -1.8846 -1.7143  2.5714 -5.6757 -0.8571  0.5385  3.2432 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   49.538      4.519  10.961  0.00163 **
## U2            -6.681     12.603  -0.530  0.63274   
## U3             6.975      7.458   0.935  0.41868   
## Day           -2.885      0.910  -3.170  0.05048 . 
## U2:Day        -1.401      2.333  -0.601  0.59043   
## U3:Day        -1.088      1.304  -0.835  0.46522   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.64 on 3 degrees of freedom
## Multiple R-squared:   0.95,  Adjusted R-squared:  0.8667 
## F-statistic:  11.4 on 5 and 3 DF,  p-value: 0.03626
# verifying the Least square estimate
X <- model.matrix(Model.1)
solve(t(X)%*%X) %*% t(X)%*% crates$y
##    [,1]
## x1   30
## x2   12