# From the book Introductory Statistics with R by
# Peter Dalgaar
# Chapter 11 Multiple Regression
# As an example in this chapter, we use a study concerning lung function in
# patients with cystic fibrosis in Altman (1991, p. 338). The data are in the
# cystfibr data frame in the ISwR package.
library(ISwR)
objects(grep("ISwR",search()))
## [1] "alkfos" "ashina" "bcmort"
## [4] "bp.obese" "caesar.shoe" "coking"
## [7] "cystfibr" "eba1977" "energy"
## [10] "ewrates" "fake.trypsin" "graft.vs.host"
## [13] "heart.rate" "hellung" "IgM"
## [16] "intake" "juul" "juul2"
## [19] "kfm" "lung" "malaria"
## [22] "melanom" "nickel" "nickel.expand"
## [25] "philion" "react" "red.cell.folate"
## [28] "rmr" "secher" "secretin"
## [31] "stroke" "tb.dilute" "thuesen"
## [34] "tlc" "vitcap" "vitcap2"
## [37] "wright" "zelazo"
names(cystfibr)
## [1] "age" "sex" "height" "weight" "bmp" "fev1" "rv"
## [8] "frc" "tlc" "pemax"
str(cystfibr)
## 'data.frame': 25 obs. of 10 variables:
## $ age : int 7 7 8 8 8 9 11 12 12 13 ...
## $ sex : int 0 1 0 1 0 0 1 1 0 1 ...
## $ height: int 109 112 124 125 127 130 139 150 146 155 ...
## $ weight: num 13.1 12.9 14.1 16.2 21.5 17.5 30.7 28.4 25.1 31.5 ...
## $ bmp : int 68 65 64 67 93 68 89 69 67 68 ...
## $ fev1 : int 32 19 22 41 52 44 28 18 24 23 ...
## $ rv : int 258 449 441 234 202 308 305 369 312 413 ...
## $ frc : int 183 245 268 146 131 155 179 198 194 225 ...
## $ tlc : int 137 134 147 124 104 118 119 103 128 136 ...
## $ pemax : int 95 85 100 85 95 80 65 110 70 95 ...
model <- lm(cystfibr$pemax~.,data = cystfibr)
summary(model)
##
## Call:
## lm(formula = cystfibr$pemax ~ ., data = cystfibr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.338 -11.532 1.081 13.386 33.405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 176.0582 225.8912 0.779 0.448
## age -2.5420 4.8017 -0.529 0.604
## sex -3.7368 15.4598 -0.242 0.812
## height -0.4463 0.9034 -0.494 0.628
## weight 2.9928 2.0080 1.490 0.157
## bmp -1.7449 1.1552 -1.510 0.152
## fev1 1.0807 1.0809 1.000 0.333
## rv 0.1970 0.1962 1.004 0.331
## frc -0.3084 0.4924 -0.626 0.540
## tlc 0.1886 0.4997 0.377 0.711
##
## Residual standard error: 25.47 on 15 degrees of freedom
## Multiple R-squared: 0.6373, Adjusted R-squared: 0.4197
## F-statistic: 2.929 on 9 and 15 DF, p-value: 0.03195
# The layout should be well known by now. Notice that there is not one
# single significant t value, but the joint F test is nevertheless significant,
# so there must be an effect somewhere. The reason is that the t tests only
# say something about what happens if you remove one variable and leave
# in all the others. You cannot see whether a variable would be statistically
# significant in a reduced model; all you can see is that no variable must be
# included.
# From the book ISLR fourth printing
# page 244
# 6.5 Lab 1: Subset Selection Methods
library(leaps)
regfit.full = regsubsets(pemax~.,cystfibr)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(pemax ~ ., cystfibr)
## 9 Variables (and intercept)
## Forced in Forced out
## age FALSE FALSE
## sex FALSE FALSE
## height FALSE FALSE
## weight FALSE FALSE
## bmp FALSE FALSE
## fev1 FALSE FALSE
## rv FALSE FALSE
## frc FALSE FALSE
## tlc FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## age sex height weight bmp fev1 rv frc tlc
## 1 ( 1 ) " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" "*" " " " " " " " "
## 3 ( 1 ) " " " " " " "*" "*" "*" " " " " " "
## 4 ( 1 ) " " " " " " "*" "*" "*" "*" " " " "
## 5 ( 1 ) " " " " " " "*" "*" "*" "*" " " "*"
## 6 ( 1 ) "*" " " "*" "*" "*" "*" "*" " " " "
## 7 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" " "
## 8 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*"
# An asterisk indicates that a given variable is included in the corresponding
# model. For instance, this output indicates that the best two-variable model
# contains age,height,weght,bmp,fev1 and rv
names(regfit.full)
## [1] "np" "nrbar" "d" "rbar" "thetab"
## [6] "first" "last" "vorder" "tol" "rss"
## [11] "bound" "nvmax" "ress" "ir" "nbest"
## [16] "lopt" "il" "ier" "xnames" "method"
## [21] "force.in" "force.out" "sserr" "intercept" "lindep"
## [26] "nullrss" "nn" "call"
regfit.full=regsubsets(pemax~.,data = cystfibr,nvmax = 19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
reg.summary$rsq
## [1] 0.4035070 0.4748731 0.5699943 0.6141043 0.6214494 0.6266394 0.6316019
## [8] 0.6359228 0.6373354
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab = "Number of Variables",ylab = "RSS",
type = "l")
plot(reg.summary$adjr2,xlab = "Number of Variables", ylab = "Adjusted rSq",type = "l")
which.max(reg.summary$adjr2)
## [1] 4
points(4,reg.summary$adjr2[4],col="red",cex=2,pch=20)
model <- lm(cystfibr$pemax~.,data = cystfibr)
summary(model)
##
## Call:
## lm(formula = cystfibr$pemax ~ ., data = cystfibr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.338 -11.532 1.081 13.386 33.405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 176.0582 225.8912 0.779 0.448
## age -2.5420 4.8017 -0.529 0.604
## sex -3.7368 15.4598 -0.242 0.812
## height -0.4463 0.9034 -0.494 0.628
## weight 2.9928 2.0080 1.490 0.157
## bmp -1.7449 1.1552 -1.510 0.152
## fev1 1.0807 1.0809 1.000 0.333
## rv 0.1970 0.1962 1.004 0.331
## frc -0.3084 0.4924 -0.626 0.540
## tlc 0.1886 0.4997 0.377 0.711
##
## Residual standard error: 25.47 on 15 degrees of freedom
## Multiple R-squared: 0.6373, Adjusted R-squared: 0.4197
## F-statistic: 2.929 on 9 and 15 DF, p-value: 0.03195
model1 <- lm(cystfibr$pemax~age+weight+bmp+fev1,data = cystfibr)
summary(model1)
##
## Call:
## lm(formula = cystfibr$pemax ~ age + weight + bmp + fev1, data = cystfibr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.521 -10.885 3.003 15.488 41.767
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 179.2957 61.8855 2.897 0.00891 **
## age -3.4181 3.3086 -1.033 0.31389
## weight 2.6882 1.1727 2.292 0.03287 *
## bmp -2.0657 0.8198 -2.520 0.02036 *
## fev1 1.0882 0.5139 2.117 0.04695 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.4 on 20 degrees of freedom
## Multiple R-squared: 0.5918, Adjusted R-squared: 0.5101
## F-statistic: 7.248 on 4 and 20 DF, p-value: 0.0008891
