# From the book Introductory Statistics with R by
# Peter Dalgaar
# Chapter 11 Multiple Regression
# As an example in this chapter, we use a study concerning lung function in
# patients with cystic fibrosis in Altman (1991, p. 338). The data are in the
# cystfibr data frame in the ISwR package.
library(ISwR)
objects(grep("ISwR",search()))
##  [1] "alkfos"          "ashina"          "bcmort"         
##  [4] "bp.obese"        "caesar.shoe"     "coking"         
##  [7] "cystfibr"        "eba1977"         "energy"         
## [10] "ewrates"         "fake.trypsin"    "graft.vs.host"  
## [13] "heart.rate"      "hellung"         "IgM"            
## [16] "intake"          "juul"            "juul2"          
## [19] "kfm"             "lung"            "malaria"        
## [22] "melanom"         "nickel"          "nickel.expand"  
## [25] "philion"         "react"           "red.cell.folate"
## [28] "rmr"             "secher"          "secretin"       
## [31] "stroke"          "tb.dilute"       "thuesen"        
## [34] "tlc"             "vitcap"          "vitcap2"        
## [37] "wright"          "zelazo"
names(cystfibr)
##  [1] "age"    "sex"    "height" "weight" "bmp"    "fev1"   "rv"    
##  [8] "frc"    "tlc"    "pemax"
str(cystfibr)
## 'data.frame':    25 obs. of  10 variables:
##  $ age   : int  7 7 8 8 8 9 11 12 12 13 ...
##  $ sex   : int  0 1 0 1 0 0 1 1 0 1 ...
##  $ height: int  109 112 124 125 127 130 139 150 146 155 ...
##  $ weight: num  13.1 12.9 14.1 16.2 21.5 17.5 30.7 28.4 25.1 31.5 ...
##  $ bmp   : int  68 65 64 67 93 68 89 69 67 68 ...
##  $ fev1  : int  32 19 22 41 52 44 28 18 24 23 ...
##  $ rv    : int  258 449 441 234 202 308 305 369 312 413 ...
##  $ frc   : int  183 245 268 146 131 155 179 198 194 225 ...
##  $ tlc   : int  137 134 147 124 104 118 119 103 128 136 ...
##  $ pemax : int  95 85 100 85 95 80 65 110 70 95 ...
model <- lm(cystfibr$pemax~.,data = cystfibr)
summary(model)
## 
## Call:
## lm(formula = cystfibr$pemax ~ ., data = cystfibr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.338 -11.532   1.081  13.386  33.405 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 176.0582   225.8912   0.779    0.448
## age          -2.5420     4.8017  -0.529    0.604
## sex          -3.7368    15.4598  -0.242    0.812
## height       -0.4463     0.9034  -0.494    0.628
## weight        2.9928     2.0080   1.490    0.157
## bmp          -1.7449     1.1552  -1.510    0.152
## fev1          1.0807     1.0809   1.000    0.333
## rv            0.1970     0.1962   1.004    0.331
## frc          -0.3084     0.4924  -0.626    0.540
## tlc           0.1886     0.4997   0.377    0.711
## 
## Residual standard error: 25.47 on 15 degrees of freedom
## Multiple R-squared:  0.6373, Adjusted R-squared:  0.4197 
## F-statistic: 2.929 on 9 and 15 DF,  p-value: 0.03195
# The layout should be well known by now. Notice that there is not one
# single significant t value, but the joint F test is nevertheless significant,
# so there must be an effect somewhere. The reason is that the t tests only
# say something about what happens if you remove one variable and leave
# in all the others. You cannot see whether a variable would be statistically
# significant in a reduced model; all you can see is that no variable must be
# included.
# From the book ISLR fourth printing
# page 244
# 6.5 Lab 1: Subset Selection Methods
library(leaps)
regfit.full = regsubsets(pemax~.,cystfibr)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(pemax ~ ., cystfibr)
## 9 Variables  (and intercept)
##        Forced in Forced out
## age        FALSE      FALSE
## sex        FALSE      FALSE
## height     FALSE      FALSE
## weight     FALSE      FALSE
## bmp        FALSE      FALSE
## fev1       FALSE      FALSE
## rv         FALSE      FALSE
## frc        FALSE      FALSE
## tlc        FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          age sex height weight bmp fev1 rv  frc tlc
## 1  ( 1 ) " " " " " "    "*"    " " " "  " " " " " "
## 2  ( 1 ) " " " " " "    "*"    "*" " "  " " " " " "
## 3  ( 1 ) " " " " " "    "*"    "*" "*"  " " " " " "
## 4  ( 1 ) " " " " " "    "*"    "*" "*"  "*" " " " "
## 5  ( 1 ) " " " " " "    "*"    "*" "*"  "*" " " "*"
## 6  ( 1 ) "*" " " "*"    "*"    "*" "*"  "*" " " " "
## 7  ( 1 ) "*" " " "*"    "*"    "*" "*"  "*" "*" " "
## 8  ( 1 ) "*" " " "*"    "*"    "*" "*"  "*" "*" "*"
# An asterisk indicates that a given variable is included in the corresponding
# model. For instance, this output indicates that the best two-variable model
# contains  age,height,weght,bmp,fev1 and rv
names(regfit.full)
##  [1] "np"        "nrbar"     "d"         "rbar"      "thetab"   
##  [6] "first"     "last"      "vorder"    "tol"       "rss"      
## [11] "bound"     "nvmax"     "ress"      "ir"        "nbest"    
## [16] "lopt"      "il"        "ier"       "xnames"    "method"   
## [21] "force.in"  "force.out" "sserr"     "intercept" "lindep"   
## [26] "nullrss"   "nn"        "call"
regfit.full=regsubsets(pemax~.,data = cystfibr,nvmax = 19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
reg.summary$rsq
## [1] 0.4035070 0.4748731 0.5699943 0.6141043 0.6214494 0.6266394 0.6316019
## [8] 0.6359228 0.6373354
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab = "Number of Variables",ylab = "RSS",
     type = "l")
plot(reg.summary$adjr2,xlab = "Number of Variables", ylab = "Adjusted rSq",type = "l")
which.max(reg.summary$adjr2)
## [1] 4
points(4,reg.summary$adjr2[4],col="red",cex=2,pch=20)
model <- lm(cystfibr$pemax~.,data = cystfibr)
summary(model)
## 
## Call:
## lm(formula = cystfibr$pemax ~ ., data = cystfibr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.338 -11.532   1.081  13.386  33.405 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 176.0582   225.8912   0.779    0.448
## age          -2.5420     4.8017  -0.529    0.604
## sex          -3.7368    15.4598  -0.242    0.812
## height       -0.4463     0.9034  -0.494    0.628
## weight        2.9928     2.0080   1.490    0.157
## bmp          -1.7449     1.1552  -1.510    0.152
## fev1          1.0807     1.0809   1.000    0.333
## rv            0.1970     0.1962   1.004    0.331
## frc          -0.3084     0.4924  -0.626    0.540
## tlc           0.1886     0.4997   0.377    0.711
## 
## Residual standard error: 25.47 on 15 degrees of freedom
## Multiple R-squared:  0.6373, Adjusted R-squared:  0.4197 
## F-statistic: 2.929 on 9 and 15 DF,  p-value: 0.03195
model1 <- lm(cystfibr$pemax~age+weight+bmp+fev1,data = cystfibr)
summary(model1)
## 
## Call:
## lm(formula = cystfibr$pemax ~ age + weight + bmp + fev1, data = cystfibr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.521 -10.885   3.003  15.488  41.767 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 179.2957    61.8855   2.897  0.00891 **
## age          -3.4181     3.3086  -1.033  0.31389   
## weight        2.6882     1.1727   2.292  0.03287 * 
## bmp          -2.0657     0.8198  -2.520  0.02036 * 
## fev1          1.0882     0.5139   2.117  0.04695 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.4 on 20 degrees of freedom
## Multiple R-squared:  0.5918, Adjusted R-squared:  0.5101 
## F-statistic: 7.248 on 4 and 20 DF,  p-value: 0.0008891