Hald [1952] presented a dataset concerning the heat involved in calories per gram of cement (𝑌) as a function of the amount of each of the following four ingredients in the mixture:
hald<-read.csv("https://raw.githubusercontent.com/kitadasmalley/sp21_MATH376LMT/main/data/hald.csv",
header=TRUE)
head(hald)
## y x1 x2 x3 x4
## 1 78.5 7 26 6 60
## 2 74.3 1 29 15 52
## 3 104.3 11 56 8 20
## 4 87.6 11 31 8 47
## 5 95.9 7 52 6 33
## 6 109.2 11 55 9 22
In variable selection, there are \(2^p\) possible sets of variables to consider for the model. This can get large fast! By performing forward or backward selection, this can greatly reduce the set of models to consider.
n=dim(hald)[1]
alpha=0.15
# Look at models with one variable
p=1
this.df=n-p-1
# Find the F_in cut-off
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.394941
## A: 𝑥1: tricalcium aluminate
mod1a<-lm(y~x1, data=hald)
anova(mod1a)#F-value: 12.602
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 1450.1 1450.08 12.602 0.004552 **
## Residuals 11 1265.7 115.06
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## B: 𝑥2: tricalcium silicate
mod1b<-lm(y~x2, data=hald)
anova(mod1b)#F-value: 21.961
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x2 1 1809.43 1809.43 21.961 0.0006648 ***
## Residuals 11 906.34 82.39
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## C: 𝑥3: tetracalcium alumino ferrite
mod1c<-lm(y~x3, data=hald)
anova(mod1c)#F-value: 4.4034
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x3 1 776.36 776.36 4.4034 0.05976 .
## Residuals 11 1939.40 176.31
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## D: 𝑥4: dicalcium silicate
mod1d<-lm(y~x4, data=hald)
anova(mod1d)#F-value: 22.799
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 22.799 0.0005762 ***
## Residuals 11 883.87 80.35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOW LOOK AT TWO VARIABLE MODELS
p=2
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.431217
## A: 𝑥1: tricalcium aluminate
mod2a<-lm(y~x4+x1, data=hald)
anova(mod2a)#F-value: 108.22
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 245.03 2.319e-08 ***
## x1 1 809.10 809.10 108.22 1.105e-06 ***
## Residuals 10 74.76 7.48
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## B: 𝑥2: tricalcium silicate
mod2b<-lm(y~x4+x2, data=hald)
anova(mod2b)#F-value: 0.1725
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 21.0834 0.0009927 ***
## x2 1 14.99 14.99 0.1725 0.6866842
## Residuals 10 868.88 86.89
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## C: 𝑥3: tetracalcium alumino ferrite
mod2c<-lm(y~x4+x3, data=hald)
anova(mod2c)#F-value: 40.295
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 104.240 1.314e-06 ***
## x3 1 708.13 708.13 40.295 8.375e-05 ***
## Residuals 10 175.74 17.57
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOW LOOK AT THREE VARIABLE MODELS
p=3
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.476644
## B: 𝑥2: tricalcium silicate
mod3b<-lm(y~x4+x1+x2, data=hald)
anova(mod3b)#F-value: 5.0259
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 343.6758 1.771e-08 ***
## x1 1 809.10 809.10 151.7934 6.150e-07 ***
## x2 1 26.79 26.79 5.0259 0.05169 .
## Residuals 9 47.97 5.33
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## C: 𝑥3: tetracalcium alumino ferrite
mod3c<-lm(y~x4+x1+x3, data=hald)
anova(mod3c)#F-value: 4.2358
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 324.3179 2.285e-08 ***
## x1 1 809.10 809.10 143.2435 7.875e-07 ***
## x3 1 23.93 23.93 4.2358 0.06969 .
## Residuals 9 50.84 5.65
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
p=4
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.535169
## C: 𝑥3: tetracalcium alumino ferrite
mod4c<-lm(y~x4+x1+x2+x3, data=hald)
anova(mod4c)#F-value: 0.0182
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 306.1859 1.161e-07 ***
## x1 1 809.10 809.10 135.2350 2.722e-06 ***
## x2 1 26.79 26.79 4.4776 0.06724 .
## x3 1 0.11 0.11 0.0182 0.89592
## Residuals 8 47.86 5.98
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Our final model for calories per gram of cement (\(Y\)) is with dicalcium silicate, tricalcium aluminate, tricalcium silicate
## STEP 1: NOW LOOK AT SATURATED MODEL
p=4
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.535169
modBack_1<-lm(y~x4+x1+x2+x3, data=hald)
anova(modBack_1)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 306.1859 1.161e-07 ***
## x1 1 809.10 809.10 135.2350 2.722e-06 ***
## x2 1 26.79 26.79 4.4776 0.06724 .
## x3 1 0.11 0.11 0.0182 0.89592
## Residuals 8 47.86 5.98
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# x3 has the lowerst F-value and its smaller than the threshold
# take x3 out
## STEP 2: LOOK AT 3 Variable Model to Remove One
p=3
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.476644
modBack_2<-lm(y~x4+x1+x2, data=hald)
anova(modBack_2)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 1831.90 1831.90 343.6758 1.771e-08 ***
## x1 1 809.10 809.10 151.7934 6.150e-07 ***
## x2 1 26.79 26.79 5.0259 0.05169 .
## Residuals 9 47.97 5.33
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leaps
package## We can automate with the leaps package
library(leaps)
# FORWARD STEPWISE SELECTION
regfit.fwd=regsubsets(y~., data=hald,
method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = hald, method = "forward")
## 4 Variables (and intercept)
## Forced in Forced out
## x1 FALSE FALSE
## x2 FALSE FALSE
## x3 FALSE FALSE
## x4 FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: forward
## x1 x2 x3 x4
## 1 ( 1 ) " " " " " " "*"
## 2 ( 1 ) "*" " " " " "*"
## 3 ( 1 ) "*" "*" " " "*"
## 4 ( 1 ) "*" "*" "*" "*"
# BACKWARD STEPWISE SELECTION
regfit.bwd=regsubsets(y~., data=hald,
method="backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = hald, method = "backward")
## 4 Variables (and intercept)
## Forced in Forced out
## x1 FALSE FALSE
## x2 FALSE FALSE
## x3 FALSE FALSE
## x4 FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: backward
## x1 x2 x3 x4
## 1 ( 1 ) " " "*" " " " "
## 2 ( 1 ) "*" "*" " " " "
## 3 ( 1 ) "*" "*" " " "*"
## 4 ( 1 ) "*" "*" "*" "*"
# BEST SUBSET
regfit.full=regsubsets(y~., data=hald)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = hald)
## 4 Variables (and intercept)
## Forced in Forced out
## x1 FALSE FALSE
## x2 FALSE FALSE
## x3 FALSE FALSE
## x4 FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
## x1 x2 x3 x4
## 1 ( 1 ) " " " " " " "*"
## 2 ( 1 ) "*" "*" " " " "
## 3 ( 1 ) "*" "*" " " "*"
## 4 ( 1 ) "*" "*" "*" "*"