Hald Cement Data

Hald [1952] presented a dataset concerning the heat involved in calories per gram of cement (𝑌) as a function of the amount of each of the following four ingredients in the mixture:

hald<-read.csv("https://raw.githubusercontent.com/kitadasmalley/sp21_MATH376LMT/main/data/hald.csv", 
               header=TRUE)
head(hald)
##       y x1 x2 x3 x4
## 1  78.5  7 26  6 60
## 2  74.3  1 29 15 52
## 3 104.3 11 56  8 20
## 4  87.6 11 31  8 47
## 5  95.9  7 52  6 33
## 6 109.2 11 55  9 22

Variable Selection

In variable selection, there are \(2^p\) possible sets of variables to consider for the model. This can get large fast! By performing forward or backward selection, this can greatly reduce the set of models to consider.

Forward Selection

Step 0: Start with a null (empty model)

Step 1: One variable models

n=dim(hald)[1]
alpha=0.15

# Look at models with one variable 
p=1
this.df=n-p-1

# Find the F_in cut-off
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.394941
Consider (\(x_1\)) tricalcium aluminate
## A: 𝑥1: tricalcium aluminate
mod1a<-lm(y~x1, data=hald)
anova(mod1a)#F-value: 12.602
## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq F value   Pr(>F)   
## x1         1 1450.1 1450.08  12.602 0.004552 **
## Residuals 11 1265.7  115.06                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Consider (\(x_2\)) tricalcium silicate
## B: 𝑥2: tricalcium silicate
mod1b<-lm(y~x2, data=hald)
anova(mod1b)#F-value: 21.961
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## x2         1 1809.43 1809.43  21.961 0.0006648 ***
## Residuals 11  906.34   82.39                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Consider (\(x_3\)) tetracalcium alumino ferrit
## C: 𝑥3: tetracalcium alumino ferrite
mod1c<-lm(y~x3, data=hald)
anova(mod1c)#F-value: 4.4034
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value  Pr(>F)  
## x3         1  776.36  776.36  4.4034 0.05976 .
## Residuals 11 1939.40  176.31                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Consider (\(x_4\)) dicalcium silicate
## D: 𝑥4: dicalcium silicate
mod1d<-lm(y~x4, data=hald)
anova(mod1d)#F-value: 22.799
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## x4         1 1831.90 1831.90  22.799 0.0005762 ***
## Residuals 11  883.87   80.35                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Dicalcium silicate ENTERS THE MODEL!

Step 2: Two variable models

# NOW LOOK AT TWO VARIABLE MODELS
p=2
this.df=n-p-1

# F_in 
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.431217
Consider (\(x_1\)) tricalcium aluminate with \(x_4\)
## A: 𝑥1: tricalcium aluminate
mod2a<-lm(y~x4+x1, data=hald)
anova(mod2a)#F-value: 108.22
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## x4         1 1831.90 1831.90  245.03 2.319e-08 ***
## x1         1  809.10  809.10  108.22 1.105e-06 ***
## Residuals 10   74.76    7.48                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Consider (\(x_2\)) tricalcium silicate with \(x_4\)
## B: 𝑥2: tricalcium silicate
mod2b<-lm(y~x4+x2, data=hald)
anova(mod2b)#F-value: 0.1725
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## x4         1 1831.90 1831.90 21.0834 0.0009927 ***
## x2         1   14.99   14.99  0.1725 0.6866842    
## Residuals 10  868.88   86.89                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Consider (\(x_3\)) tetracalcium alumino ferrit with \(x_4\)
## C: 𝑥3: tetracalcium alumino ferrite
mod2c<-lm(y~x4+x3, data=hald)
anova(mod2c)#F-value: 40.295
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## x4         1 1831.90 1831.90 104.240 1.314e-06 ***
## x3         1  708.13  708.13  40.295 8.375e-05 ***
## Residuals 10  175.74   17.57                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Tricalcium aluminatee ENTERS THE MODEL

Step 3: Three variable models

# NOW LOOK AT THREE VARIABLE MODELS
p=3
this.df=n-p-1

# F_in 
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.476644
Consider (\(x_2\)) tricalcium silicate with \(x_4\) and \(x_1\)
## B: 𝑥2: tricalcium silicate
mod3b<-lm(y~x4+x1+x2, data=hald)
anova(mod3b)#F-value: 5.0259
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## x4         1 1831.90 1831.90 343.6758 1.771e-08 ***
## x1         1  809.10  809.10 151.7934 6.150e-07 ***
## x2         1   26.79   26.79   5.0259   0.05169 .  
## Residuals  9   47.97    5.33                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Consider (\(x_3\)) tetracalcium alumino ferrit with \(x_4\) and \(x_1\)
## C: 𝑥3: tetracalcium alumino ferrite
mod3c<-lm(y~x4+x1+x3, data=hald)
anova(mod3c)#F-value: 4.2358
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## x4         1 1831.90 1831.90 324.3179 2.285e-08 ***
## x1         1  809.10  809.10 143.2435 7.875e-07 ***
## x3         1   23.93   23.93   4.2358   0.06969 .  
## Residuals  9   50.84    5.65                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Tricalcium silicate ENTERS THE MODEL!

Step 4: Four variable models

p=4
this.df=n-p-1

# F_in 
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.535169
Consider (\(x_3\)) tetracalcium alumino ferrit with \(x_4\), \(x_1\), and \(x_2\)
## C: 𝑥3: tetracalcium alumino ferrite
mod4c<-lm(y~x4+x1+x2+x3, data=hald)
anova(mod4c)#F-value: 0.0182
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## x4         1 1831.90 1831.90 306.1859 1.161e-07 ***
## x1         1  809.10  809.10 135.2350 2.722e-06 ***
## x2         1   26.79   26.79   4.4776   0.06724 .  
## x3         1    0.11    0.11   0.0182   0.89592    
## Residuals  8   47.86    5.98                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Tetracalcium alumino ferrite IS NOT SIGNIFICANT

Step 5: Report the final model

Our final model for calories per gram of cement (\(Y\)) is with dicalcium silicate, tricalcium aluminate, tricalcium silicate

Backward Elimination

Step 1: Start with the saturated/full model

## STEP 1: NOW LOOK AT SATURATED MODEL
p=4
this.df=n-p-1

# F_in 
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.535169
modBack_1<-lm(y~x4+x1+x2+x3, data=hald)
anova(modBack_1)
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## x4         1 1831.90 1831.90 306.1859 1.161e-07 ***
## x1         1  809.10  809.10 135.2350 2.722e-06 ***
## x2         1   26.79   26.79   4.4776   0.06724 .  
## x3         1    0.11    0.11   0.0182   0.89592    
## Residuals  8   47.86    5.98                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# x3 has the lowerst F-value and its smaller than the threshold
# take x3 out 
Take out \(x_3\)

Step 2: Consider eliminating one variable

## STEP 2: LOOK AT 3 Variable Model to Remove One
p=3
this.df=n-p-1

# F_in 
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 2.476644
modBack_2<-lm(y~x4+x1+x2, data=hald)
anova(modBack_2)
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## x4         1 1831.90 1831.90 343.6758 1.771e-08 ***
## x1         1  809.10  809.10 151.7934 6.150e-07 ***
## x2         1   26.79   26.79   5.0259   0.05169 .  
## Residuals  9   47.97    5.33                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can stop here because the lowest F-stat is still bigger than the threshold

Automate with the leaps package

## We can automate with the leaps package
library(leaps)

# FORWARD STEPWISE SELECTION
regfit.fwd=regsubsets(y~., data=hald,
                      method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = hald, method = "forward")
## 4 Variables  (and intercept)
##    Forced in Forced out
## x1     FALSE      FALSE
## x2     FALSE      FALSE
## x3     FALSE      FALSE
## x4     FALSE      FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: forward
##          x1  x2  x3  x4 
## 1  ( 1 ) " " " " " " "*"
## 2  ( 1 ) "*" " " " " "*"
## 3  ( 1 ) "*" "*" " " "*"
## 4  ( 1 ) "*" "*" "*" "*"
# BACKWARD STEPWISE SELECTION
regfit.bwd=regsubsets(y~., data=hald,
                      method="backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = hald, method = "backward")
## 4 Variables  (and intercept)
##    Forced in Forced out
## x1     FALSE      FALSE
## x2     FALSE      FALSE
## x3     FALSE      FALSE
## x4     FALSE      FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: backward
##          x1  x2  x3  x4 
## 1  ( 1 ) " " "*" " " " "
## 2  ( 1 ) "*" "*" " " " "
## 3  ( 1 ) "*" "*" " " "*"
## 4  ( 1 ) "*" "*" "*" "*"
# BEST SUBSET
regfit.full=regsubsets(y~., data=hald)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = hald)
## 4 Variables  (and intercept)
##    Forced in Forced out
## x1     FALSE      FALSE
## x2     FALSE      FALSE
## x3     FALSE      FALSE
## x4     FALSE      FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
##          x1  x2  x3  x4 
## 1  ( 1 ) " " " " " " "*"
## 2  ( 1 ) "*" "*" " " " "
## 3  ( 1 ) "*" "*" " " "*"
## 4  ( 1 ) "*" "*" "*" "*"