The problem corresponding to data-table-B9.csv studies the effect of four factors (x1 = superficial fluid velocity of the gas (cs/s), x2= kinematic velocity, x3= mesh opening (cm), and x4= dimensionless number relating the superficial velocity of the gas to the fluid) on the pressure drop in a screen plate bubble column (y).
1. Fit all possible first order models to this dataset (note there will 2^k where k=4 possible models). Select the best model using available criterion (AIC, AdjR^2, etc.) and further analysis of candidate models.
dat<-read.csv("C:\\Users\\abdal\\OneDrive\\Desktop\\TTU\\Spring 2023\\IE 5344\\FA\\FA14\\data-table-B9(2).csv",header=TRUE)
library(MuMIn)
## Warning: package 'MuMIn' was built under R version 4.2.3
fullmodel<-lm(y~.,data=dat,na.action="na.fail")
summary.fit<-dredge(fullmodel)
## Fixed term is "(Intercept)"
summary.fit
## Global model call: lm(formula = y ~ ., data = dat, na.action = "na.fail")
## ---
## Model selection table
## (Intrc) x1 x2 x3 x4 df logLik AICc delta weight
## 15 4.641 0.1830 34.62 4.569 5 -186.378 383.8 0.00 0.326
## 16 5.895 -0.4779 0.1827 35.40 5.844 6 -185.322 384.2 0.34 0.274
## 7 7.190 0.1846 35.12 4 -187.781 384.3 0.44 0.262
## 8 8.279 -0.2650 0.1846 35.62 5 -187.441 386.0 2.13 0.112
## 11 16.600 0.1806 4.801 4 -191.123 390.9 7.12 0.009
## 3 19.450 0.1822 3 -192.455 391.3 7.50 0.008
## 12 17.950 -0.4238 0.1803 5.936 5 -190.413 391.9 8.07 0.006
## 4 20.440 -0.2071 0.1822 4 -192.276 393.3 9.43 0.003
## 5 13.120 29.87 3 -220.520 447.5 63.63 0.000
## 1 23.510 2 -221.766 447.7 63.91 0.000
## 13 9.619 29.26 6.160 4 -219.638 448.0 64.15 0.000
## 9 19.690 6.338 3 -220.868 448.1 64.32 0.000
## 14 10.980 -0.5240 30.13 7.555 5 -219.208 449.5 65.66 0.000
## 6 14.150 -0.2485 30.34 4 -220.417 449.5 65.71 0.000
## 10 21.200 -0.4773 7.614 4 -220.525 449.8 65.92 0.000
## 2 24.460 -0.1993 3 -221.702 449.8 65.99 0.000
## Models ranked by AICc(x)
plot(summary.fit)
mod1<-lm(y~x2+x3+x4,data=dat)
#check the first 5 models with the highest AIC values
summary(mod1)
##
## Call:
## lm(formula = y ~ x2 + x3 + x4, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2730 -3.4598 -0.5632 2.7904 12.3370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.64065 4.26751 1.087 0.28134
## x2 0.18302 0.01733 10.563 3.92e-15 ***
## x3 34.62435 11.17861 3.097 0.00301 **
## x4 4.56878 2.78788 1.639 0.10667
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.056 on 58 degrees of freedom
## Multiple R-squared: 0.6807, Adjusted R-squared: 0.6642
## F-statistic: 41.21 on 3 and 58 DF, p-value: 2.146e-14
#x4 is not significant (p-values higher than 0.05)
mod2<-lm(y~x1+x2+x3+x4,data=dat)
summary(mod2)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9958 -3.3092 -0.2419 3.3924 10.5668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.89453 4.32508 1.363 0.17828
## x1 -0.47790 0.34002 -1.406 0.16530
## x2 0.18271 0.01718 10.633 3.78e-15 ***
## x3 35.40284 11.09960 3.190 0.00232 **
## x4 5.84391 2.90978 2.008 0.04935 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.014 on 57 degrees of freedom
## Multiple R-squared: 0.6914, Adjusted R-squared: 0.6697
## F-statistic: 31.92 on 4 and 57 DF, p-value: 5.818e-14
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
vif(mod2)
## x1 x2 x3 x4
## 1.111143 1.005040 1.005169 1.111589
#check for collinearity: x1 and x4 seems to be collinear, and x2 and x3 too.#x1 is not significant (p-values higher than 0.05)
mod3<-lm(y~x2+x3,data=dat)
summary(mod3)
##
## Call:
## lm(formula = y ~ x2 + x3, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0994 -3.6236 -0.6911 2.4722 14.1854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.18971 4.03031 1.784 0.07958 .
## x2 0.18456 0.01755 10.518 3.74e-15 ***
## x3 35.11616 11.33308 3.099 0.00298 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.127 on 59 degrees of freedom
## Multiple R-squared: 0.6659, Adjusted R-squared: 0.6546
## F-statistic: 58.8 on 2 and 59 DF, p-value: 9.007e-15
#All variables are significant, R sqaure = 0.65 -> best model, Multiple R-squared: 0.6659, Adjusted R-squared: 0.6546, p-value: 9.007e-15
mod4<-lm(y~x1+x2+x3,data=dat)
summary(mod4)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2097 -3.5127 -0.5276 2.5248 13.4899
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.2794 4.2662 1.941 0.05716 .
## x1 -0.2650 0.3314 -0.800 0.42722
## x2 0.1846 0.0176 10.490 5.12e-15 ***
## x3 35.6239 11.3856 3.129 0.00275 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.143 on 58 degrees of freedom
## Multiple R-squared: 0.6695, Adjusted R-squared: 0.6524
## F-statistic: 39.17 on 3 and 58 DF, p-value: 5.757e-14
#x1 is not significant (p-values higher than 0.05)
mod5<-lm(y~x2+x4,data=dat)
summary(mod5)
##
## Call:
## lm(formula = y ~ x2 + x4, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.2694 -3.7897 -0.4098 3.7020 11.9290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.59546 1.94871 8.516 7.38e-12 ***
## x2 0.18058 0.01853 9.748 6.64e-14 ***
## x4 4.80060 2.98294 1.609 0.113
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.411 on 59 degrees of freedom
## Multiple R-squared: 0.6279, Adjusted R-squared: 0.6152
## F-statistic: 49.77 on 2 and 59 DF, p-value: 2.167e-13
#x4 is not significant (p-values higher than 0.05)
#a) we scale every predictor because of collinearity. (see comment of question 1)
dat3<-data.frame(scale(dat[,1:2:3:4],center=TRUE,scale=TRUE),dat[,5])
## Warning in 1:2:3: numerical expression has 2 elements: only the first used
## Warning in 1:2:3:4: numerical expression has 3 elements: only the first used
colnames(dat3)<-c("x1","x2","x3","x4","y")
mod2_1<-lm(y~.,dat=dat3)
vif(mod2_1)
## x1 x2 x3 x4
## 1.111143 1.005040 1.005169 1.111589
#We don't need to scale because VIF is not changing.
#b)Use forward stepwise regression, which model is selected?
mod6<-lm(y~x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x3:x4 + x2:x4,data=dat)
step(mod6,direction="forward")
## Start: AIC=199.73
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x3:x4 +
## x2:x4
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 +
## x2:x3 + x3:x4 + x2:x4, data = dat)
##
## Coefficients:
## (Intercept) x1 x2 x3 x4 x1:x2
## 15.88376 0.18695 0.37921 -11.99940 -8.86442 0.01155
## x1:x3 x1:x4 x2:x3 x3:x4 x2:x4
## NA -1.11525 NA 72.85976 -0.38547
#result: lm(formula = y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x3:x4 + x2:x4, data = dat2)
#Would keep everything in.We got two NA's for x1:x3 and x2:x3, we try to delete them.
mod61<-lm(y~x1 + x2 + x3 + x4 + x1:x2 + x1:x4 + x3:x4 + x2:x4,data=dat)
step(mod61,direction="forward")
## Start: AIC=199.73
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x4 + x3:x4 + x2:x4
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x4 + x3:x4 +
## x2:x4, data = dat)
##
## Coefficients:
## (Intercept) x1 x2 x3 x4 x1:x2
## 15.88376 0.18695 0.37921 -11.99940 -8.86442 0.01155
## x1:x4 x3:x4 x2:x4
## -1.11525 72.85976 -0.38547
summary(mod61)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x4 + x3:x4 +
## x2:x4, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4804 -3.0766 -0.6635 2.9625 12.2221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.88376 23.17863 0.685 0.49616
## x1 0.18696 0.78447 0.238 0.81255
## x2 0.37921 0.06332 5.989 1.89e-07 ***
## x3 -11.99940 67.31148 -0.178 0.85919
## x4 -8.86442 35.62553 -0.249 0.80446
## x1:x2 0.01155 0.00869 1.329 0.18955
## x1:x4 -1.11525 1.14847 -0.971 0.33592
## x3:x4 72.85976 103.15353 0.706 0.48308
## x2:x4 -0.38547 0.11962 -3.222 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.683 on 53 degrees of freedom
## Multiple R-squared: 0.7496, Adjusted R-squared: 0.7118
## F-statistic: 19.83 on 8 and 53 DF, p-value: 1.947e-13
#Only x2 and x2:x4 are significant, Multiple R-squared: 0.7496, Adjusted R-squared: 0.7118, p-value: 1.947e-13
#c)Use stepwise regression in both directions starting with a first order model, which model is selected?
mod7<-lm(y~.,dat=dat)
step(mod7,direction="both")
## Start: AIC=204.7
## y ~ x1 + x2 + x3 + x4
##
## Df Sum of Sq RSS AIC
## <none> 1432.8 204.69
## - x1 1 49.66 1482.4 204.81
## - x4 1 101.39 1534.2 206.93
## - x3 1 255.72 1688.5 212.88
## - x2 1 2841.93 4274.7 270.47
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = dat)
##
## Coefficients:
## (Intercept) x1 x2 x3 x4
## 5.8945 -0.4779 0.1827 35.4028 5.8439
vif(mod7)
## x1 x2 x3 x4
## 1.111143 1.005040 1.005169 1.111589
summary(mod7)
##
## Call:
## lm(formula = y ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9958 -3.3092 -0.2419 3.3924 10.5668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.89453 4.32508 1.363 0.17828
## x1 -0.47790 0.34002 -1.406 0.16530
## x2 0.18271 0.01718 10.633 3.78e-15 ***
## x3 35.40284 11.09960 3.190 0.00232 **
## x4 5.84391 2.90978 2.008 0.04935 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.014 on 57 degrees of freedom
## Multiple R-squared: 0.6914, Adjusted R-squared: 0.6697
## F-statistic: 31.92 on 4 and 57 DF, p-value: 5.818e-14
#result:formula = y ~ x1 + x2 + x3 + x4, data = dat
#x1 is not significant, R sqaure = 0.69, Adjusted = 0.6697, p-value=5.818e-14
#d)Use backwards stepwise regression, which model is selected?
step(mod6,direction="backward")
## Start: AIC=199.73
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x3:x4 +
## x2:x4
##
##
## Step: AIC=199.73
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x3:x4 + x2:x4
##
##
## Step: AIC=199.73
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x4 + x3:x4 + x2:x4
##
## Df Sum of Sq RSS AIC
## - x3:x4 1 10.942 1173.4 198.31
## - x1:x4 1 20.682 1183.1 198.82
## <none> 1162.4 199.73
## - x1:x2 1 38.737 1201.2 199.76
## - x2:x4 1 227.751 1390.2 208.82
##
## Step: AIC=198.31
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x4 + x2:x4
##
## Df Sum of Sq RSS AIC
## - x1:x4 1 19.837 1193.2 197.35
## <none> 1173.4 198.31
## - x1:x2 1 38.709 1212.1 198.32
## - x2:x4 1 228.394 1401.8 207.34
## - x3 1 249.320 1422.7 208.26
##
## Step: AIC=197.35
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x2:x4
##
## Df Sum of Sq RSS AIC
## - x1:x2 1 32.307 1225.5 197.01
## <none> 1193.2 197.35
## - x2:x4 1 220.026 1413.2 205.84
## - x3 1 252.209 1445.4 207.24
##
## Step: AIC=197.01
## y ~ x1 + x2 + x3 + x4 + x2:x4
##
## Df Sum of Sq RSS AIC
## - x1 1 11.262 1236.8 195.57
## <none> 1225.5 197.01
## - x2:x4 1 207.286 1432.8 204.69
## - x3 1 248.430 1473.9 206.45
##
## Step: AIC=195.57
## y ~ x2 + x3 + x4 + x2:x4
##
## Df Sum of Sq RSS AIC
## <none> 1236.8 195.57
## - x3 1 243.60 1480.4 204.72
## - x2:x4 1 245.68 1482.4 204.81
##
## Call:
## lm(formula = y ~ x2 + x3 + x4 + x2:x4, data = dat)
##
## Coefficients:
## (Intercept) x2 x3 x4 x2:x4
## 1.5226 0.3806 34.5106 9.5247 -0.3047
#result: lm(formula = y ~ x2 + x3 + x4 + x2:x4, data = dat2)
mod62<-lm(y ~ x2 + x3 + x4 + x2:x4,data=dat)
summary(mod62)
##
## Call:
## lm(formula = y ~ x2 + x3 + x4 + x2:x4, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.959 -3.358 -1.131 3.040 11.646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.52261 4.03964 0.377 0.70763
## x2 0.38056 0.06084 6.255 5.47e-08 ***
## x3 34.51062 10.29961 3.351 0.00144 **
## x4 9.52471 2.96093 3.217 0.00214 **
## x2:x4 -0.30472 0.09056 -3.365 0.00137 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.658 on 57 degrees of freedom
## Multiple R-squared: 0.7336, Adjusted R-squared: 0.7149
## F-statistic: 39.24 on 4 and 57 DF, p-value: 9.297e-16
#Everything is significant, Multiple R-squared: 0.7336, Adjusted R-squared: 0.7149, p-value: 9.297e-16
#e)Do you find the same model in parts b, c, and d? Explain
# d gives us the best model (p-value, R square, Adjusted R square), only d gives us a significant model
#backwards stepwise regression is keeping only the significant parameters
modnew<-lm(y~ x2 + x3 + x4 + x2:x4,data=dat)
step(modnew, direction="forward")
## Start: AIC=195.57
## y ~ x2 + x3 + x4 + x2:x4
##
## Call:
## lm(formula = y ~ x2 + x3 + x4 + x2:x4, data = dat)
##
## Coefficients:
## (Intercept) x2 x3 x4 x2:x4
## 1.5226 0.3806 34.5106 9.5247 -0.3047
summary(modnew)
##
## Call:
## lm(formula = y ~ x2 + x3 + x4 + x2:x4, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.959 -3.358 -1.131 3.040 11.646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.52261 4.03964 0.377 0.70763
## x2 0.38056 0.06084 6.255 5.47e-08 ***
## x3 34.51062 10.29961 3.351 0.00144 **
## x4 9.52471 2.96093 3.217 0.00214 **
## x2:x4 -0.30472 0.09056 -3.365 0.00137 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.658 on 57 degrees of freedom
## Multiple R-squared: 0.7336, Adjusted R-squared: 0.7149
## F-statistic: 39.24 on 4 and 57 DF, p-value: 9.297e-16
#We think using backward regression x3 was found as significant, but in forward it was found as insignificant, still x2 and x4 and the interaction x2:x4 is significant in both regressions