library(readxl)
dat<-read_excel("data-table-B9.xlsx")
str(dat)
## tibble [62 × 5] (S3: tbl_df/tbl/data.frame)
## $ x1: num [1:62] 2.14 4.14 8.15 2.14 4.14 8.15 2.14 4.14 8.15 2.14 ...
## $ x2: num [1:62] 10 10 10 10 10 10 10 10 10 10 ...
## $ x3: num [1:62] 0.34 0.34 0.34 0.34 0.34 0.34 0.34 0.34 0.34 0.34 ...
## $ x4: num [1:62] 1 1 1 0.246 0.379 0.474 0.141 0.234 0.311 0.076 ...
## $ y : num [1:62] 28.9 31 26.4 27.2 26.1 23.2 19.7 22.1 22.8 29.2 ...
summary(dat)
## x1 x2 x3 x4
## Min. :2.140 Min. : 1.25 Min. :0.2500 Min. :0.0760
## 1st Qu.:4.140 1st Qu.: 2.63 1st Qu.:0.3400 1st Qu.:0.4733
## Median :4.300 Median : 10.00 Median :0.3400 Median :0.6160
## Mean :4.784 Mean : 22.25 Mean :0.3477 Mean :0.6028
## 3rd Qu.:5.600 3rd Qu.: 10.10 3rd Qu.:0.3400 3rd Qu.:0.7788
## Max. :8.150 Max. :112.00 Max. :0.5500 Max. :1.0000
## y
## Min. : 8.40
## 1st Qu.:16.65
## Median :21.75
## Mean :23.51
## 3rd Qu.:27.95
## Max. :48.60
full_mod <- lm(y ~ (x1 + x2 + x3 + x4)^2, data = dat)
summary(full_mod)
##
## Call:
## lm(formula = y ~ (x1 + x2 + x3 + x4)^2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4804 -3.0766 -0.6635 2.9625 12.2221
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.88376 23.17863 0.685 0.49616
## x1 0.18696 0.78447 0.238 0.81255
## x2 0.37921 0.06332 5.989 1.89e-07 ***
## x3 -11.99940 67.31148 -0.178 0.85919
## x4 -8.86442 35.62553 -0.249 0.80446
## x1:x2 0.01155 0.00869 1.329 0.18955
## x1:x3 NA NA NA NA
## x1:x4 -1.11525 1.14847 -0.971 0.33592
## x2:x3 NA NA NA NA
## x2:x4 -0.38547 0.11962 -3.222 0.00218 **
## x3:x4 72.85976 103.15353 0.706 0.48308
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.683 on 53 degrees of freedom
## Multiple R-squared: 0.7496, Adjusted R-squared: 0.7118
## F-statistic: 19.83 on 8 and 53 DF, p-value: 1.947e-13
summary(full_mod)$r.squared
## [1] 0.7496145
summary(full_mod)$fstatistic
## value numdf dendf
## 19.8342 8.0000 53.0000
Reject H0; at least one regression coefficient is significantly different from zero.
reduced_mod <- lm(y ~ x1 + x2 + x3 + x4, data = dat)
anova(reduced_mod, full_mod)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2 + x3 + x4
## Model 2: y ~ (x1 + x2 + x3 + x4)^2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 57 1432.8
## 2 53 1162.4 4 270.37 3.0819 0.02352 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(reduced_mod, full_mod)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2 + x3 + x4
## Model 2: y ~ (x1 + x2 + x3 + x4)^2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 57 1432.8
## 2 53 1162.4 4 270.37 3.0819 0.02352 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Fail to reject H0; none of the interactions are significant
library(MASS)
step_mod <- stepAIC(full_mod, direction = "backward")
## Start: AIC=199.73
## y ~ (x1 + x2 + x3 + x4)^2
##
##
## Step: AIC=199.73
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x4 + x3:x4
##
##
## Step: AIC=199.73
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x4 + x2:x4 + x3:x4
##
## Df Sum of Sq RSS AIC
## - x3:x4 1 10.942 1173.4 198.31
## - x1:x4 1 20.682 1183.1 198.82
## <none> 1162.4 199.73
## - x1:x2 1 38.737 1201.2 199.76
## - x2:x4 1 227.751 1390.2 208.82
##
## Step: AIC=198.31
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x4 + x2:x4
##
## Df Sum of Sq RSS AIC
## - x1:x4 1 19.837 1193.2 197.35
## <none> 1173.4 198.31
## - x1:x2 1 38.709 1212.1 198.32
## - x2:x4 1 228.394 1401.8 207.34
## - x3 1 249.320 1422.7 208.26
##
## Step: AIC=197.35
## y ~ x1 + x2 + x3 + x4 + x1:x2 + x2:x4
##
## Df Sum of Sq RSS AIC
## - x1:x2 1 32.307 1225.5 197.01
## <none> 1193.2 197.35
## - x2:x4 1 220.026 1413.2 205.84
## - x3 1 252.209 1445.4 207.24
##
## Step: AIC=197.01
## y ~ x1 + x2 + x3 + x4 + x2:x4
##
## Df Sum of Sq RSS AIC
## - x1 1 11.262 1236.8 195.57
## <none> 1225.5 197.01
## - x2:x4 1 207.286 1432.8 204.69
## - x3 1 248.430 1473.9 206.45
##
## Step: AIC=195.57
## y ~ x2 + x3 + x4 + x2:x4
##
## Df Sum of Sq RSS AIC
## <none> 1236.8 195.57
## - x3 1 243.60 1480.4 204.72
## - x2:x4 1 245.68 1482.4 204.81
summary(step_mod)
##
## Call:
## lm(formula = y ~ x2 + x3 + x4 + x2:x4, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.959 -3.358 -1.131 3.040 11.646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.52261 4.03964 0.377 0.70763
## x2 0.38056 0.06084 6.255 5.47e-08 ***
## x3 34.51062 10.29961 3.351 0.00144 **
## x4 9.52471 2.96093 3.217 0.00214 **
## x2:x4 -0.30472 0.09056 -3.365 0.00137 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.658 on 57 degrees of freedom
## Multiple R-squared: 0.7336, Adjusted R-squared: 0.7149
## F-statistic: 39.24 on 4 and 57 DF, p-value: 9.297e-16
summary(step_mod)$adj.r.squared
## [1] 0.714905
coef(step_mod)["x2:x4"]
## x2:x4
## -0.3047183
Omit x1 since it is not used in the prediction.
new1 <- data.frame(x2 = 10, x3 = 0.5, x4 = 0.75)
predict(step_mod, new1, interval = "confidence")
## fit lwr upr
## 1 27.44168 24.00618 30.87717
new2 <- data.frame(x2 = 3, x3 = 0.25, x4 = 0.85)
predict(step_mod, new2, interval = "prediction")
## fit lwr upr
## 1 18.61092 8.860183 28.36165
True because prediction intervals are always wider.
x1 was not statistically significant and removing it improved AIC.