Automated model selection is a controvertial way of choosing a model. If your subject matter knowledge can help you build a model, stick with it. If that is not the case, use selection method with care.
http://www.umass.edu/statdata/statdata/data/lowbwt.txt
SOURCE: Hosmer and Lemeshow (2000) Applied Logistic Regression: Second Edition. These data are copyrighted by John Wiley & Sons Inc. and must be acknowledged and used accordingly. Data were collected at Baystate Medical Center, Springfield, Massachusetts during 1986.
## Fix cases 10 and 39 to make them identical to Dr. Orav's dataset
lbw <- read.table("http://www.umass.edu/statdata/statdata/data/lowbwt.dat", head = T, skip = 4)
lbw[c(10,39),"BWT"] <- c(2655,3035)
## Change variable names to lower cases
names(lbw) <- tolower(names(lbw))
## Recoding
lbw <- within(lbw, {
## Relabel race
race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))
## Categorize ftv (frequency of visit)
ftv.cat <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
ftv.cat <- relevel(ftv.cat, ref = "Normal")
## Dichotomize ptl
preterm <- factor(ptl >= 1, levels = c(FALSE,TRUE), labels = c("0","1+"))
## Categorize smoke ht ui
smoke <- factor(smoke, levels = 0:1, labels = c("No","Yes"))
ht <- factor(ht, levels = 0:1, labels = c("No","Yes"))
ui <- factor(ui, levels = 0:1, labels = c("No","Yes"))
})
lm.full <- lm(bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, data = lbw)
lm.null <- lm(bwt ~ 1, data = lbw)
anova(lm.full, lm.null)
Analysis of Variance Table
Model 1: bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm
Model 2: bwt ~ 1
Res.Df RSS Df Sum of Sq F Pr(>F)
1 178 74475274
2 188 99927264 -10 -25451991 6.08 0.000000061 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Bacward elimination
lm.step.bw <- step(lm.full, direction = "backward")
Start: AIC=2457
bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm
Df Sum of Sq RSS AIC
- ftv.cat 2 383274 74858548 2454
- age 1 37981 74513254 2455
<none> 74475274 2457
- preterm 1 972851 75448125 2458
- lwt 1 2530480 77005754 2461
- smoke 1 3318310 77793584 2463
- ht 1 3352598 77827871 2463
- race.cat 2 5559887 80035161 2467
- ui 1 5424365 79899638 2468
Step: AIC=2454
bwt ~ age + lwt + smoke + ht + ui + race.cat + preterm
Df Sum of Sq RSS AIC
- age 1 27429 74885977 2452
<none> 74858548 2454
- preterm 1 928864 75787412 2454
- lwt 1 2443283 77301831 2458
- ht 1 3471379 78329927 2461
- smoke 1 3947315 78805863 2462
- race.cat 2 6090524 80949072 2465
- ui 1 5403600 80262148 2465
Step: AIC=2452
bwt ~ lwt + smoke + ht + ui + race.cat + preterm
Df Sum of Sq RSS AIC
<none> 74885977 2452
- preterm 1 1009276 75895254 2453
- lwt 1 2435622 77321599 2456
- ht 1 3447953 78333930 2459
- smoke 1 3923947 78809925 2460
- ui 1 5376170 80262148 2463
- race.cat 2 6251578 81137556 2463
lm.step.bw
Call:
lm(formula = bwt ~ lwt + smoke + ht + ui + race.cat + preterm,
data = lbw)
Coefficients:
(Intercept) lwt smokeYes htYes uiYes race.catBlack race.catOther
2869.59 4.06 -323.19 -574.30 -489.94 -466.71 -335.53
preterm1+
-208.49
## Forward selection
lm.step.fw <- step(lm.null, scope = ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, direction = "forward")
Start: AIC=2493
bwt ~ 1
Df Sum of Sq RSS AIC
+ ui 1 8032186 91895078 2479
+ preterm 1 4760280 95166985 2485
+ race.cat 2 5076973 94850291 2487
+ smoke 1 3566553 96360711 2488
+ lwt 1 3473052 96454213 2488
+ ht 1 2133121 97794143 2491
<none> 99927264 2493
+ ftv.cat 2 2091437 97835828 2493
+ age 1 812010 99115254 2493
Step: AIC=2479
bwt ~ ui
Df Sum of Sq RSS AIC
+ race.cat 2 4773411 87121667 2473
+ ht 1 3164060 88731018 2474
+ smoke 1 2943572 88951506 2475
+ preterm 1 2838829 89056249 2475
+ lwt 1 2095708 89799370 2476
<none> 91895078 2479
+ ftv.cat 2 1856011 90039067 2479
+ age 1 476194 91418884 2480
Step: AIC=2473
bwt ~ ui + race.cat
Df Sum of Sq RSS AIC
+ smoke 1 6072463 81049204 2461
+ ht 1 2690006 84431661 2469
+ preterm 1 2653197 84468471 2469
+ lwt 1 2254369 84867299 2470
<none> 87121667 2473
+ ftv.cat 2 1164466 85957202 2474
+ age 1 58269 87063398 2475
Step: AIC=2461
bwt ~ ui + race.cat + smoke
Df Sum of Sq RSS AIC
+ ht 1 2458993 78590212 2457
+ lwt 1 1562529 79486675 2459
+ preterm 1 1340665 79708539 2460
<none> 81049204 2461
+ age 1 235 81048969 2463
+ ftv.cat 2 334011 80715193 2464
Step: AIC=2457
bwt ~ ui + race.cat + smoke + ht
Df Sum of Sq RSS AIC
+ lwt 1 2694958 75895254 2453
+ preterm 1 1268612 77321599 2456
<none> 78590212 2457
+ age 1 728 78589484 2459
+ ftv.cat 2 246592 78343620 2461
Step: AIC=2453
bwt ~ ui + race.cat + smoke + ht + lwt
Df Sum of Sq RSS AIC
+ preterm 1 1009276 74885977 2452
<none> 75895254 2453
+ age 1 107841 75787412 2454
+ ftv.cat 2 327846 75567407 2456
Step: AIC=2452
bwt ~ ui + race.cat + smoke + ht + lwt + preterm
Df Sum of Sq RSS AIC
<none> 74885977 2452
+ age 1 27429 74858548 2454
+ ftv.cat 2 372723 74513254 2455
lm.step.fw
Call:
lm(formula = bwt ~ ui + race.cat + smoke + ht + lwt + preterm,
data = lbw)
Coefficients:
(Intercept) uiYes race.catBlack race.catOther smokeYes htYes lwt
2869.59 -489.94 -466.71 -335.53 -323.19 -574.30 4.06
preterm1+
-208.49
## Forward selection with elminination
lm.step.both <- step(lm.null, scope = ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, direction = "both")
Start: AIC=2493
bwt ~ 1
Df Sum of Sq RSS AIC
+ ui 1 8032186 91895078 2479
+ preterm 1 4760280 95166985 2485
+ race.cat 2 5076973 94850291 2487
+ smoke 1 3566553 96360711 2488
+ lwt 1 3473052 96454213 2488
+ ht 1 2133121 97794143 2491
<none> 99927264 2493
+ ftv.cat 2 2091437 97835828 2493
+ age 1 812010 99115254 2493
Step: AIC=2479
bwt ~ ui
Df Sum of Sq RSS AIC
+ race.cat 2 4773411 87121667 2473
+ ht 1 3164060 88731018 2474
+ smoke 1 2943572 88951506 2475
+ preterm 1 2838829 89056249 2475
+ lwt 1 2095708 89799370 2476
<none> 91895078 2479
+ ftv.cat 2 1856011 90039067 2479
+ age 1 476194 91418884 2480
- ui 1 8032186 99927264 2493
Step: AIC=2473
bwt ~ ui + race.cat
Df Sum of Sq RSS AIC
+ smoke 1 6072463 81049204 2461
+ ht 1 2690006 84431661 2469
+ preterm 1 2653197 84468471 2469
+ lwt 1 2254369 84867299 2470
<none> 87121667 2473
+ ftv.cat 2 1164466 85957202 2474
+ age 1 58269 87063398 2475
- race.cat 2 4773411 91895078 2479
- ui 1 7728624 94850291 2487
Step: AIC=2461
bwt ~ ui + race.cat + smoke
Df Sum of Sq RSS AIC
+ ht 1 2458993 78590212 2457
+ lwt 1 1562529 79486675 2459
+ preterm 1 1340665 79708539 2460
<none> 81049204 2461
+ age 1 235 81048969 2463
+ ftv.cat 2 334011 80715193 2464
- smoke 1 6072463 87121667 2473
- ui 1 6537820 87587024 2474
- race.cat 2 7902302 88951506 2475
Step: AIC=2457
bwt ~ ui + race.cat + smoke + ht
Df Sum of Sq RSS AIC
+ lwt 1 2694958 75895254 2453
+ preterm 1 1268612 77321599 2456
<none> 78590212 2457
+ age 1 728 78589484 2459
+ ftv.cat 2 246592 78343620 2461
- ht 1 2458993 81049204 2461
- smoke 1 5841450 84431661 2469
- race.cat 2 7318764 85908976 2470
- ui 1 7358579 85948791 2472
Step: AIC=2453
bwt ~ ui + race.cat + smoke + ht + lwt
Df Sum of Sq RSS AIC
+ preterm 1 1009276 74885977 2452
<none> 75895254 2453
+ age 1 107841 75787412 2454
+ ftv.cat 2 327846 75567407 2456
- lwt 1 2694958 78590212 2457
- ht 1 3591422 79486675 2459
- smoke 1 4900939 80796192 2463
- race.cat 2 6677141 82572394 2465
- ui 1 6326702 82221956 2466
Step: AIC=2452
bwt ~ ui + race.cat + smoke + ht + lwt + preterm
Df Sum of Sq RSS AIC
<none> 74885977 2452
- preterm 1 1009276 75895254 2453
+ age 1 27429 74858548 2454
+ ftv.cat 2 372723 74513254 2455
- lwt 1 2435622 77321599 2456
- ht 1 3447953 78333930 2459
- smoke 1 3923947 78809925 2460
- ui 1 5376170 80262148 2463
- race.cat 2 6251578 81137556 2463
lm.step.both
Call:
lm(formula = bwt ~ ui + race.cat + smoke + ht + lwt + preterm,
data = lbw)
Coefficients:
(Intercept) uiYes race.catBlack race.catOther smokeYes htYes lwt
2869.59 -489.94 -466.71 -335.53 -323.19 -574.30 4.06
preterm1+
-208.49
## age is the least significant by partial F test
drop1(lm.full, test = "F")
Single term deletions
Model:
bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 74475274 2457
age 1 37981 74513254 2455 0.09 0.76354
lwt 1 2530480 77005754 2461 6.05 0.01488 *
smoke 1 3318310 77793584 2463 7.93 0.00541 **
ht 1 3352598 77827871 2463 8.01 0.00518 **
ui 1 5424365 79899638 2468 12.96 0.00041 ***
ftv.cat 2 383274 74858548 2454 0.46 0.63328
race.cat 2 5559887 80035161 2467 6.64 0.00165 **
preterm 1 972851 75448125 2458 2.33 0.12907
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## After elimination, ftv.cat is the least significant
drop1(update(lm.full, ~ . -age), test = "F")
Single term deletions
Model:
bwt ~ lwt + smoke + ht + ui + ftv.cat + race.cat + preterm
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 74513254 2455
lwt 1 2505763 77019017 2459 6.02 0.01511 *
smoke 1 3284416 77797670 2461 7.89 0.00552 **
ht 1 3329500 77842755 2461 8.00 0.00522 **
ui 1 5389465 79902719 2466 12.95 0.00041 ***
ftv.cat 2 372723 74885977 2452 0.45 0.63982
race.cat 2 5607507 80120762 2465 6.74 0.00151 **
preterm 1 1054153 75567407 2456 2.53 0.11330
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## After elimination, preterm is least significat at p = 0.12.
drop1(update(lm.full, ~ . -age -ftv.cat), test = "F")
Single term deletions
Model:
bwt ~ lwt + smoke + ht + ui + race.cat + preterm
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 74885977 2452
lwt 1 2435622 77321599 2456 5.89 0.01623 *
smoke 1 3923947 78809925 2460 9.48 0.00240 **
ht 1 3447953 78333930 2459 8.33 0.00437 **
ui 1 5376170 80262148 2463 12.99 0.00040 ***
race.cat 2 6251578 81137556 2463 7.56 0.00071 ***
preterm 1 1009276 75895254 2453 2.44 0.12007
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## After elimination, all variables are significant at p < 0.1
drop1(update(lm.full, ~ . -age -ftv.cat -preterm), test = "F")
Single term deletions
Model:
bwt ~ lwt + smoke + ht + ui + race.cat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 75895254 2453
lwt 1 2694958 78590212 2457 6.46 0.01185 *
smoke 1 4900939 80796192 2463 11.75 0.00075 ***
ht 1 3591422 79486675 2459 8.61 0.00377 **
ui 1 6326702 82221956 2466 15.17 0.00014 ***
race.cat 2 6677141 82572394 2465 8.01 0.00047 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Show summary for final model
summary(update(lm.full, ~ . -age -ftv.cat -preterm))
Call:
lm(formula = bwt ~ lwt + smoke + ht + ui + race.cat, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1843 -434 67 461 1631
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2835.23 243.61 11.64 < 2e-16 ***
lwt 4.26 1.67 2.54 0.01185 *
smokeYes -354.53 103.41 -3.43 0.00075 ***
htYes -585.73 199.59 -2.93 0.00377 **
uiYes -524.43 134.64 -3.90 0.00014 ***
race.catBlack -476.19 145.56 -3.27 0.00128 **
race.catOther -349.84 112.33 -3.11 0.00214 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 646 on 182 degrees of freedom
Multiple R-squared: 0.24, Adjusted R-squared: 0.215
F-statistic: 9.6 on 6 and 182 DF, p-value: 0.00000000356
## ui is the most significant variable
add1(lm.null, scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ 1
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 99927264 2493
age 1 812010 99115254 2493 1.53 0.2174
lwt 1 3473052 96454213 2488 6.73 0.0102 *
race.cat 2 5076973 94850291 2487 4.98 0.0078 **
smoke 1 3566553 96360711 2488 6.92 0.0092 **
preterm 1 4760280 95166985 2485 9.35 0.0026 **
ht 1 2133121 97794143 2491 4.08 0.0448 *
ui 1 8032186 91895078 2479 16.34 0.000077 ***
ftv.cat 2 2091437 97835828 2493 1.99 0.1399
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## After inclusion, race.cat is the most significant
add1(update(lm.null, ~ . +ui), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 91895078 2479
age 1 476194 91418884 2480 0.97 0.3262
lwt 1 2095708 89799370 2476 4.34 0.0386 *
race.cat 2 4773411 87121667 2473 5.07 0.0072 **
smoke 1 2943572 88951506 2475 6.16 0.0140 *
preterm 1 2838829 89056249 2475 5.93 0.0158 *
ht 1 3164060 88731018 2474 6.63 0.0108 *
ftv.cat 2 1856011 90039067 2479 1.91 0.1515
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## After inclusion, smoke is the most significant
add1(update(lm.null, ~ . +ui +race.cat), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui + race.cat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 87121667 2473
age 1 58269 87063398 2475 0.12 0.72605
lwt 1 2254369 84867299 2470 4.89 0.02828 *
smoke 1 6072463 81049204 2461 13.79 0.00027 ***
preterm 1 2653197 84468471 2469 5.78 0.01721 *
ht 1 2690006 84431661 2469 5.86 0.01644 *
ftv.cat 2 1164466 85957202 2474 1.24 0.29193
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## After inclusion, ht is the most significant
add1(update(lm.null, ~ . +ui +race.cat +smoke), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui + race.cat + smoke
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 81049204 2461
age 1 235 81048969 2463 0.00 0.982
lwt 1 1562529 79486675 2459 3.60 0.059 .
preterm 1 1340665 79708539 2460 3.08 0.081 .
ht 1 2458993 78590212 2457 5.73 0.018 *
ftv.cat 2 334011 80715193 2464 0.38 0.687
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## After inclusion, lwt is the most significant
add1(update(lm.null, ~ . +ui +race.cat +smoke +ht), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui + race.cat + smoke + ht
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 78590212 2457
age 1 728 78589484 2459 0.00 0.967
lwt 1 2694958 75895254 2453 6.46 0.012 *
preterm 1 1268612 77321599 2456 2.99 0.086 .
ftv.cat 2 246592 78343620 2461 0.28 0.752
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## After inclusion, no variable is significant at p < 0.1
add1(update(lm.null, ~ . +ui +race.cat +smoke +ht +lwt), scope = ~ age + lwt + race.cat + smoke + preterm + ht + ui + ftv.cat, test = "F")
Single term additions
Model:
bwt ~ ui + race.cat + smoke + ht + lwt
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 75895254 2453
age 1 107841 75787412 2454 0.26 0.61
preterm 1 1009276 74885977 2452 2.44 0.12
ftv.cat 2 327846 75567407 2456 0.39 0.68
## Show summary for final model
summary(update(lm.null, ~ . +ui +race.cat +smoke +ht +lwt))
Call:
lm(formula = bwt ~ ui + race.cat + smoke + ht + lwt, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1843 -434 67 461 1631
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2835.23 243.61 11.64 < 2e-16 ***
uiYes -524.43 134.64 -3.90 0.00014 ***
race.catBlack -476.19 145.56 -3.27 0.00128 **
race.catOther -349.84 112.33 -3.11 0.00214 **
smokeYes -354.53 103.41 -3.43 0.00075 ***
htYes -585.73 199.59 -2.93 0.00377 **
lwt 4.26 1.67 2.54 0.01185 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 646 on 182 degrees of freedom
Multiple R-squared: 0.24, Adjusted R-squared: 0.215
F-statistic: 9.6 on 6 and 182 DF, p-value: 0.00000000356
Regression subset selection including exhaustive search. This is only for linear regression.
Reference: http://www.statmethods.net/stats/regression.html
Perform all subset regression, and choose “nbest” model(s) for each number of predictors up to nvmax.
The result shows how it was performed. The best model in the 10-variable case includes all variables, as that is the only way to have 10 variables.
library(leaps)
regsubsets.out <-
regsubsets(bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm,
data = lbw,
nbest = 1, # 1 best model for each number of predictors
nvmax = NULL, # NULL for no limit on number of variables
force.in = NULL, force.out = NULL,
method = "exhaustive")
(summary.out <- summary(regsubsets.out))
Subset selection object
Call: regsubsets.formula(bwt ~ age + lwt + smoke + ht + ui + ftv.cat +
race.cat + preterm, data = lbw, nbest = 1, nvmax = NULL,
force.in = NULL, force.out = NULL, method = "exhaustive")
10 Variables (and intercept)
Forced in Forced out
age FALSE FALSE
lwt FALSE FALSE
smokeYes FALSE FALSE
htYes FALSE FALSE
uiYes FALSE FALSE
ftv.catNone FALSE FALSE
ftv.catMany FALSE FALSE
race.catBlack FALSE FALSE
race.catOther FALSE FALSE
preterm1+ FALSE FALSE
1 subsets of each size up to 10
Selection Algorithm: exhaustive
age lwt smokeYes htYes uiYes ftv.catNone ftv.catMany race.catBlack race.catOther preterm1+
1 ( 1 ) " " " " " " " " "*" " " " " " " " " " "
2 ( 1 ) " " " " " " "*" "*" " " " " " " " " " "
3 ( 1 ) " " "*" " " "*" "*" " " " " " " " " " "
4 ( 1 ) " " " " "*" " " "*" " " " " "*" "*" " "
5 ( 1 ) " " " " "*" "*" "*" " " " " "*" "*" " "
6 ( 1 ) " " "*" "*" "*" "*" " " " " "*" "*" " "
7 ( 1 ) " " "*" "*" "*" "*" " " " " "*" "*" "*"
8 ( 1 ) " " "*" "*" "*" "*" " " "*" "*" "*" "*"
9 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
Graphical table of best subsets (plot.regsubsets)
By adjusted \( R^2 \), the best model includes lwt, race.cat, preterm, ht, and ui (variables that have black boxes at the higest Y-axis value).
## Adjusted R2
plot(regsubsets.out, scale = "adjr2", main = "Adjusted R^2")
Plot Output from regsubsets Function in leaps package
This is just another way of presenting the same information for adjusted \( R^2 \). The model with 7 variables (counting dummy variables seprately) has the highest adjusted \( R^2 \).
Mallow Cp is used to decide on the number of predictors to include. The stopping rule is to start with the smallest model and gradually increase number of variables, and stop when Mallow Cp is approximately (number of regressors + 1, broken line) for the first time. In this case, the model with 6 regressors is the first one to achieve such a condition.
library(car)
layout(matrix(1:2, ncol = 2))
## Adjusted R2
res.legend <-
subsets(regsubsets.out, statistic="adjr2", legend = FALSE, min.size = 5, main = "Adjusted R^2")
## Mallow Cp
res.legend <-
subsets(regsubsets.out, statistic="cp", legend = FALSE, min.size = 5, main = "Mallow Cp")
abline(a = 1, b = 1, lty = 2)
res.legend
Abbreviation
age a
lwt l
smokeYes s
htYes h
uiYes u
ftv.catNone f.N
ftv.catMany f.M
race.catBlack r.B
race.catOther r.O
preterm1+ p
See which model has the highest adjusted R2
The model with 7 variables (counting dummy variables separately) has the highest adjusted \( R^2 \). Variables marked with TRUE are the ones chosen.
which.max(summary.out$adjr2)
[1] 7
summary.out$which[7,]
(Intercept) age lwt smokeYes htYes uiYes ftv.catNone ftv.catMany
TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
race.catBlack race.catOther preterm1+
TRUE TRUE TRUE
Do regression with the best model
Somehow I had to recreate the best model from the output above.
best.model <- lm(bwt ~ lwt + race.cat + smoke + preterm + ht + ui, data = lbw)
summary(best.model)
Call:
lm(formula = bwt ~ lwt + race.cat + smoke + preterm + ht + ui,
data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1886.4 -440.8 53.6 494.4 1621.1
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2869.59 243.65 11.78 <2e-16 ***
lwt 4.06 1.67 2.43 0.0162 *
race.catBlack -466.71 145.12 -3.22 0.0015 **
race.catOther -335.53 112.26 -2.99 0.0032 **
smokeYes -323.19 104.94 -3.08 0.0024 **
preterm1+ -208.49 133.49 -1.56 0.1201
htYes -574.30 198.94 -2.89 0.0044 **
uiYes -489.94 135.91 -3.60 0.0004 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 643 on 181 degrees of freedom
Multiple R-squared: 0.251, Adjusted R-squared: 0.222
F-statistic: 8.65 on 7 and 181 DF, p-value: 0.00000000385