#
# CRP 245 Part 2 Day 6-7 Model Selection
# Example demonstrating forward and backward model selection
#
# Pima Diabetes Data Set:
# A population of women who were at least 21 years old, of Pima Indian heritage and
# living near Phoenix, Arizona, was tested for diabetes according to World Health
# Organization criteria. The data were collected by the US National Institute of
# Diabetes and Digestive and Kidney Diseases. The data is comprised of 200 complete
# records after dropping the (mainly missing) data on serum insulin.
# Data Set: pima
# Data Dictionary:
# (1) npreg number of pregnancies
# (2) glu plasma glucose concentration in an oral glucose tolerance test (mg/dL)
# (3) bp diastolic blood pressure (mm Hg)
# (4) skin triceps skin fold thickness (mm)
# (5) bmi body mass index (weight in kg/(height in m)^2)
# (6) ped diabetes pedigree function
# (7) age age in years
# (8) type diabetic status according to WHO criteria (1 = diabetic; 0 = not diabetic)
# Download and load the R data file:
load(url("http://www.duke.edu/~sgrambow/crp245data/pima.Rdata"))
# reduce decimal printout for this code
options(digits=2)
#
# ~ Forward Selection Using Alpha-Criterion of 0.5 ~
#
# - Manually perform variable selection
# Create a full and null logistic models
glm.full <- glm(type ~ glu + age + ped + bmi + bp +skin , data=pima, family='binomial')
glm.null <- glm(type ~ 1, data = pima, family = 'binomial')
# run all single variable models and pick variable
# with smallest p-value. glu is first variable in model
# STEP 1:
add1(glm.null, scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
##
## Model:
## type ~ 1
## Df Deviance AIC LRT Pr(>Chi)
## <none> 256 258
## glu 1 207 211 49.0 2.5e-12 ***
## bp 1 248 252 8.9 0.00291 **
## skin 1 245 249 11.7 0.00062 ***
## bmi 1 240 244 16.4 5.0e-05 ***
## ped 1 248 252 8.3 0.00396 **
## age 1 230 234 26.5 2.7e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# now run 2nd step and examine all 2 variable
# models that include glu
# STEP 2:
add1(update(glm.null, ~ .+glu),scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
##
## Model:
## type ~ glu
## Df Deviance AIC LRT Pr(>Chi)
## <none> 207 211
## bp 1 206 212 1.47 0.2254
## skin 1 202 208 5.11 0.0238 *
## bmi 1 198 204 8.90 0.0028 **
## ped 1 199 205 8.11 0.0044 **
## age 1 197 203 10.27 0.0014 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 2nd variable with smallest p-value is age
# now run 3rd step and examine all 3 variable models
# STEP 3:
add1(update(glm.null, ~ . + glu+age),scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
##
## Model:
## type ~ glu + age
## Df Deviance AIC LRT Pr(>Chi)
## <none> 197 203
## bp 1 197 205 0.01 0.9257
## skin 1 194 202 3.62 0.0571 .
## bmi 1 188 196 8.71 0.0032 **
## ped 1 187 195 10.01 0.0016 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 3rd variable with smallest p-value is ped
# now run 3rd step and examine all 4 variable models
# STEP 4:
add1(update(glm.null, ~ . + glu+age+ped),scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
##
## Model:
## type ~ glu + age + ped
## Df Deviance AIC LRT Pr(>Chi)
## <none> 187 195
## bp 1 187 197 0.03 0.855
## skin 1 185 195 2.46 0.117
## bmi 1 181 191 6.02 0.014 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 4th variable with smallest p-value is bmi
# now run 4th step and examine all 5 variable models
# STEP 5:
add1(update(glm.null, ~ . + glu+age+ped+bmi),scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
##
## Model:
## type ~ glu + age + ped + bmi
## Df Deviance AIC LRT Pr(>Chi)
## <none> 181 191
## bp 1 181 193 0.0805 0.78
## skin 1 181 193 0.0193 0.89
# STOP ADDING VARIABLES
# No more variables significant at p-value < .5
# final model is:
final.forward.fit <- glm(type ~ glu + age + ped + bmi, data=pima, family='binomial')
summary(final.forward.fit)
##
## Call:
## glm(formula = type ~ glu + age + ped + bmi, family = "binomial",
## data = pima)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.086 -0.673 -0.369 0.682 2.209
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.97139 1.52759 -6.53 6.7e-11 ***
## glu 0.03126 0.00663 4.72 2.4e-06 ***
## age 0.05860 0.01757 3.33 0.00085 ***
## ped 1.71979 0.65609 2.62 0.00876 **
## bmi 0.07703 0.03225 2.39 0.01692 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 256.41 on 199 degrees of freedom
## Residual deviance: 181.08 on 195 degrees of freedom
## AIC: 191.1
##
## Number of Fisher Scoring iterations: 5
# - Automate variable selection using the step() function
# -- Note: To perform forward selection using an alpha-to-entery criterion of 0.5,
# must set k in the step() function call equal to the 50th percentile of a
# chi-square(1) distribution ---> qchisq(0.5,1,lower.tail=FALSE) = 0.4549364
# easy to change to AIC step criterion -- change k = 2
# - Perform forward stepwise selection using the step() function
# glm.null is the null model from above
# scope is the full model we would consider with all variables
forward.fit <- step(glm.null,direction="forward",test="LRT",
scope= ~ glu + age + ped + bmi + bp +skin,k=0.45)
## Start: AIC=257
## type ~ 1
##
## Df Deviance AIC LRT Pr(>Chi)
## + glu 1 207 208 49.0 2.5e-12 ***
## + age 1 230 231 26.5 2.7e-07 ***
## + bmi 1 240 241 16.4 5.0e-05 ***
## + skin 1 245 246 11.7 0.00062 ***
## + bp 1 248 248 8.9 0.00291 **
## + ped 1 248 249 8.3 0.00396 **
## <none> 256 257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=208
## type ~ glu
##
## Df Deviance AIC LRT Pr(>Chi)
## + age 1 197 198 10.27 0.0014 **
## + bmi 1 198 200 8.90 0.0028 **
## + ped 1 199 201 8.11 0.0044 **
## + skin 1 202 204 5.11 0.0238 *
## + bp 1 206 207 1.47 0.2254
## <none> 207 208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=198
## type ~ glu + age
##
## Df Deviance AIC LRT Pr(>Chi)
## + ped 1 187 189 10.01 0.0016 **
## + bmi 1 188 190 8.71 0.0032 **
## + skin 1 194 195 3.62 0.0571 .
## <none> 197 198
## + bp 1 197 199 0.01 0.9257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=189
## type ~ glu + age + ped
##
## Df Deviance AIC LRT Pr(>Chi)
## + bmi 1 181 183 6.02 0.014 *
## + skin 1 185 187 2.46 0.117
## <none> 187 189
## + bp 1 187 189 0.03 0.855
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=183
## type ~ glu + age + ped + bmi
##
## Df Deviance AIC LRT Pr(>Chi)
## <none> 181 183
## + bp 1 181 184 0.0805 0.78
## + skin 1 181 184 0.0193 0.89
summary(forward.fit)
##
## Call:
## glm(formula = type ~ glu + age + ped + bmi, family = "binomial",
## data = pima)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.086 -0.673 -0.369 0.682 2.209
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.97139 1.52759 -6.53 6.7e-11 ***
## glu 0.03126 0.00663 4.72 2.4e-06 ***
## age 0.05860 0.01757 3.33 0.00085 ***
## ped 1.71979 0.65609 2.62 0.00876 **
## bmi 0.07703 0.03225 2.39 0.01692 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 256.41 on 199 degrees of freedom
## Residual deviance: 181.08 on 195 degrees of freedom
## AIC: 191.1
##
## Number of Fisher Scoring iterations: 5
#
# ~ Backward Selection Using Alpha-Criterion of 0.5 ~
#
# - Manually perform variable selection
# remove variable with largest p-value
# STEP 1:
drop1(glm.full, test = "LRT")
## Single term deletions
##
## Model:
## type ~ glu + age + ped + bmi + bp + skin
## Df Deviance AIC LRT Pr(>Chi)
## <none> 181 195
## glu 1 208 220 26.74 2.3e-07 ***
## age 1 192 204 10.92 0.00095 ***
## ped 1 188 200 7.28 0.00698 **
## bmi 1 185 197 3.62 0.05726 .
## bp 1 181 193 0.07 0.78909
## skin 1 181 193 0.01 0.91861
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# drop skin (p=.92)
# STEP 2:
drop1(update(glm.full, ~ . -skin), test = "LRT")
## Single term deletions
##
## Model:
## type ~ glu + age + ped + bmi + bp
## Df Deviance AIC LRT Pr(>Chi)
## <none> 181 193
## glu 1 208 218 26.73 2.3e-07 ***
## age 1 192 202 10.96 0.00093 ***
## ped 1 188 198 7.27 0.00702 **
## bmi 1 187 197 6.06 0.01380 *
## bp 1 181 191 0.08 0.77668
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# drop bp (p=0.78)
# STEP 3:
drop1(update(glm.full, ~ . -skin -bp), test = "LRT")
## Single term deletions
##
## Model:
## type ~ glu + age + ped + bmi
## Df Deviance AIC LRT Pr(>Chi)
## <none> 181 191
## glu 1 208 216 26.93 2.1e-07 ***
## age 1 193 201 11.78 0.0006 ***
## ped 1 188 196 7.31 0.0069 **
## bmi 1 187 195 6.02 0.0142 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ALL remaining significant at 0.5 so stop
# final model is:
final.backward.fit <- glm(type ~ glu + age + ped + bmi, data=pima, family='binomial')
summary(final.backward.fit)
##
## Call:
## glm(formula = type ~ glu + age + ped + bmi, family = "binomial",
## data = pima)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.086 -0.673 -0.369 0.682 2.209
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.97139 1.52759 -6.53 6.7e-11 ***
## glu 0.03126 0.00663 4.72 2.4e-06 ***
## age 0.05860 0.01757 3.33 0.00085 ***
## ped 1.71979 0.65609 2.62 0.00876 **
## bmi 0.07703 0.03225 2.39 0.01692 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 256.41 on 199 degrees of freedom
## Residual deviance: 181.08 on 195 degrees of freedom
## AIC: 191.1
##
## Number of Fisher Scoring iterations: 5
# - Automate variable selection Using the step() function
# -- Note: To perform forward selection using an alpha-to-stay criterion of 0.5,
# must set k in the step() function call equal to the 50th percentile of a
# chi-square(1) distribution ---> qchisq(0.5,1,lower.tail=FALSE) = 0.4549364
# - Define the "full" model with all potential covariates
full <- glm(type ~ glu + bp + skin + bmi + ped + age, data=pima, family='binomial')
# - Perform forward stepwise selection using the step() function
backward.fit <- step(full,direction="backward",test="LRT",k=0.45)
## Start: AIC=184
## type ~ glu + bp + skin + bmi + ped + age
##
## Df Deviance AIC LRT Pr(>Chi)
## - skin 1 181 184 0.01 0.91861
## - bp 1 181 184 0.07 0.78909
## <none> 181 184
## - bmi 1 185 187 3.62 0.05726 .
## - ped 1 188 191 7.28 0.00698 **
## - age 1 192 195 10.92 0.00095 ***
## - glu 1 208 210 26.74 2.3e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=184
## type ~ glu + bp + bmi + ped + age
##
## Df Deviance AIC LRT Pr(>Chi)
## - bp 1 181 183 0.08 0.77668
## <none> 181 184
## - bmi 1 187 189 6.06 0.01380 *
## - ped 1 188 190 7.27 0.00702 **
## - age 1 192 194 10.96 0.00093 ***
## - glu 1 208 210 26.73 2.3e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=183
## type ~ glu + bmi + ped + age
##
## Df Deviance AIC LRT Pr(>Chi)
## <none> 181 183
## - bmi 1 187 189 6.02 0.0142 *
## - ped 1 188 190 7.31 0.0069 **
## - age 1 193 195 11.78 0.0006 ***
## - glu 1 208 210 26.93 2.1e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(backward.fit)
##
## Call:
## glm(formula = type ~ glu + bmi + ped + age, family = "binomial",
## data = pima)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.086 -0.673 -0.369 0.682 2.209
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.97139 1.52759 -6.53 6.7e-11 ***
## glu 0.03126 0.00663 4.72 2.4e-06 ***
## bmi 0.07703 0.03225 2.39 0.01692 *
## ped 1.71979 0.65609 2.62 0.00876 **
## age 0.05860 0.01757 3.33 0.00085 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 256.41 on 199 degrees of freedom
## Residual deviance: 181.08 on 195 degrees of freedom
## AIC: 191.1
##
## Number of Fisher Scoring iterations: 5
# End of Porgram