#
#  CRP 245 Part 2 Day 6-7 Model Selection
# Example demonstrating forward and backward model selection
#
# Pima Diabetes Data Set:
# A population of women who were at least 21 years old, of Pima Indian heritage and 
# living near Phoenix, Arizona, was tested for diabetes according to World Health
# Organization criteria. The data were collected by the US National Institute of 
# Diabetes and Digestive and Kidney Diseases. The data is comprised of 200 complete 
# records after dropping the (mainly missing) data on serum insulin.

# Data Set: pima

# Data Dictionary: 
# (1)   npreg       number of pregnancies
# (2)   glu       plasma glucose concentration in an oral glucose tolerance test (mg/dL)
# (3)   bp        diastolic blood pressure (mm Hg)
# (4)   skin        triceps skin fold thickness (mm)
# (5)   bmi       body mass index (weight in kg/(height in m)^2)
# (6)   ped     diabetes pedigree function
# (7)   age       age in years
# (8)   type        diabetic status according to WHO criteria (1 = diabetic; 0 = not diabetic)

# Download and load the R data file:
load(url("http://www.duke.edu/~sgrambow/crp245data/pima.Rdata"))


# reduce decimal printout for this code
options(digits=2)

#
# ~ Forward Selection Using Alpha-Criterion of 0.5 ~
#

# - Manually perform variable selection 

# Create a full and null logistic models
glm.full <- glm(type ~ glu + age + ped + bmi + bp +skin  , data=pima, family='binomial')
glm.null <- glm(type ~ 1, data = pima, family = 'binomial')

# run all single variable models and pick variable
# with smallest p-value. glu is first variable in model
# STEP 1: 
add1(glm.null, scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
## 
## Model:
## type ~ 1
##        Df Deviance AIC  LRT Pr(>Chi)    
## <none>         256 258                  
## glu     1      207 211 49.0  2.5e-12 ***
## bp      1      248 252  8.9  0.00291 ** 
## skin    1      245 249 11.7  0.00062 ***
## bmi     1      240 244 16.4  5.0e-05 ***
## ped     1      248 252  8.3  0.00396 ** 
## age     1      230 234 26.5  2.7e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# now run 2nd step and examine all 2 variable
# models that include glu

# STEP 2: 
add1(update(glm.null, ~ .+glu),scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
## 
## Model:
## type ~ glu
##        Df Deviance AIC   LRT Pr(>Chi)   
## <none>         207 211                  
## bp      1      206 212  1.47   0.2254   
## skin    1      202 208  5.11   0.0238 * 
## bmi     1      198 204  8.90   0.0028 **
## ped     1      199 205  8.11   0.0044 **
## age     1      197 203 10.27   0.0014 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 2nd variable with smallest p-value is age
# now run 3rd step and examine all 3 variable models

# STEP 3: 
add1(update(glm.null, ~ . + glu+age),scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
## 
## Model:
## type ~ glu + age
##        Df Deviance AIC   LRT Pr(>Chi)   
## <none>         197 203                  
## bp      1      197 205  0.01   0.9257   
## skin    1      194 202  3.62   0.0571 . 
## bmi     1      188 196  8.71   0.0032 **
## ped     1      187 195 10.01   0.0016 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 3rd variable with smallest p-value is ped
# now run 3rd step and examine all 4 variable models

# STEP 4: 
add1(update(glm.null, ~ . + glu+age+ped),scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
## 
## Model:
## type ~ glu + age + ped
##        Df Deviance AIC  LRT Pr(>Chi)  
## <none>         187 195                
## bp      1      187 197 0.03    0.855  
## skin    1      185 195 2.46    0.117  
## bmi     1      181 191 6.02    0.014 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 4th variable with smallest p-value is bmi
# now run 4th step and examine all 5 variable models

# STEP 5: 
add1(update(glm.null, ~ . + glu+age+ped+bmi),scope = ~ glu + bp + skin + bmi + ped + age, test = "LRT")
## Single term additions
## 
## Model:
## type ~ glu + age + ped + bmi
##        Df Deviance AIC    LRT Pr(>Chi)
## <none>         181 191                
## bp      1      181 193 0.0805     0.78
## skin    1      181 193 0.0193     0.89
# STOP ADDING VARIABLES  
# No more variables significant at p-value < .5
# final model is:
final.forward.fit <- glm(type ~ glu + age + ped + bmi, data=pima, family='binomial')
summary(final.forward.fit)
## 
## Call:
## glm(formula = type ~ glu + age + ped + bmi, family = "binomial", 
##     data = pima)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.086  -0.673  -0.369   0.682   2.209  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.97139    1.52759   -6.53  6.7e-11 ***
## glu          0.03126    0.00663    4.72  2.4e-06 ***
## age          0.05860    0.01757    3.33  0.00085 ***
## ped          1.71979    0.65609    2.62  0.00876 ** 
## bmi          0.07703    0.03225    2.39  0.01692 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 256.41  on 199  degrees of freedom
## Residual deviance: 181.08  on 195  degrees of freedom
## AIC: 191.1
## 
## Number of Fisher Scoring iterations: 5
# - Automate variable selection using the step() function
# -- Note: To perform forward selection using an alpha-to-entery criterion of 0.5, 
#          must set k in the step() function call equal to the 50th percentile of a 
#          chi-square(1) distribution ---> qchisq(0.5,1,lower.tail=FALSE) = 0.4549364
# easy to change to AIC step criterion -- change k = 2
# - Perform forward stepwise selection using the step() function
# glm.null is the null model from above
# scope is the full model we would consider with all variables
forward.fit <- step(glm.null,direction="forward",test="LRT",
                    scope= ~ glu + age + ped + bmi + bp +skin,k=0.45)
## Start:  AIC=257
## type ~ 1
## 
##        Df Deviance AIC  LRT Pr(>Chi)    
## + glu   1      207 208 49.0  2.5e-12 ***
## + age   1      230 231 26.5  2.7e-07 ***
## + bmi   1      240 241 16.4  5.0e-05 ***
## + skin  1      245 246 11.7  0.00062 ***
## + bp    1      248 248  8.9  0.00291 ** 
## + ped   1      248 249  8.3  0.00396 ** 
## <none>         256 257                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=208
## type ~ glu
## 
##        Df Deviance AIC   LRT Pr(>Chi)   
## + age   1      197 198 10.27   0.0014 **
## + bmi   1      198 200  8.90   0.0028 **
## + ped   1      199 201  8.11   0.0044 **
## + skin  1      202 204  5.11   0.0238 * 
## + bp    1      206 207  1.47   0.2254   
## <none>         207 208                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=198
## type ~ glu + age
## 
##        Df Deviance AIC   LRT Pr(>Chi)   
## + ped   1      187 189 10.01   0.0016 **
## + bmi   1      188 190  8.71   0.0032 **
## + skin  1      194 195  3.62   0.0571 . 
## <none>         197 198                  
## + bp    1      197 199  0.01   0.9257   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=189
## type ~ glu + age + ped
## 
##        Df Deviance AIC  LRT Pr(>Chi)  
## + bmi   1      181 183 6.02    0.014 *
## + skin  1      185 187 2.46    0.117  
## <none>         187 189                
## + bp    1      187 189 0.03    0.855  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=183
## type ~ glu + age + ped + bmi
## 
##        Df Deviance AIC    LRT Pr(>Chi)
## <none>         181 183                
## + bp    1      181 184 0.0805     0.78
## + skin  1      181 184 0.0193     0.89
summary(forward.fit)
## 
## Call:
## glm(formula = type ~ glu + age + ped + bmi, family = "binomial", 
##     data = pima)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.086  -0.673  -0.369   0.682   2.209  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.97139    1.52759   -6.53  6.7e-11 ***
## glu          0.03126    0.00663    4.72  2.4e-06 ***
## age          0.05860    0.01757    3.33  0.00085 ***
## ped          1.71979    0.65609    2.62  0.00876 ** 
## bmi          0.07703    0.03225    2.39  0.01692 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 256.41  on 199  degrees of freedom
## Residual deviance: 181.08  on 195  degrees of freedom
## AIC: 191.1
## 
## Number of Fisher Scoring iterations: 5
#
# ~ Backward Selection Using Alpha-Criterion of 0.5 ~
#

# - Manually perform variable selection 
# remove variable with largest p-value

# STEP 1: 
drop1(glm.full, test = "LRT")
## Single term deletions
## 
## Model:
## type ~ glu + age + ped + bmi + bp + skin
##        Df Deviance AIC   LRT Pr(>Chi)    
## <none>         181 195                   
## glu     1      208 220 26.74  2.3e-07 ***
## age     1      192 204 10.92  0.00095 ***
## ped     1      188 200  7.28  0.00698 ** 
## bmi     1      185 197  3.62  0.05726 .  
## bp      1      181 193  0.07  0.78909    
## skin    1      181 193  0.01  0.91861    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# drop skin (p=.92)
# STEP 2: 
drop1(update(glm.full, ~ . -skin), test = "LRT")
## Single term deletions
## 
## Model:
## type ~ glu + age + ped + bmi + bp
##        Df Deviance AIC   LRT Pr(>Chi)    
## <none>         181 193                   
## glu     1      208 218 26.73  2.3e-07 ***
## age     1      192 202 10.96  0.00093 ***
## ped     1      188 198  7.27  0.00702 ** 
## bmi     1      187 197  6.06  0.01380 *  
## bp      1      181 191  0.08  0.77668    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# drop bp (p=0.78)
# STEP 3: 
drop1(update(glm.full, ~ . -skin -bp), test = "LRT")
## Single term deletions
## 
## Model:
## type ~ glu + age + ped + bmi
##        Df Deviance AIC   LRT Pr(>Chi)    
## <none>         181 191                   
## glu     1      208 216 26.93  2.1e-07 ***
## age     1      193 201 11.78   0.0006 ***
## ped     1      188 196  7.31   0.0069 ** 
## bmi     1      187 195  6.02   0.0142 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ALL remaining significant at 0.5 so stop
# final model is:
final.backward.fit <- glm(type ~ glu + age + ped + bmi, data=pima, family='binomial')
summary(final.backward.fit)
## 
## Call:
## glm(formula = type ~ glu + age + ped + bmi, family = "binomial", 
##     data = pima)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.086  -0.673  -0.369   0.682   2.209  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.97139    1.52759   -6.53  6.7e-11 ***
## glu          0.03126    0.00663    4.72  2.4e-06 ***
## age          0.05860    0.01757    3.33  0.00085 ***
## ped          1.71979    0.65609    2.62  0.00876 ** 
## bmi          0.07703    0.03225    2.39  0.01692 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 256.41  on 199  degrees of freedom
## Residual deviance: 181.08  on 195  degrees of freedom
## AIC: 191.1
## 
## Number of Fisher Scoring iterations: 5
# - Automate variable selection Using the step() function 
# -- Note: To perform forward selection using an alpha-to-stay criterion of 0.5, 
#          must set k in the step() function call equal to the 50th percentile of a 
#          chi-square(1) distribution ---> qchisq(0.5,1,lower.tail=FALSE) = 0.4549364

# - Define the "full" model with all potential covariates
full <- glm(type ~ glu + bp + skin + bmi + ped + age, data=pima, family='binomial')
# - Perform forward stepwise selection using the step() function
backward.fit <- step(full,direction="backward",test="LRT",k=0.45)
## Start:  AIC=184
## type ~ glu + bp + skin + bmi + ped + age
## 
##        Df Deviance AIC   LRT Pr(>Chi)    
## - skin  1      181 184  0.01  0.91861    
## - bp    1      181 184  0.07  0.78909    
## <none>         181 184                   
## - bmi   1      185 187  3.62  0.05726 .  
## - ped   1      188 191  7.28  0.00698 ** 
## - age   1      192 195 10.92  0.00095 ***
## - glu   1      208 210 26.74  2.3e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=184
## type ~ glu + bp + bmi + ped + age
## 
##        Df Deviance AIC   LRT Pr(>Chi)    
## - bp    1      181 183  0.08  0.77668    
## <none>         181 184                   
## - bmi   1      187 189  6.06  0.01380 *  
## - ped   1      188 190  7.27  0.00702 ** 
## - age   1      192 194 10.96  0.00093 ***
## - glu   1      208 210 26.73  2.3e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=183
## type ~ glu + bmi + ped + age
## 
##        Df Deviance AIC   LRT Pr(>Chi)    
## <none>         181 183                   
## - bmi   1      187 189  6.02   0.0142 *  
## - ped   1      188 190  7.31   0.0069 ** 
## - age   1      193 195 11.78   0.0006 ***
## - glu   1      208 210 26.93  2.1e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(backward.fit)
## 
## Call:
## glm(formula = type ~ glu + bmi + ped + age, family = "binomial", 
##     data = pima)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.086  -0.673  -0.369   0.682   2.209  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.97139    1.52759   -6.53  6.7e-11 ***
## glu          0.03126    0.00663    4.72  2.4e-06 ***
## bmi          0.07703    0.03225    2.39  0.01692 *  
## ped          1.71979    0.65609    2.62  0.00876 ** 
## age          0.05860    0.01757    3.33  0.00085 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 256.41  on 199  degrees of freedom
## Residual deviance: 181.08  on 195  degrees of freedom
## AIC: 191.1
## 
## Number of Fisher Scoring iterations: 5
# End of Porgram