Summary

In this document two logic regression models were built on the CDC BRFSS survey data during 2000-2015 to predict the recommendation of cardio exercise (dependent variable). I used the biometrics and exercise information (independent variables) in the input of my shinyapp. Both model perform similar with a precision of > 90% and a recall of > 70%. Pseudo R-squred is 0.34, indicating that the models are not highly predictive and need improvment.

The performance of these two models are relatively similar. Since AIC of Model A is slightly lower, I choose this model for the prediction in the Shiny App.

Model A

Using biometrics and exercise information to build a naive logistic model
    library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
    set.seed(126)
    
    #data = readRDS(file = 'exer_trainD.rds')
    data = readRDS(file = 'exer_data1.rds')
    #data[] = lapply(data[, c(2:8)], factor)
    
    F1 <- createDataPartition(data$X_PAINDX1_F, p = 0.5, groups = 2)
    data_sub = data[F1$Resample1,]
    F2 <- createDataPartition(data_sub$X_PAINDX1_F, p = 0.8, groups = 2)
    
    select = c('X_PAINDX1_F_logic', 'X_HCVU651_F', 
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'EXRACT11',
               'X_BMI5CAT')
    
    select_col = c()
    for (i in 1:length(names(data))) {
        if (names(data)[i] %in% select) {
            select_col = c(select_col, i)
        }
    }
    
    train = data_sub[F2$Resample1,select_col]
    test = data_sub[-F2$Resample1,select_col]
    y = 'X_PAINDX1_F_logic'
    x = select[2:8]
    fmla <- paste(y, paste(x, collapse = "+"), sep = '~')
    model <- glm(fmla, data = train, family = binomial(link = 'logit'))
    modelA <- model
Extracting advice from logistic model

By looking at coefficients of our glm model, we find variables that are positively correlated with the cardio recommendation (TRUE) includes: * No health care coverage * BMI category 4: BMI > 30 * Exercise types that are correlated with the cardio recommendation: frisbee, mowing lawn, pilates, tai chi, volley ball, yoga, etc (those activities are low intensity)

Other variables are negatively correlated with the cardio recommendation.

model$coefficients
##             (Intercept)         X_HCVU651_FTRUE         cutPADUR(30,45] 
##           -2.8941808064            0.1185464159            1.4322522467 
##         cutPADUR(45,60]        cutPADUR(60,270]   cutPAFRE(2e+03,3e+03] 
##            2.3276324735            3.2742176027            1.7417225990 
##   cutPAFRE(3e+03,5e+03] cutPAFRE(5e+03,9.8e+04]                X_AGE_G2 
##            2.8593954384            3.8118122283            0.2134305103 
##                X_AGE_G3                X_AGE_G4                X_AGE_G5 
##            0.5191298809            0.6606595894            0.8406146063 
##                    SEX2               EXRACT112               EXRACT113 
##            0.3072533794            1.5570681853            1.6674859744 
##               EXRACT114               EXRACT115               EXRACT116 
##           -0.0098611968            0.5497758383            1.1712131109 
##               EXRACT117               EXRACT118               EXRACT119 
##            1.2555829862            1.1553262455            0.2748450616 
##              EXRACT1110              EXRACT1111              EXRACT1112 
##            1.7065344510           -0.1263498816            0.7977618940 
##              EXRACT1113              EXRACT1114              EXRACT1115 
##            0.2630773204            1.4399583732            0.3265002669 
##              EXRACT1116              EXRACT1117              EXRACT1118 
##            0.3523714936            0.1508665378            0.3959048668 
##              EXRACT1119              EXRACT1120              EXRACT1121 
##            0.4546329688            0.6057635636            1.8467048368 
##              EXRACT1122              EXRACT1123              EXRACT1124 
##            0.6497278721            1.5819036519            0.9295026490 
##              EXRACT1125              EXRACT1126              EXRACT1127 
##            0.8016332814            0.7383232636            0.9613665874 
##              EXRACT1128              EXRACT1129              EXRACT1130 
##            1.1695033804            1.5869964816            0.8539977111 
##              EXRACT1131              EXRACT1132              EXRACT1133 
##            0.1687307085            0.5456608561            0.0008244714 
##              EXRACT1134              EXRACT1135              EXRACT1136 
##           -2.4754357738            1.4378471307           -0.3998923470 
##              EXRACT1137              EXRACT1138              EXRACT1139 
##            0.5741075027            1.3321253580            0.6209129043 
##              EXRACT1140              EXRACT1141              EXRACT1142 
##            1.2419126632            0.9259913731            0.4626071676 
##              EXRACT1143              EXRACT1144              EXRACT1145 
##            0.7616411330            0.6275213156           -0.9765509523 
##              EXRACT1146              EXRACT1147              EXRACT1148 
##            0.4535214962           -2.1773487122            0.4551154347 
##              EXRACT1149              EXRACT1150              EXRACT1151 
##            1.2597723614            0.2901549151            0.7701792496 
##              EXRACT1152              EXRACT1153              EXRACT1154 
##            0.3129697470            0.4749703838            0.8476953041 
##              EXRACT1155              EXRACT1156              EXRACT1157 
##           -0.2363656560            0.3930643354            0.9989104445 
##              EXRACT1158              EXRACT1159              EXRACT1160 
##            1.3814837740            0.1196334849           -3.0640818588 
##              EXRACT1161              EXRACT1162              EXRACT1163 
##            1.0778433206            1.0756644122            0.3013473054 
##              EXRACT1164              EXRACT1165              EXRACT1166 
##           -0.4154515991            5.5153916900            1.2166569847 
##              EXRACT1167              EXRACT1168              EXRACT1169 
##           -3.0631410695            1.0909594898           -2.9949417714 
##              EXRACT1170              EXRACT1171              EXRACT1172 
##            0.1224563504           -0.0269929914            0.0611635874 
##              EXRACT1173              EXRACT1174              EXRACT1175 
##           -0.2706939943            2.1934766833           -0.0691766247 
##              EXRACT1176              EXRACT1198              X_BMI5CAT2 
##           -0.0734553531            0.1631269120            0.1067658044 
##              X_BMI5CAT3              X_BMI5CAT4 
##            0.0628017650           -0.0702466268
Variable importance

The variables with the top 3 highest importance includes the frequency of exercise (cutPAFRE), the duration of exercise (cutPADUR),and age.

library(knitr)
library(tibble)
impDf = varImp(model)
class(impDf)

[1] “data.frame”

impDf_order = data.frame( varNames = rownames(impDf)[order(impDf$Overall, decreasing = T)],
                          imp = impDf[order(impDf$Overall, decreasing = T),])
#impDf_order$varNames
kable(head(impDf_order, 15))
varNames imp
cutPAFRE(5e+03,9.8e+04] 179.09528
cutPAFRE(3e+03,5e+03] 163.98075
cutPADUR(60,270] 139.97056
cutPADUR(45,60] 136.95213
cutPAFRE(2e+03,3e+03] 108.43637
cutPADUR(30,45] 83.89473
X_AGE_G5 34.48024
X_AGE_G4 27.09287
SEX2 24.29000
X_AGE_G3 20.95844
EXRACT1167 20.29431
EXRACT1169 18.88318
EXRACT1134 13.15988
EXRACT1160 11.40104
EXRACT112 9.99763

Make predictions and evaluate predictions

predict
train$pred <- predict(model, newdata = train, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
characterize prediction quality

By looking at the density curve of predicted probabilities, we can see that the positive predictions (recommendation on more cardio exercise) are concentrated on the right and the negative (not giving recommendation on more cardio exercise) are concentrated on the left. Those two instances are well separated. To turn the model into a classifier, we need to pick a thresold.

library(ggplot2)
ggplot(train, aes(x= pred, color = X_PAINDX1_F_logic, linetype=X_PAINDX1_F_logic)) +
    geom_density()

Model trade-offs
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
## Warning: Removed 1 rows containing missing values (geom_path).

Pick a threshold

The above figre illustrates the trade-off between precision and recall. In our dataset which incldues 15 years survey, the ‘prevalence’ of giving cardio exercise recommendation is 46.9% (see the original survey statistics here). However, in this cleaned dataset the rate of positive recommendation is 68% (there were a large number of missing data in the negative recommendation cohort). We might want to use 0.68 or a bit lower as our threshold, because we want our model have higher probability of giving positive recommendations on cardio exercise so it can motivate people to exercise. Let’s use 0.47 as a thresold first to measure the lower bound of the precision.

Evaluate the chosen model
ctab.test <- table(pred = test$pred > 0.47, recommend = test$X_PAINDX1_F_logic)
ctab.test
##        recommend
## pred    FALSE  TRUE
##   FALSE 11306  3834
##   TRUE   7424 37054
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))

print(precision)
## [1] 0.833086
print(recall)
## [1] 0.9062317
print(enrich)
## [1] 1.214707

This means that we have can identify 83 % people who need exercise among all positive case and identify a set of potential group of people who need exercise which contains 91 % cases in the true positive cases. Since our prediction will be case-to-case, we may want to increase our precision and lower the recall. Now let’s look at the higher bound of the precision (p > 0.68).

ctab.test <- table(pred = test$pred > 0.68, recommend = test$X_PAINDX1_F_logic)
ctab.test
##        recommend
## pred    FALSE  TRUE
##   FALSE 15049  7802
##   TRUE   3681 33086
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))

print(precision)
## [1] 0.899883
modelA.precision = precision
print(recall)
## [1] 0.8091861
modelA.recall = recall
print(enrich)
## [1] 1.312102
modelA.enrich = enrich

By looking at the density plot and the true rate of positive (0.68), we decide to try to use 0.60 as the thresould.

ctab.test <- table(pred = test$pred > 0.60, recommend = test$X_PAINDX1_F_logic)
ctab.test
##        recommend
## pred    FALSE  TRUE
##   FALSE 13347  5580
##   TRUE   5383 35308
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))

print(precision)
## [1] 0.8677103
modelA.precision = precision
print(recall)
## [1] 0.8635296
modelA.recall = recall
print(enrich)
## [1] 1.265192
modelA.enrich = enrich

So here we have a 86% precision and 86% recall for our model implemented in the shinyapp.

ROC and AUC

It is worthwhile to look at ROC and AUC in logistic regression model. Below is the ROC plot.

ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
plot(roc(test$X_PAINDX1_F_logic, test$pred, direction="<"),
     col="yellow", lwd=3, main="ROC")

Area under the curve

Our logistic regression model has auc of 0.89, which means that it is much better than just random guess.

library(ROCR)
eval <- prediction(test$pred, test$X_PAINDX1_F_logic)
plot(performance(eval, 'tpr','fpr'))

print(attributes(performance(eval, 'auc'))$y.values[1])
## [[1]]
## [1] 0.8893367
The pseudo R^2

The pseduo r squared is based on the deviances of the model. Here I used a function to first calculate the log likelihood, the null deviance (the variances of the data around the average positive cases), and the residual deviance (similar to the variances of the data around the model). I also used a chisq test to see if the model predictions are better than just random guessing. Here the chisq test on the null deviance and the residual deviance is similar to the F-statistictics for the linear regression.

loglikelihood <- function(y, py) {
    sum(y*log(py) + (1 - y)*log(1 - py)) #y is the numeric outcome, py is the predicted probability
}
pnull <- mean(as.numeric(train$X_PAINDX1_F_logic))
null.dev <- -2*loglikelihood(as.numeric(train$X_PAINDX1_F_logic), pnull)
pred <- predict(model, newdata = train, type = 'response')
resid.dev <- -2*loglikelihood(as.numeric(train$X_PAINDX1_F_logic), pred)
pr2 <- 1-(resid.dev/null.dev)
model$deviance == resid.dev
## [1] TRUE
## on test dataset
testy <- as.numeric(test$X_PAINDX1_F_logic)
testpred <- predict(model, newdata = test, type = 'response')
pnull.test <- mean(testy)
null.dev.test <- -2*loglikelihood(testy, pnull.test)
resid.dev.test <- -2*loglikelihood(testy, testpred)
pr2.test <- 1-(resid.dev.test/null.dev.test)
print(pr2)
## [1] 0.3717028
print(pr2.test)
## [1] 0.3654533
## calculating signififccane of the observed fit
df.null <- dim(train)[[1]] - 1
df.model <- dim(train)[[1]] - length(model$coefficients)

delDev <- null.dev - resid.dev
deldf <- df.null - df.model
p <- pchisq(delDev, deldf, lower.tail = T)
print(p)
## [1] 1
## we can also use pscl package to calcualte pseudo r^2
library(pscl)
## Loading required package: MASS
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(model)
##           llh       llhNull            G2      McFadden          r2ML 
## -9.325436e+04 -1.484240e+05  1.103392e+05  3.717028e-01  3.704126e-01 
##          r2CU 
##  5.202442e-01

The model explains about 37% of the deviance. This model has limited predicted power as there may be more importance variables that I did not identify in the original dataset.

Model B

Using biometrics and exercise information to build a naive logistic model

The first model performs ok but not perfect. One reason I suspect is that there were too many categories in the exercise type, which lead to more than 100 variables, resulting in a overfitted model. so here I tried to use activities’ MET (Metabolic Equivalent of Task) value as a variable to predict the recommendation. I used this as a guidance to categorize the exercise type based on MET.

 library(caret)
    set.seed(126)
    
    #data = readRDS(file = 'exer_trainD.rds')
    data = readRDS(file = 'exer_data1.rds')
    #data[] = lapply(data[, c(2:8)], factor)
    
    F1 <- createDataPartition(data$X_PAINDX1_F, p = 0.5, groups = 2)
    data_sub = data[F1$Resample1,]
    F2 <- createDataPartition(data_sub$X_PAINDX1_F, p = 0.8, groups = 2)
    
    select = c('X_PAINDX1_F_logic', 'X_HCVU651_F', 
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'met',
               'X_BMI5CAT')
    
    select_col = c()
    for (i in 1:length(names(data))) {
        if (names(data)[i] %in% select) {
            select_col = c(select_col, i)
        }
    }
    
    train = data_sub[F2$Resample1,select_col]
    test = data_sub[-F2$Resample1,select_col]
    y = 'X_PAINDX1_F_logic'
    x = select[2:8]
    fmla <- paste(y, paste(x, collapse = "+"), sep = '~')
    model <- glm(fmla, data = train, family = binomial(link = 'logit'))
    modelB <- model
library(knitr)
library(tibble)
impDf = varImp(model)
class(impDf)
## [1] "data.frame"
impDf_order = data.frame( varNames = rownames(impDf)[order(impDf$Overall, decreasing = T)],
                          imp = impDf[order(impDf$Overall, decreasing = T),])
#impDf_order$varNames
kable(head(impDf_order, 15))
varNames imp
cutPAFRE(5e+03,9.8e+04] 177.55567
cutPAFRE(3e+03,5e+03] 161.37485
cutPADUR(60,270] 149.64550
cutPADUR(45,60] 136.01189
cutPAFRE(2e+03,3e+03] 105.56264
met60 90.50231
cutPADUR(30,45] 82.79419
met68 78.62668
met50 68.63656
met73 64.45925
met35 63.57750
met70 61.38488
X_AGE_G5 41.02671
met65 36.34076
met38 35.12440
train$pred <- predict(model, newdata = train, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
library(ggplot2)
ggplot(train, aes(x= pred, color = X_PAINDX1_F_logic, linetype=X_PAINDX1_F_logic)) +
    geom_density()

### model trade-offs
library(ROCR)
library(grid)

predObj <- prediction(train$pred, train$X_PAINDX1_F_logic)
precObj <- performance(predObj, measure = 'prec')
recObj <- performance(predObj, measure = 'rec')

precision <- (precObj@y.values)[[1]]
prec.x <- (precObj@x.values)[[1]]
recall <- (recObj@y.values)[[1]]

rocFrame <- data.frame(threshould = prec.x, precision = precision,
                       recall = recall)

nplot <- function(plist) {
    n <- length(plist)
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(n, 1)))
    vplayout = function (x, y) {
        viewport(layout.pos.row = x, layout.pos.col = y)
    }
    for (i in 1:n) {
        print(plist[[i]], vp = vplayout(i, 1))
    }
}
pnull <- mean(as.numeric(train$X_PAINDX1_F_logic))

p1 <- ggplot(rocFrame, aes(x=threshould)) + 
    geom_line(aes(y = precision/pnull)) +
    coord_cartesian(xlim = c(0,1), ylim = c(0, 10))

p2 <- ggplot(rocFrame, aes(x=threshould)) +
    geom_line(aes(y = recall)) +
    coord_cartesian(xlim = c(0, 1))

nplot(list(p1, p2))
## Warning: Removed 1 rows containing missing values (geom_path).

Evaluating
ctab.test <- table(pred = test$pred > 0.47, recommend = test$X_PAINDX1_F_logic)
ctab.test
##        recommend
## pred    FALSE  TRUE
##   FALSE 11012  4182
##   TRUE   7718 36706
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))

print(precision)
## [1] 0.8262651
print(recall)
## [1] 0.8977206
print(enrich)
## [1] 1.204761
ctab.test <- table(pred = test$pred > 0.75, recommend = test$X_PAINDX1_F_logic)
ctab.test
##        recommend
## pred    FALSE  TRUE
##   FALSE 16073 10972
##   TRUE   2657 29916
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))

print(precision)
## [1] 0.9184294
modelB.precision = precision
print(recall)
## [1] 0.7316572
modelB.recall = recall
print(enrich)
## [1] 1.339144
modelB.enrich = enrich
ROC
library(pROC)
plot(roc(test$X_PAINDX1_F_logic, test$pred, direction="<"),
     col="yellow", lwd=3, main="ROC")

Area under the curve
library(ROCR)
eval <- prediction(test$pred, test$X_PAINDX1_F_logic)
plot(performance(eval, 'tpr','fpr'))

print(attributes(performance(eval, 'auc'))$y.values[1])
## [[1]]
## [1] 0.8745886
The pseudo R^2
loglikelihood <- function(y, py) {
    sum(y*log(py) + (1 - y)*log(1 - py)) #y is the numeric outcome, py is the predicted probability
}
pnull <- mean(as.numeric(train$X_PAINDX1_F_logic))
null.dev <- -2*loglikelihood(as.numeric(train$X_PAINDX1_F_logic), pnull)
pred <- predict(model, newdata = train, type = 'response')
resid.dev <- -2*loglikelihood(as.numeric(train$X_PAINDX1_F_logic), pred)
pr2 <- 1-(resid.dev/null.dev)
model$deviance == resid.dev
## [1] TRUE
## on test dataset
testy <- as.numeric(test$X_PAINDX1_F_logic)
testpred <- predict(model, newdata = test, type = 'response')
pnull.test <- mean(testy)
null.dev.test <- -2*loglikelihood(testy, pnull.test)
resid.dev.test <- -2*loglikelihood(testy, testpred)
pr2.test <- 1-(resid.dev.test/null.dev.test)
print(pr2)
## [1] 0.3440615
print(pr2.test)
## [1] 0.3374944
## calculating signififccane of the observed fit
df.null <- dim(train)[[1]] - 1
df.model <- dim(train)[[1]] - length(model$coefficients)

delDev <- null.dev - resid.dev
deldf <- df.null - df.model
p <- pchisq(delDev, deldf, lower.tail = T)
print(p)
## [1] 1
## we can also use pscl package to calcualte pseudo r^2
library(pscl)
pR2(model)
##           llh       llhNull            G2      McFadden          r2ML 
## -9.735699e+04 -1.484240e+05  1.021339e+05  3.440615e-01  3.483730e-01 
##          r2CU 
##  4.892896e-01

Comparison of two models

anova(modelA, modelB, test = 'Chisq')
## Analysis of Deviance Table
## 
## Model 1: X_PAINDX1_F_logic ~ X_HCVU651_F + cutPADUR + cutPAFRE + X_AGE_G + 
##     SEX + EXRACT11 + X_BMI5CAT
## Model 2: X_PAINDX1_F_logic ~ X_HCVU651_F + cutPADUR + cutPAFRE + X_AGE_G + 
##     SEX + met + X_BMI5CAT
##   Resid. Df Resid. Dev  Df Deviance  Pr(>Chi)    
## 1    238381     186509                           
## 2    238430     194714 -49  -8205.3 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(modelA$aic)
## [1] 186692.7
print(modelB$aic)
## [1] 194800
compare = data.frame(model = c('A', 'B'),
                     AIC = c(modelA$aic, modelB$aic),
                     precision = c(modelA.precision, modelB.precision),
                     recall = c(modelA.recall, modelB.recall),
                     enrich = c(modelA.enrich, modelB.enrich)
                     )
kable(compare)
model AIC precision recall enrich
A 186692.7 0.8677103 0.8635296 1.265192
B 194800.0 0.9184294 0.7316572 1.339144