In this document, two logistic regression models were built on CDC BRFSS survey data from 2000-2015 to predict whether a cardio-exercise recommendation is given (the dependent variable). The biometrics and exercise information (the independent variables) are the inputs of my Shiny app. Both models perform comparably, with precision of roughly 87-92% and recall of roughly 73-86% at the chosen thresholds. The pseudo R-squared is about 0.34-0.37, indicating that the models are not highly predictive and need improvement.
Since the AIC of Model A is slightly lower, I chose that model for the predictions in the Shiny app.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(126)
#data = readRDS(file = 'exer_trainD.rds')
data = readRDS(file = 'exer_data1.rds')
#data[] = lapply(data[, c(2:8)], factor)
# keep half of the data (stratified on the outcome) to speed things up
F1 <- createDataPartition(data$X_PAINDX1_F, p = 0.5, groups = 2)
data_sub = data[F1$Resample1,]
# 80/20 train/test split, again stratified on the outcome
F2 <- createDataPartition(data_sub$X_PAINDX1_F, p = 0.8, groups = 2)
select = c('X_PAINDX1_F_logic', 'X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'EXRACT11',
'X_BMI5CAT')
select_col = which(names(data) %in% select)  # column indices of the selected variables
train = data_sub[F2$Resample1,select_col]
test = data_sub[-F2$Resample1,select_col]
y = 'X_PAINDX1_F_logic'  # outcome: whether a cardio recommendation was given
x = select[2:8]          # predictor names
fmla <- paste(y, paste(x, collapse = "+"), sep = '~')  # builds y ~ x1 + x2 + ...
model <- glm(fmla, data = train, family = binomial(link = 'logit'))
modelA <- model
Looking at the coefficients of our glm model, the variables positively associated with a cardio recommendation (TRUE) include:

* no health care coverage;
* BMI categories 2 and 3 (the X_BMI5CAT4 coefficient is actually slightly negative);
* low-intensity exercise types such as frisbee, mowing the lawn, pilates, tai chi, volleyball, and yoga.

The other variables are negatively associated with the recommendation. After the raw printout below, a short sketch converts these coefficients into odds ratios.
model$coefficients
## (Intercept) X_HCVU651_FTRUE cutPADUR(30,45]
## -2.8941808064 0.1185464159 1.4322522467
## cutPADUR(45,60] cutPADUR(60,270] cutPAFRE(2e+03,3e+03]
## 2.3276324735 3.2742176027 1.7417225990
## cutPAFRE(3e+03,5e+03] cutPAFRE(5e+03,9.8e+04] X_AGE_G2
## 2.8593954384 3.8118122283 0.2134305103
## X_AGE_G3 X_AGE_G4 X_AGE_G5
## 0.5191298809 0.6606595894 0.8406146063
## SEX2 EXRACT112 EXRACT113
## 0.3072533794 1.5570681853 1.6674859744
## EXRACT114 EXRACT115 EXRACT116
## -0.0098611968 0.5497758383 1.1712131109
## EXRACT117 EXRACT118 EXRACT119
## 1.2555829862 1.1553262455 0.2748450616
## EXRACT1110 EXRACT1111 EXRACT1112
## 1.7065344510 -0.1263498816 0.7977618940
## EXRACT1113 EXRACT1114 EXRACT1115
## 0.2630773204 1.4399583732 0.3265002669
## EXRACT1116 EXRACT1117 EXRACT1118
## 0.3523714936 0.1508665378 0.3959048668
## EXRACT1119 EXRACT1120 EXRACT1121
## 0.4546329688 0.6057635636 1.8467048368
## EXRACT1122 EXRACT1123 EXRACT1124
## 0.6497278721 1.5819036519 0.9295026490
## EXRACT1125 EXRACT1126 EXRACT1127
## 0.8016332814 0.7383232636 0.9613665874
## EXRACT1128 EXRACT1129 EXRACT1130
## 1.1695033804 1.5869964816 0.8539977111
## EXRACT1131 EXRACT1132 EXRACT1133
## 0.1687307085 0.5456608561 0.0008244714
## EXRACT1134 EXRACT1135 EXRACT1136
## -2.4754357738 1.4378471307 -0.3998923470
## EXRACT1137 EXRACT1138 EXRACT1139
## 0.5741075027 1.3321253580 0.6209129043
## EXRACT1140 EXRACT1141 EXRACT1142
## 1.2419126632 0.9259913731 0.4626071676
## EXRACT1143 EXRACT1144 EXRACT1145
## 0.7616411330 0.6275213156 -0.9765509523
## EXRACT1146 EXRACT1147 EXRACT1148
## 0.4535214962 -2.1773487122 0.4551154347
## EXRACT1149 EXRACT1150 EXRACT1151
## 1.2597723614 0.2901549151 0.7701792496
## EXRACT1152 EXRACT1153 EXRACT1154
## 0.3129697470 0.4749703838 0.8476953041
## EXRACT1155 EXRACT1156 EXRACT1157
## -0.2363656560 0.3930643354 0.9989104445
## EXRACT1158 EXRACT1159 EXRACT1160
## 1.3814837740 0.1196334849 -3.0640818588
## EXRACT1161 EXRACT1162 EXRACT1163
## 1.0778433206 1.0756644122 0.3013473054
## EXRACT1164 EXRACT1165 EXRACT1166
## -0.4154515991 5.5153916900 1.2166569847
## EXRACT1167 EXRACT1168 EXRACT1169
## -3.0631410695 1.0909594898 -2.9949417714
## EXRACT1170 EXRACT1171 EXRACT1172
## 0.1224563504 -0.0269929914 0.0611635874
## EXRACT1173 EXRACT1174 EXRACT1175
## -0.2706939943 2.1934766833 -0.0691766247
## EXRACT1176 EXRACT1198 X_BMI5CAT2
## -0.0734553531 0.1631269120 0.1067658044
## X_BMI5CAT3 X_BMI5CAT4
## 0.0628017650 -0.0702466268
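Since these are log-odds coefficients, exponentiating them yields odds ratios, which makes the signs above easier to read. A minimal sketch, assuming modelA from the chunk above:
coefs <- coef(modelA)[-1]                    # drop the intercept
odds <- sort(exp(coefs), decreasing = TRUE)  # odds ratio > 1 raises the odds of a recommendation
head(odds)                                   # strongest positive associations
tail(odds)                                   # strongest negative associations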
The three variables with the highest importance are the frequency of exercise (cutPAFRE), the duration of exercise (cutPADUR), and age.
library(knitr)
library(tibble)
impDf = varImp(model)
class(impDf)
## [1] "data.frame"
impDf_order = data.frame( varNames = rownames(impDf)[order(impDf$Overall, decreasing = T)],
imp = impDf[order(impDf$Overall, decreasing = T),])
#impDf_order$varNames
kable(head(impDf_order, 15))
| varNames | imp |
|---|---|
| cutPAFRE(5e+03,9.8e+04] | 179.09528 |
| cutPAFRE(3e+03,5e+03] | 163.98075 |
| cutPADUR(60,270] | 139.97056 |
| cutPADUR(45,60] | 136.95213 |
| cutPAFRE(2e+03,3e+03] | 108.43637 |
| cutPADUR(30,45] | 83.89473 |
| X_AGE_G5 | 34.48024 |
| X_AGE_G4 | 27.09287 |
| SEX2 | 24.29000 |
| X_AGE_G3 | 20.95844 |
| EXRACT1167 | 20.29431 |
| EXRACT1169 | 18.88318 |
| EXRACT1134 | 13.15988 |
| EXRACT1160 | 11.40104 |
| EXRACT112 | 9.99763 |
train$pred <- predict(model, newdata = train, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
Looking at the density curves of the predicted probabilities, we can see that the positive cases (a recommendation for more cardio exercise) are concentrated on the right and the negative cases (no recommendation) on the left. The two classes are well separated. To turn the model into a classifier, we need to pick a threshold.
library(ggplot2)
ggplot(train, aes(x= pred, color = X_PAINDX1_F_logic, linetype=X_PAINDX1_F_logic)) +
geom_density()
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Warning: Removed 1 rows containing missing values (geom_path).
The figure above illustrates the trade-off between precision and recall. In our dataset, which covers 15 years of surveys, the 'prevalence' of a cardio-exercise recommendation is 46.9% (see the original survey statistics here). In this cleaned dataset, however, the rate of positive recommendations is 68% (a large share of the negative-recommendation cohort had missing data). We might want to use 0.68 or a bit lower as our threshold, since a model that leans toward positive recommendations can better motivate people to exercise. Let's first use 0.47 as the threshold to measure the lower bound of the precision.
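As a quick sanity check, the positive rate in the test set can be confirmed with a one-liner (a sketch, assuming the logical outcome column used throughout):
mean(as.numeric(test$X_PAINDX1_F_logic))  # expected to be close to 0.68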
ctab.test <- table(pred = test$pred > 0.47, recommend = test$X_PAINDX1_F_logic)
ctab.test
## recommend
## pred FALSE TRUE
## FALSE 11306 3834
## TRUE 7424 37054
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))
print(precision)
## [1] 0.833086
print(recall)
## [1] 0.9062317
print(enrich)
## [1] 1.214707
This means that, at this threshold, 83% of the people the model flags as needing more cardio exercise truly do (precision), and the flagged group captures 91% of all true positive cases (recall). Since our predictions will be made case by case, we may want to raise the precision and accept a lower recall. Now let's look at the upper bound of the precision (p > 0.68).
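The confusion-table blocks below all follow the same recipe, so a small helper can reproduce them in one call. This is only a sketch; eval_threshold is a hypothetical helper written for this document, not part of any package:
# Hypothetical helper: score a probability threshold with the same
# precision / recall / enrichment definitions used in this document.
eval_threshold <- function(pred, truth, thresh) {
  ctab <- table(pred = pred > thresh, recommend = truth)
  precision <- ctab[2, 2] / sum(ctab[2, ])
  recall <- ctab[2, 2] / sum(ctab[, 2])
  enrich <- precision / mean(as.numeric(truth))
  c(threshold = thresh, precision = precision, recall = recall, enrich = enrich)
}
# e.g. eval_threshold(test$pred, test$X_PAINDX1_F_logic, 0.68)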
ctab.test <- table(pred = test$pred > 0.68, recommend = test$X_PAINDX1_F_logic)
ctab.test
## recommend
## pred FALSE TRUE
## FALSE 15049 7802
## TRUE 3681 33086
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))
print(precision)
## [1] 0.899883
modelA.precision = precision
print(recall)
## [1] 0.8091861
modelA.recall = recall
print(enrich)
## [1] 1.312102
modelA.enrich = enrich
Guided by the density plot and the observed positive rate (0.68), we next try 0.60 as the threshold.
ctab.test <- table(pred = test$pred > 0.60, recommend = test$X_PAINDX1_F_logic)
ctab.test
## recommend
## pred FALSE TRUE
## FALSE 13347 5580
## TRUE 5383 35308
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))
print(precision)
## [1] 0.8677103
modelA.precision = precision
print(recall)
## [1] 0.8635296
modelA.recall = recall
print(enrich)
## [1] 1.265192
modelA.enrich = enrich
So here we have about 87% precision and 86% recall for the model implemented in the Shiny app.
It is also worthwhile to look at the ROC curve and AUC for a logistic regression model. Below is the ROC plot.
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
plot(roc(test$X_PAINDX1_F_logic, test$pred, direction="<"),
col="yellow", lwd=3, main="ROC")
Our logistic regression model has an AUC of 0.89, well above the 0.5 expected from random guessing.
library(ROCR)
eval <- prediction(test$pred, test$X_PAINDX1_F_logic)
plot(performance(eval, 'tpr','fpr'))
print(attributes(performance(eval, 'auc'))$y.values[1])
## [[1]]
## [1] 0.8893367
The pseudo R-squared is based on the deviances of the model: pseudo R-squared = 1 - (residual deviance / null deviance). Here I use a function to first compute the log likelihood, then the null deviance (analogous to the variance of the data around the average positive rate) and the residual deviance (analogous to the variance of the data around the model's predictions). I also use a chi-squared test on the drop in deviance to check whether the model predicts better than random guessing; this plays the same role as the F-statistic in linear regression.
loglikelihood <- function(y, py) {
sum(y*log(py) + (1 - y)*log(1 - py)) #y is the numeric outcome, py is the predicted probability
}
pnull <- mean(as.numeric(train$X_PAINDX1_F_logic))
null.dev <- -2*loglikelihood(as.numeric(train$X_PAINDX1_F_logic), pnull)
pred <- predict(model, newdata = train, type = 'response')
resid.dev <- -2*loglikelihood(as.numeric(train$X_PAINDX1_F_logic), pred)
pr2 <- 1-(resid.dev/null.dev)
model$deviance == resid.dev
## [1] TRUE
## on test dataset
testy <- as.numeric(test$X_PAINDX1_F_logic)
testpred <- predict(model, newdata = test, type = 'response')
pnull.test <- mean(testy)
null.dev.test <- -2*loglikelihood(testy, pnull.test)
resid.dev.test <- -2*loglikelihood(testy, testpred)
pr2.test <- 1-(resid.dev.test/null.dev.test)
print(pr2)
## [1] 0.3717028
print(pr2.test)
## [1] 0.3654533
## calculating significance of the observed fit
df.null <- dim(train)[[1]] - 1
df.model <- dim(train)[[1]] - length(model$coefficients)
delDev <- null.dev - resid.dev
deldf <- df.null - df.model
p <- pchisq(delDev, deldf, lower.tail = T) # probability of a smaller drop in deviance; the p-value of the fit is 1 - p
print(p)
## [1] 1
## we can also use the pscl package to calculate pseudo r^2
library(pscl)
## Loading required package: MASS
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(model)
## llh llhNull G2 McFadden r2ML
## -9.325436e+04 -1.484240e+05 1.103392e+05 3.717028e-01 3.704126e-01
## r2CU
## 5.202442e-01
The model explains about 37% of the deviance. It has limited predictive power; there may be more important variables in the original dataset that I did not identify.
The first model performs reasonably well but not perfectly. One reason, I suspect, is that there are too many categories of exercise type, which leads to more than 100 dummy variables and an overfitted model. So here I instead use each activity's MET (Metabolic Equivalent of Task) value as a predictor of the recommendation. I used this as a guide to categorize the exercise types by MET; a sketch of the idea follows.
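The met column was already built during data cleaning, so the snippet below only illustrates the approach. The activity codes and MET values are hypothetical placeholders (the factor levels such as met35 suggest MET x 10, e.g. 3.5 METs, but that encoding is my assumption):
# Illustrative only: map EXRACT11 activity codes to MET scores and treat
# the score as a categorical predictor. Codes and values are placeholders.
met_lookup <- c('1' = 35, '2' = 70, '3' = 80)  # hypothetical code -> MET x 10
# data$met <- factor(met_lookup[as.character(data$EXRACT11)])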
library(caret)
set.seed(126)
#data = readRDS(file = 'exer_trainD.rds')
data = readRDS(file = 'exer_data1.rds')
#data[] = lapply(data[, c(2:8)], factor)
F1 <- createDataPartition(data$X_PAINDX1_F, p = 0.5, groups = 2)
data_sub = data[F1$Resample1,]
F2 <- createDataPartition(data_sub$X_PAINDX1_F, p = 0.8, groups = 2)
select = c('X_PAINDX1_F_logic', 'X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'met',
'X_BMI5CAT')
select_col = which(names(data) %in% select)  # column indices of the selected variables
train = data_sub[F2$Resample1,select_col]
test = data_sub[-F2$Resample1,select_col]
y = 'X_PAINDX1_F_logic'
x = select[2:8]
fmla <- paste(y, paste(x, collapse = "+"), sep = '~')
model <- glm(fmla, data = train, family = binomial(link = 'logit'))
modelB <- model
library(knitr)
library(tibble)
impDf = varImp(model)
class(impDf)
## [1] "data.frame"
impDf_order = data.frame( varNames = rownames(impDf)[order(impDf$Overall, decreasing = T)],
imp = impDf[order(impDf$Overall, decreasing = T),])
#impDf_order$varNames
kable(head(impDf_order, 15))
| varNames | imp |
|---|---|
| cutPAFRE(5e+03,9.8e+04] | 177.55567 |
| cutPAFRE(3e+03,5e+03] | 161.37485 |
| cutPADUR(60,270] | 149.64550 |
| cutPADUR(45,60] | 136.01189 |
| cutPAFRE(2e+03,3e+03] | 105.56264 |
| met60 | 90.50231 |
| cutPADUR(30,45] | 82.79419 |
| met68 | 78.62668 |
| met50 | 68.63656 |
| met73 | 64.45925 |
| met35 | 63.57750 |
| met70 | 61.38488 |
| X_AGE_G5 | 41.02671 |
| met65 | 36.34076 |
| met38 | 35.12440 |
train$pred <- predict(model, newdata = train, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
library(ggplot2)
ggplot(train, aes(x= pred, color = X_PAINDX1_F_logic, linetype=X_PAINDX1_F_logic)) +
geom_density()
### Model trade-offs
library(ROCR)
library(grid)
predObj <- prediction(train$pred, train$X_PAINDX1_F_logic)
precObj <- performance(predObj, measure = 'prec')
recObj <- performance(predObj, measure = 'rec')
precision <- (precObj@y.values)[[1]]
prec.x <- (precObj@x.values)[[1]]
recall <- (recObj@y.values)[[1]]
rocFrame <- data.frame(threshold = prec.x, precision = precision,
recall = recall)
# helper: stack a list of ggplots vertically on one page
nplot <- function(plist) {
n <- length(plist)
grid.newpage()
pushViewport(viewport(layout = grid.layout(n, 1)))
vplayout = function (x, y) {
viewport(layout.pos.row = x, layout.pos.col = y)
}
for (i in 1:n) {
print(plist[[i]], vp = vplayout(i, 1))
}
}
pnull <- mean(as.numeric(train$X_PAINDX1_F_logic))
p1 <- ggplot(rocFrame, aes(x=threshold)) +
geom_line(aes(y = precision/pnull)) +
coord_cartesian(xlim = c(0,1), ylim = c(0, 10))
p2 <- ggplot(rocFrame, aes(x=threshold)) +
geom_line(aes(y = recall)) +
coord_cartesian(xlim = c(0, 1))
nplot(list(p1, p2))
## Warning: Removed 1 rows containing missing values (geom_path).
ctab.test <- table(pred = test$pred > 0.47, recommend = test$X_PAINDX1_F_logic)
ctab.test
## recommend
## pred FALSE TRUE
## FALSE 11012 4182
## TRUE 7718 36706
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))
print(precision)
## [1] 0.8262651
print(recall)
## [1] 0.8977206
print(enrich)
## [1] 1.204761
ctab.test <- table(pred = test$pred > 0.75, recommend = test$X_PAINDX1_F_logic)
ctab.test
## recommend
## pred FALSE TRUE
## FALSE 16073 10972
## TRUE 2657 29916
precision <- ctab.test[2,2]/sum(ctab.test[2,])
recall <- ctab.test[2,2]/sum(ctab.test[,2])
enrich <- precision/mean(as.numeric(test$X_PAINDX1_F_logic))
print(precision)
## [1] 0.9184294
modelB.precision = precision
print(recall)
## [1] 0.7316572
modelB.recall = recall
print(enrich)
## [1] 1.339144
modelB.enrich = enrich
library(pROC)
plot(roc(test$X_PAINDX1_F_logic, test$pred, direction="<"),
col="yellow", lwd=3, main="ROC")
library(ROCR)
eval <- prediction(test$pred, test$X_PAINDX1_F_logic)
plot(performance(eval, 'tpr','fpr'))
print(attributes(performance(eval, 'auc'))$y.values[1])
## [[1]]
## [1] 0.8745886
loglikelihood <- function(y, py) {
sum(y*log(py) + (1 - y)*log(1 - py)) #y is the numeric outcome, py is the predicted probability
}
pnull <- mean(as.numeric(train$X_PAINDX1_F_logic))
null.dev <- -2*loglikelihood(as.numeric(train$X_PAINDX1_F_logic), pnull)
pred <- predict(model, newdata = train, type = 'response')
resid.dev <- -2*loglikelihood(as.numeric(train$X_PAINDX1_F_logic), pred)
pr2 <- 1-(resid.dev/null.dev)
model$deviance == resid.dev
## [1] TRUE
## on test dataset
testy <- as.numeric(test$X_PAINDX1_F_logic)
testpred <- predict(model, newdata = test, type = 'response')
pnull.test <- mean(testy)
null.dev.test <- -2*loglikelihood(testy, pnull.test)
resid.dev.test <- -2*loglikelihood(testy, testpred)
pr2.test <- 1-(resid.dev.test/null.dev.test)
print(pr2)
## [1] 0.3440615
print(pr2.test)
## [1] 0.3374944
## calculating significance of the observed fit
df.null <- dim(train)[[1]] - 1
df.model <- dim(train)[[1]] - length(model$coefficients)
delDev <- null.dev - resid.dev
deldf <- df.null - df.model
p <- pchisq(delDev, deldf, lower.tail = T) # probability of a smaller drop in deviance; the p-value of the fit is 1 - p
print(p)
## [1] 1
## we can also use the pscl package to calculate pseudo r^2
library(pscl)
pR2(model)
## llh llhNull G2 McFadden r2ML
## -9.735699e+04 -1.484240e+05 1.021339e+05 3.440615e-01 3.483730e-01
## r2CU
## 4.892896e-01
anova(modelA, modelB, test = 'Chisq')
## Analysis of Deviance Table
##
## Model 1: X_PAINDX1_F_logic ~ X_HCVU651_F + cutPADUR + cutPAFRE + X_AGE_G +
## SEX + EXRACT11 + X_BMI5CAT
## Model 2: X_PAINDX1_F_logic ~ X_HCVU651_F + cutPADUR + cutPAFRE + X_AGE_G +
## SEX + met + X_BMI5CAT
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 238381 186509
## 2 238430 194714 -49 -8205.3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(modelA$aic)
## [1] 186692.7
print(modelB$aic)
## [1] 194800
compare = data.frame(model = c('A', 'B'),
AIC = c(modelA$aic, modelB$aic),
precision = c(modelA.precision, modelB.precision),
recall = c(modelA.recall, modelB.recall),
enrich = c(modelA.enrich, modelB.enrich)
)
kable(compare)
| model | AIC | precision | recall | enrich |
|---|---|---|---|---|
| A | 186692.7 | 0.8677103 | 0.8635296 | 1.265192 |
| B | 194800.0 | 0.9184294 | 0.7316572 | 1.339144 |