Libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v tibble 3.0.3 v dplyr 1.0.3
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(patchwork)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(ggsn)
## Loading required package: grid
library(foreign)
library(survey)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
Importing NHANES data
DEMO_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DEMO_J.XPT",
NULL)
ALQ_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/ALQ_J.XPT",
NULL)
BMX_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/BMX_J.XPT",
NULL)
DIQ_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DIQ_J.XPT",
NULL)
SMQ_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/SMQ_J.XPT",
NULL)
Merging data into master file
Master1 <- merge(DEMO_J, ALQ_J, all.x=TRUE)
Master2 <- merge(Master1, BMX_J, all.x=TRUE)
Master3 <- merge(Master2, DIQ_J, all.x=TRUE)
Master4 <- merge(Master3, SMQ_J, all.x=TRUE)
Recoding, filtering, and cleaning up the data
nams<-names(Master4)
head(nams, n=10)
## [1] "SEQN" "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
## [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGM"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(Master4)<-newnames
#sex
Master4$sex<-as.factor(ifelse(Master4$riagendr==1,
"Male",
"Female"))
#race/ethnicity
Master4$hispanic <- Recode(Master4$ridreth3, recodes = "1=1; 2=1; else=0")
Master4$black<-Recode(Master4$ridreth3, recodes="4=1; else=0")
Master4$white<-Recode(Master4$ridreth3, recodes="3=1; else=0")
Master4$asian<-Recode(Master4$ridreth3, recodes="6=1; else=0")
Master4$other<-Recode(Master4$ridreth3, recodes="7=1; else=0")
Master4$race_eth<- Recode(Master4$ridreth3, recodes= "3= '1 white'; 1:2='2 hispanic'; 4= '3 black'; 6= '4 asian'; 7= '5 other' ", as.factor=T )
# at or below poverty threshold using income to poverty ratio
Master4$pov<-ifelse(Master4$indfmpir <= 1, "at or below poverty",
ifelse(Master4$indfmpir > 1, "above poverty", NA))
Master4$pov2<- Recode(Master4$pov, recodes= "'at or below poverty'=1; 'above poverty'=0", as.factor=T)
# education
Master4$educ <- Recode(Master4$dmdeduc2, recodes="1:2='1 lths'; 3='2 hs'; 4='3 some col'; 5='4 col grad'; 7:9=NA; else=NA", as.factor=T)
# marital status
Master4$marst <- Recode(Master4$dmdmartl, recodes="1='1 married'; 2='2 widowed'; 3='3 divorced'; 4='4 separated'; 5='5 nm'; 6='6 cohab'; else=NA", as.factor=T)
# age
Master4$agec<-cut(Master4$ridageyr,
breaks=c(0,17,65,80))
# smoking status
Master4$cur_smk <- Recode(Master4$smq040, recodes="1:2 = 1; else=0", as.factor=T)
# ever used e-cig
Master4$evr_ecig <- Recode(Master4$smq900, recodes="1= 1; 2=0; else=NA", as.factor=T)
# ever consumed alcohol
Master4$evr_alc <- Recode(Master4$alq111, recodes="1= 1; 2=0; else=NA", as.factor=T)
# diabetes
Master4$diabetes <- Recode(Master4$diq010, recodes="1= 1; 2:3=0; else=NA", as.factor=T)
# prediabetes
Master4$pre_dbts <- Recode(Master4$diq160, recodes="1= 1; 2=0; else=NA", as.factor=T)
# health risk for diabetes
Master4$hlth_rsk <- Recode(Master4$diq170, recodes="1= 1; 2=0; else=NA", as.factor=T)
# BMI
Master4$bmicat <- ifelse(Master4$bmxbmi < 18.5, "1 underweight",
ifelse(Master4$bmxbmi >=18.5 & Master4$bmxbmi < 25, "2 normal",
ifelse(Master4$bmxbmi >=25 & Master4$bmxbmi< 30, "3 overweight",
ifelse(Master4$bmxbmi >= 30, "4 obese", NA))))
Master4$obese <- Recode (Master4$bmicat, recodes= "'4 obese'=1; else=0 ", as.factor=T)
library(dplyr)
sub<-Master4%>%
select(bmicat, hlth_rsk, obese, cur_smk, agec,ridageyr, marst, educ, pov, pov2, race_eth, hispanic, black, white, asian, other, sex, wtint2yr, sdmvpsu, sdmvstra) %>%
filter( complete.cases(.))
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~sdmvpsu, strata=~sdmvstra, weights=~wtint2yr, data =sub, nest = TRUE )
#Logit model
fit.logit<-svyglm(obese~race_eth+educ+agec,
design= des,
family=binomial("logit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = obese ~ race_eth + educ + agec, design = des,
## family = binomial("logit"))
##
## Survey design:
## svydesign(ids = ~sdmvpsu, strata = ~sdmvstra, weights = ~wtint2yr,
## data = sub, nest = TRUE)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3877 0.1169 -3.316 0.0128 *
## race_eth2 hispanic 0.1421 0.1128 1.260 0.2479
## race_eth3 black 0.2655 0.1272 2.088 0.0752 .
## race_eth4 asian -1.2473 0.1455 -8.570 5.86e-05 ***
## race_eth5 other 0.1614 0.2938 0.550 0.5997
## educ2 hs 0.1556 0.1907 0.816 0.4413
## educ3 some col 0.1310 0.1592 0.822 0.4380
## educ4 col grad -0.2021 0.1566 -1.291 0.2378
## agec(65,80] -0.1535 0.1902 -0.807 0.4463
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9997445)
##
## Number of Fisher Scoring iterations: 4
Creating training and test data sets
set.seed(1115)
train<-createDataPartition(y = sub$obese,
p = .80,
list=F)
subtrain<-sub[train,]
subtest<-sub[-train,]
table(subtrain$obese)
##
## 0 1
## 1834 1184
glm1<-glm(obese~scale(ridageyr)+factor(hlth_rsk)+factor(cur_smk)+factor(marst)+factor(educ)+factor(pov2)+factor(race_eth)+factor(sex),
data=sub[,-1],
family = binomial("logit"), maxit=100)
summary(glm1)
##
## Call:
## glm(formula = obese ~ scale(ridageyr) + factor(hlth_rsk) + factor(cur_smk) +
## factor(marst) + factor(educ) + factor(pov2) + factor(race_eth) +
## factor(sex), family = binomial("logit"), data = sub[, -1],
## maxit = 100)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5711 -1.0298 -0.6207 1.2236 2.2884
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.232218 0.123595 -1.879 0.06026 .
## scale(ridageyr) -0.180785 0.042777 -4.226 2.38e-05 ***
## factor(hlth_rsk)1 0.671102 0.089932 7.462 8.50e-14 ***
## factor(cur_smk)1 -0.433215 0.093934 -4.612 3.99e-06 ***
## factor(marst)2 widowed 0.297410 0.148357 2.005 0.04500 *
## factor(marst)3 divorced 0.072404 0.116074 0.624 0.53277
## factor(marst)4 separated -0.042063 0.197842 -0.213 0.83163
## factor(marst)5 nm -0.230999 0.105485 -2.190 0.02853 *
## factor(marst)6 cohab -0.102741 0.126240 -0.814 0.41573
## factor(educ)2 hs -0.036471 0.112245 -0.325 0.74524
## factor(educ)3 some col 0.002225 0.108814 0.020 0.98369
## factor(educ)4 col grad -0.255766 0.121314 -2.108 0.03500 *
## factor(pov2)1 -0.032218 0.094030 -0.343 0.73187
## factor(race_eth)2 hispanic 0.063660 0.097655 0.652 0.51448
## factor(race_eth)3 black 0.280052 0.091172 3.072 0.00213 **
## factor(race_eth)4 asian -1.483749 0.140583 -10.554 < 2e-16 ***
## factor(race_eth)5 other 0.209509 0.158052 1.326 0.18498
## factor(sex)Male -0.146657 0.071749 -2.044 0.04095 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5052.9 on 3771 degrees of freedom
## Residual deviance: 4730.3 on 3754 degrees of freedom
## AIC: 4766.3
##
## Number of Fisher Scoring iterations: 4
ob_pred<- predict(glm1,
newdata = sub,
type = "response")
head(ob_pred)
## 1 2 3 4 5 6
## 0.4776552 0.1308544 0.4040427 0.1004473 0.2760832 0.2956076
ob_predcl<-factor(ifelse(ob_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=ob_pred, gr=ob_predcl, modcon=sub$obese)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of Obesity", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
ggtitle(label = "Probability of Obesity", subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table( ob_predcl,sub$obese)
##
## ob_predcl 0 1
## 0 2005 1070
## 1 287 410
cm1<-confusionMatrix(data = ob_predcl,
reference = sub$obese )
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2005 1070
## 1 287 410
##
## Accuracy : 0.6402
## 95% CI : (0.6247, 0.6556)
## No Information Rate : 0.6076
## P-Value [Acc > NIR] : 2.018e-05
##
## Kappa : 0.1675
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8748
## Specificity : 0.2770
## Pos Pred Value : 0.6520
## Neg Pred Value : 0.5882
## Prevalence : 0.6076
## Detection Rate : 0.5315
## Detection Prevalence : 0.8152
## Balanced Accuracy : 0.5759
##
## 'Positive' Class : 0
##
ob_predcl<-factor(ifelse(ob_pred>mean(I(sub$obese==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=ob_pred, gr=ob_predcl, modcon=sub$obese)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of Obesity", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(sub$obese==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot(aes(x=pr, fill=modcon))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of Obesity", subtitle = "Truth")+
geom_vline(xintercept=mean(I(sub$obese==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
confusionMatrix(data = ob_predcl,
sub$obese,
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1215 478
## 1 1077 1002
##
## Accuracy : 0.5878
## 95% CI : (0.5718, 0.6035)
## No Information Rate : 0.6076
## P-Value [Acc > NIR] : 0.994
##
## Kappa : 0.1933
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6770
## Specificity : 0.5301
## Pos Pred Value : 0.4820
## Neg Pred Value : 0.7177
## Prevalence : 0.3924
## Detection Rate : 0.2656
## Detection Prevalence : 0.5512
## Balanced Accuracy : 0.6036
##
## 'Positive' Class : 1
##
# Changing the rule threshold from 0.5 outcome decision to mean outcome decision resulted in a slightly lower percent correct classification, but also increased specificity by lowering sensitivity
pred_test<-predict(glm1,
newdata=sub,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(sub$obese==1)), 1, 0))
table(sub$obese,pred_cl)
## pred_cl
## 0 1
## 0 1215 1077
## 1 478 1002
confusionMatrix(data = pred_cl,sub$obese )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1215 478
## 1 1077 1002
##
## Accuracy : 0.5878
## 95% CI : (0.5718, 0.6035)
## No Information Rate : 0.6076
## P-Value [Acc > NIR] : 0.994
##
## Kappa : 0.1933
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5301
## Specificity : 0.6770
## Pos Pred Value : 0.7177
## Neg Pred Value : 0.4820
## Prevalence : 0.6076
## Detection Rate : 0.3221
## Detection Prevalence : 0.4488
## Balanced Accuracy : 0.6036
##
## 'Positive' Class : 0
##