Libraries

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v tibble  3.0.3     v dplyr   1.0.3
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(patchwork)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(ggsn)
## Loading required package: grid
library(foreign)
library(survey)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
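Note the masking messages above: once car is attached, recode() refers to car::recode() rather than dplyr::recode(). The recoding below uses car's capitalized alias Recode(), which sidesteps the conflict.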

Importing NHANES data

DEMO_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DEMO_J.XPT")
ALQ_J  <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/ALQ_J.XPT")
BMX_J  <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/BMX_J.XPT")
DIQ_J  <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DIQ_J.XPT")
SMQ_J  <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/SMQ_J.XPT")
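These are the 2017-2018 ("_J") release files. If they are not already on disk, they can be pulled straight from the CDC site; a minimal sketch, assuming the current NHANES URL layout (the get_nhanes() helper is hypothetical, not part of any package):

# hypothetical helper: download one NHANES 2017-2018 file and read it
get_nhanes <- function(name) {
  # assumed URL pattern for the 2017-2018 cycle; check the NHANES site if it fails
  url <- paste0("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/", name, ".XPT")
  tmp <- tempfile(fileext = ".XPT")
  download.file(url, tmp, mode = "wb")  # binary mode matters on Windows
  read.xport(tmp)
}
DEMO_J <- get_nhanes("DEMO_J")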

Merging data into a master file

# the files share only the respondent ID SEQN, so merge() joins on it by default
Master1 <- merge(DEMO_J, ALQ_J, all.x = TRUE)
Master2 <- merge(Master1, BMX_J, all.x = TRUE)
Master3 <- merge(Master2, DIQ_J, all.x = TRUE)
Master4 <- merge(Master3, SMQ_J, all.x = TRUE)
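The chain of intermediate Master objects can also be collapsed into one fold over the file list; an equivalent sketch using Reduce(), with the join key made explicit:

Master4 <- Reduce(function(x, y) merge(x, y, by = "SEQN", all.x = TRUE),
                  list(DEMO_J, ALQ_J, BMX_J, DIQ_J, SMQ_J))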

Recoding, filtering, and cleaning up the data

nams<-names(Master4)
head(nams, n=10)
##  [1] "SEQN"     "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
##  [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGM"
# standardize names: strip underscores and lowercase everything
newnames <- tolower(gsub(pattern = "_", replacement = "", x = nams))
names(Master4) <- newnames
#sex
Master4$sex<-as.factor(ifelse(Master4$riagendr==1,
                                "Male",
                                "Female"))
#race/ethnicity
Master4$hispanic <- Recode(Master4$ridreth3, recodes = "1=1; 2=1; else=0")
Master4$black<-Recode(Master4$ridreth3, recodes="4=1; else=0")
Master4$white<-Recode(Master4$ridreth3, recodes="3=1; else=0")
Master4$asian<-Recode(Master4$ridreth3, recodes="6=1; else=0")
Master4$other<-Recode(Master4$ridreth3, recodes="7=1; else=0")

Master4$race_eth<- Recode(Master4$ridreth3, recodes= "3= '1 white'; 1:2='2 hispanic'; 4= '3 black'; 6= '4 asian'; 7= '5 other' ", as.factor=T )
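An equivalent way to build the same five-level factor is dplyr::case_when(), which keeps each condition on its own line; a sketch (case_when() is unaffected by the car masking noted above):

Master4$race_eth <- factor(dplyr::case_when(
  Master4$ridreth3 == 3         ~ "1 white",
  Master4$ridreth3 %in% c(1, 2) ~ "2 hispanic",
  Master4$ridreth3 == 4         ~ "3 black",
  Master4$ridreth3 == 6         ~ "4 asian",
  Master4$ridreth3 == 7         ~ "5 other"
))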

# at or below poverty threshold using income to poverty ratio
Master4$pov<-ifelse(Master4$indfmpir <= 1, "at or below poverty", 
                      ifelse(Master4$indfmpir > 1, "above poverty", NA))
Master4$pov2<- Recode(Master4$pov, recodes= "'at or below poverty'=1; 'above poverty'=0", as.factor=T)

# education
Master4$educ <- Recode(Master4$dmdeduc2, recodes="1:2='1 lths'; 3='2 hs'; 4='3 some col'; 5='4 col grad'; 7:9=NA; else=NA", as.factor=T)

# marital status
Master4$marst <- Recode(Master4$dmdmartl, recodes="1='1 married'; 2='2 widowed'; 3='3 divorced'; 4='4 separated'; 5='5 nm'; 6='6 cohab'; else=NA", as.factor=T)

# age: cut() yields (0,17], (17,65], (65,80]; NHANES top-codes RIDAGEYR at 80
Master4$agec <- cut(Master4$ridageyr,
                    breaks = c(0, 17, 65, 80))
# smoking status: SMQ040 is asked only of ever-smokers, so else=0 also
# assigns never-smokers (NA in smq040) to the non-current-smoker category
Master4$cur_smk <- Recode(Master4$smq040, recodes="1:2 = 1; else=0", as.factor=T)

# ever used e-cig
Master4$evr_ecig <- Recode(Master4$smq900, recodes="1= 1; 2=0; else=NA", as.factor=T)

# ever consumed alcohol
Master4$evr_alc <- Recode(Master4$alq111, recodes="1= 1; 2=0; else=NA", as.factor=T)

# diabetes
Master4$diabetes <- Recode(Master4$diq010, recodes="1= 1; 2:3=0; else=NA", as.factor=T)

# prediabetes
Master4$pre_dbts <- Recode(Master4$diq160, recodes="1= 1; 2=0; else=NA", as.factor=T)

# health risk for diabetes
Master4$hlth_rsk <- Recode(Master4$diq170, recodes="1= 1; 2=0; else=NA", as.factor=T)

# BMI
Master4$bmicat <- ifelse(Master4$bmxbmi < 18.5, "1 underweight",
                         ifelse(Master4$bmxbmi >=18.5 & Master4$bmxbmi < 25, "2 normal",
                                ifelse(Master4$bmxbmi >=25 & Master4$bmxbmi< 30, "3 overweight",
                                       ifelse(Master4$bmxbmi >= 30, "4 obese", NA))))

Master4$obese <- Recode(Master4$bmicat, recodes = "'4 obese'=1; else=0", as.factor=T)
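The nested ifelse() above can also be written as a single cut() call; a sketch that reproduces the same four categories (right = FALSE makes each interval closed on the left, matching the >= comparisons, and cut() returns a factor rather than a character vector):

Master4$bmicat <- cut(Master4$bmxbmi,
                      breaks = c(-Inf, 18.5, 25, 30, Inf),
                      right = FALSE,
                      labels = c("1 underweight", "2 normal",
                                 "3 overweight", "4 obese"))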

Analysis

library(dplyr)
# analytic subset: model variables plus the survey design variables, complete cases only
sub <- Master4 %>%
  select(bmicat, hlth_rsk, obese, cur_smk, agec, ridageyr, marst, educ, pov, pov2,
         race_eth, hispanic, black, white, asian, other, sex, wtint2yr, sdmvpsu, sdmvstra) %>%
  filter(complete.cases(.))

# first, declare the complex survey design: PSUs, strata, and interview weights
options(survey.lonely.psu = "adjust")  # allow variance estimation in strata with a single PSU
des <- svydesign(ids = ~sdmvpsu, strata = ~sdmvstra, weights = ~wtint2yr, data = sub, nest = TRUE)
#Logit model
fit.logit<-svyglm(obese~race_eth+educ+agec,
                  design= des,
                  family=binomial("logit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
## 
## Call:
## svyglm(formula = obese ~ race_eth + educ + agec, design = des, 
##     family = binomial("logit"))
## 
## Survey design:
## svydesign(ids = ~sdmvpsu, strata = ~sdmvstra, weights = ~wtint2yr, 
##     data = sub, nest = TRUE)
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.3877     0.1169  -3.316   0.0128 *  
## race_eth2 hispanic   0.1421     0.1128   1.260   0.2479    
## race_eth3 black      0.2655     0.1272   2.088   0.0752 .  
## race_eth4 asian     -1.2473     0.1455  -8.570 5.86e-05 ***
## race_eth5 other      0.1614     0.2938   0.550   0.5997    
## educ2 hs             0.1556     0.1907   0.816   0.4413    
## educ3 some col       0.1310     0.1592   0.822   0.4380    
## educ4 col grad      -0.2021     0.1566  -1.291   0.2378    
## agec(65,80]         -0.1535     0.1902  -0.807   0.4463    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9997445)
## 
## Number of Fisher Scoring iterations: 4
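The "non-integer #successes" warning is expected here: the survey weights turn the 0/1 outcome into non-integer weighted counts. The survey package's recommended fix is family = quasibinomial(), which returns the same point estimates and design-based standard errors without the warning; exponentiating the coefficients then gives odds ratios:

fit.logit2 <- svyglm(obese ~ race_eth + educ + agec,
                     design = des,
                     family = quasibinomial("logit"))
round(exp(coef(fit.logit2)), 3)  # odds ratios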

Creating training and test data sets

set.seed(1115)
# 80/20 split, stratified on the outcome so both sets keep the class balance
train <- createDataPartition(y = sub$obese,
                             p = .80,
                             list = FALSE)

subtrain <- sub[train, ]

subtest <- sub[-train, ]

table(subtrain$obese)
## 
##    0    1 
## 1834 1184
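Because createDataPartition() samples within the levels of y, the two sets should show nearly identical class shares; a quick check:

prop.table(table(subtrain$obese))  # share non-obese / obese in the training set
prop.table(table(subtest$obese))   # should be close to the training shares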

Logistic regression for classification

# note: glm1 is fit on the full analytic sample (sub), not subtrain; for a true
# held-out evaluation below, refit on subtrain. The [, -1] drops bmicat, which is
# harmless since all predictors are named in the formula.
glm1 <- glm(obese ~ scale(ridageyr) + factor(hlth_rsk) + factor(cur_smk) + factor(marst) + factor(educ) + factor(pov2) + factor(race_eth) + factor(sex),
            data = sub[, -1],
            family = binomial("logit"), maxit = 100)
summary(glm1)
## 
## Call:
## glm(formula = obese ~ scale(ridageyr) + factor(hlth_rsk) + factor(cur_smk) + 
##     factor(marst) + factor(educ) + factor(pov2) + factor(race_eth) + 
##     factor(sex), family = binomial("logit"), data = sub[, -1], 
##     maxit = 100)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5711  -1.0298  -0.6207   1.2236   2.2884  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -0.232218   0.123595  -1.879  0.06026 .  
## scale(ridageyr)            -0.180785   0.042777  -4.226 2.38e-05 ***
## factor(hlth_rsk)1           0.671102   0.089932   7.462 8.50e-14 ***
## factor(cur_smk)1           -0.433215   0.093934  -4.612 3.99e-06 ***
## factor(marst)2 widowed      0.297410   0.148357   2.005  0.04500 *  
## factor(marst)3 divorced     0.072404   0.116074   0.624  0.53277    
## factor(marst)4 separated   -0.042063   0.197842  -0.213  0.83163    
## factor(marst)5 nm          -0.230999   0.105485  -2.190  0.02853 *  
## factor(marst)6 cohab       -0.102741   0.126240  -0.814  0.41573    
## factor(educ)2 hs           -0.036471   0.112245  -0.325  0.74524    
## factor(educ)3 some col      0.002225   0.108814   0.020  0.98369    
## factor(educ)4 col grad     -0.255766   0.121314  -2.108  0.03500 *  
## factor(pov2)1              -0.032218   0.094030  -0.343  0.73187    
## factor(race_eth)2 hispanic  0.063660   0.097655   0.652  0.51448    
## factor(race_eth)3 black     0.280052   0.091172   3.072  0.00213 ** 
## factor(race_eth)4 asian    -1.483749   0.140583 -10.554  < 2e-16 ***
## factor(race_eth)5 other     0.209509   0.158052   1.326  0.18498    
## factor(sex)Male            -0.146657   0.071749  -2.044  0.04095 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5052.9  on 3771  degrees of freedom
## Residual deviance: 4730.3  on 3754  degrees of freedom
## AIC: 4766.3
## 
## Number of Fisher Scoring iterations: 4
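The coefficients are on the log-odds scale; exponentiating them (with Wald confidence intervals from confint.default()) makes the effect sizes easier to read:

round(exp(cbind(OR = coef(glm1), confint.default(glm1))), 3)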

In-sample prediction using a 0.5 decision rule

# predicted probabilities for the same data the model was fit on
ob_pred <- predict(glm1,
                   newdata = sub,
                   type = "response")

head(ob_pred)
##         1         2         3         4         5         6 
## 0.4776552 0.1308544 0.4040427 0.1004473 0.2760832 0.2956076
ob_predcl <- factor(ifelse(ob_pred > .5, 1, 0))  # classify as obese when p > 0.5

library(ggplot2)

# pr = predicted probability, gr = predicted class, modcon = observed outcome
pred1 <- data.frame(pr = ob_pred, gr = ob_predcl, modcon = sub$obese)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Obesity", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of Obesity", subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
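Since patchwork is already loaded, the two histograms can be drawn side by side for easier comparison; a small sketch rebuilding them as p1 and p2:

p1 <- ggplot(pred1) +
  geom_histogram(aes(x = pr, fill = gr), bins = 30) +
  labs(title = "Probability of Obesity", subtitle = "Threshold = .5") +
  geom_vline(xintercept = .5)
p2 <- ggplot(pred1) +
  geom_histogram(aes(x = pr, fill = modcon), bins = 30) +
  labs(title = "Probability of Obesity", subtitle = "Truth") +
  geom_vline(xintercept = .5)
p1 + p2  # patchwork lays the plots out side by side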

table(ob_predcl, sub$obese)
##          
## ob_predcl    0    1
##         0 2005 1070
##         1  287  410
cm1<-confusionMatrix(data = ob_predcl,
                     reference = sub$obese )
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2005 1070
##          1  287  410
##                                           
##                Accuracy : 0.6402          
##                  95% CI : (0.6247, 0.6556)
##     No Information Rate : 0.6076          
##     P-Value [Acc > NIR] : 2.018e-05       
##                                           
##                   Kappa : 0.1675          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8748          
##             Specificity : 0.2770          
##          Pos Pred Value : 0.6520          
##          Neg Pred Value : 0.5882          
##              Prevalence : 0.6076          
##          Detection Rate : 0.5315          
##    Detection Prevalence : 0.8152          
##       Balanced Accuracy : 0.5759          
##                                           
##        'Positive' Class : 0               
## 
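Note the 'Positive' Class : 0 line: confusionMatrix() treats the first factor level as the positive class by default, so the sensitivity and specificity above describe the non-obese class. The next section passes positive = "1" so the statistics refer to obesity instead.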

In-sample prediction using the outcome mean as the decision rule

ob_predcl <- factor(ifelse(ob_pred > mean(I(sub$obese == 1)), 1, 0))  # threshold = observed share obese (about 0.39)

pred2<-data.frame(pr=ob_pred, gr=ob_predcl, modcon=sub$obese)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Obesity", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(sub$obese==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Obesity", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(sub$obese==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data = ob_predcl,
                sub$obese,
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1215  478
##          1 1077 1002
##                                           
##                Accuracy : 0.5878          
##                  95% CI : (0.5718, 0.6035)
##     No Information Rate : 0.6076          
##     P-Value [Acc > NIR] : 0.994           
##                                           
##                   Kappa : 0.1933          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6770          
##             Specificity : 0.5301          
##          Pos Pred Value : 0.4820          
##          Neg Pred Value : 0.7177          
##              Prevalence : 0.3924          
##          Detection Rate : 0.2656          
##    Detection Prevalence : 0.5512          
##       Balanced Accuracy : 0.6036          
##                                           
##        'Positive' Class : 1               
## 
# Lowering the threshold from 0.5 to the outcome mean (~0.39) slightly reduced overall
# accuracy (0.640 to 0.588) but, for the obese class, raised sensitivity (0.28 to 0.68)
# at the cost of specificity (0.87 to 0.53)

Using the test data set

pred_test <- predict(glm1,
                     newdata = subtest,
                     type = "response")

# classify with the threshold estimated from the training data (share obese in subtrain)
pred_cl <- factor(ifelse(pred_test > mean(I(subtrain$obese == 1)), 1, 0))

table(subtest$obese, pred_cl)
confusionMatrix(data = pred_cl,
                reference = subtest$obese,
                positive = "1")
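Finally, rather than committing to a single cutoff, the classifier can be judged across all thresholds with an ROC curve. A minimal sketch, assuming the pROC package is installed (roc(), auc(), and coords() are pROC functions, not part of the packages loaded above):

library(pROC)
roc_test <- roc(response = subtest$obese,
                predictor = predict(glm1, newdata = subtest, type = "response"))
auc(roc_test)  # area under the ROC curve
coords(roc_test, "best", ret = c("threshold", "sensitivity", "specificity"))
plot(roc_test)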