R Code for Biostats Research Report

https://rpubs.com/ScottWeiner19/1065230

library(readr)
library(tidyverse)
library(dplyr)
library(broom)

Importing NHEFS dataset

nhefs <- read_csv("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv", 
     col_types = cols(qsmk = col_factor(levels = c("0", 
         "1")), death = col_factor(levels = c("0", 
         "1")), sex = col_factor(levels = c("0", 
         "1")), race = col_factor(levels = c("0", 
         "1")), marital = col_skip(), sbp= col_skip(), race= col_skip(),
         education = col_factor(levels = c("1", "2", "3", "4", "5")),
         dbp= col_skip(), school = col_skip(), ht= col_skip(), birthplace= col_skip(),
         asthma= col_skip(),bronch= col_skip(),tb= col_skip(),hf= col_skip(),hbp= col_skip(),pepticulcer= col_skip(),colitis= col_skip(),hepatitis= col_skip(),
         hayfever= col_skip(),diabetes= col_skip(),polio= col_skip(),tumor= col_skip(),
         nervousbreak= col_skip(),alcoholpy= col_skip(), alcoholtype= col_skip(),
         alcoholhowmuch= col_skip(), pica= col_skip(), otherpain= col_skip(),
         weakheart= col_skip(), allergies= col_skip(), nerves= col_skip(), 
         lackpep= col_skip(), hbpmed= col_skip(), boweltrouble= col_skip(), 
         infection= col_skip(), headache= col_skip(), birthcontrol= col_skip(), 
         pregnancies= col_skip(), cholesterol= col_skip(),
         hightax82= col_skip(), price71= col_skip(), price82= col_skip(),tax71= col_skip(),
         tax82= col_skip(), tax71_82= col_skip(), price71_82= col_skip(), 
        dadth= col_skip(), modth= col_skip(), active= col_skip()))

Recoding

nhefs2 <-  nhefs %>%

   mutate(qsmk = case_match(qsmk,
                          "0" ~ "FALSE",
                          "1" ~ "TRUE",
                          .ptype = factor(qsmk, levels=c("FALSE","TRUE")))) %>%
  

  mutate(exercise = case_match(exercise, 
                           0 ~ "Much Exercise",
                           1 ~ "Moderate Exercise", 
                           2 ~ "Little or No Exercise", 
                           .ptype = factor(exercise, levels = c("Moderate Exercise","Much Exercise", "Little or No Exercise"))))  %>%
 
  mutate(sex = case_match(sex,
                          "0" ~ "Male",
                          "1" ~ "Female",
                          .ptype = factor(sex, levels=c("Female","Male")))) %>%
  
   mutate(death = case_match(death,
                          "0" ~ "FALSE",
                          "1" ~ "TRUE",
                          .ptype = factor(death, levels=c("FALSE","TRUE"))))

Creating New Columns

nhefs_life <- nhefs2%>%
  mutate(birth_year=1971-age,
         year_of_death=yrdth+1900,
         lifespan=year_of_death-birth_year,
         lived_to_60 = factor(ifelse(lifespan >= 60, TRUE, FALSE)),
       lived_to_65 = factor(ifelse(lifespan >= 65, TRUE, FALSE)),
       cig_years = smokeintensity * smokeyrs)

DAG

library(dagitty)
 

dag1 <- dagitty('dag {
"Quitting Smoking * Smoking Intensity" [pos="-0.65,-0.439"]
"Smoking Intensity" [exposure,pos="-1.790,-0.219"]
"Quitting Smoking" [exposure,pos="-0.65,-0.914"]
"Lifespan" [outcome,pos="0.15,-0.207"]
Age [pos="-0.65,0.276"]
Weight [pos="0.15,0.987"]
"Sex/Race" [pos="-0.65,0.987"]
"Quitting Smoking * Smoking Intensity" -> "Lifespan"
"Quitting Smoking" -> "Quitting Smoking * Smoking Intensity"
"Smoking Intensity" -> "Quitting Smoking * Smoking Intensity"
Age -> "Smoking Intensity"
Age -> "Lifespan"
Weight -> "Lifespan"
"Sex/Race" -> "Smoking Intensity"
"Sex/Race" -> "Lifespan"
"Sex/Race" -> "Weight"

}')

plot(dag1)

Table 1

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
label(nhefs_life$age)       <- "Age"
label(nhefs_life$wt71) <- "Weight (KG)"
label(nhefs_life$exercise) <- "Exercise Level"
label(nhefs_life$smokeyrs) <- "Years of Smoking"
label(nhefs_life$smokeintensity) <- "Daily Cigarettes Smoked"
label(nhefs_life$qsmk) <- "Quit Smoking Between 1971 and 1982"
label(nhefs_life$death) <- "Died Between 1971 and 1992"



Epi_Table <- 
table1::table1(
  ~  age + wt71 + exercise+ smokeintensity + smokeyrs + qsmk+ death| sex,
  data = nhefs_life,
  overall= "Total",
)
Epi_Table
Female
(N=830)
Male
(N=799)
Total
(N=1629)
Age
Mean (SD) 43.3 (11.9) 44.6 (12.5) 43.9 (12.2)
Median [Min, Max] 43.0 [25.0, 74.0] 45.0 [25.0, 74.0] 44.0 [25.0, 74.0]
Weight (KG)
Mean (SD) 64.9 (14.7) 77.5 (14.1) 71.1 (15.7)
Median [Min, Max] 61.6 [36.2, 152] 76.2 [42.4, 169] 69.4 [36.2, 169]
Exercise Level
Moderate Exercise 349 (42.0%) 328 (41.1%) 677 (41.6%)
Much Exercise 115 (13.9%) 202 (25.3%) 317 (19.5%)
Little or No Exercise 366 (44.1%) 269 (33.7%) 635 (39.0%)
Daily Cigarettes Smoked
Mean (SD) 18.0 (10.6) 23.2 (12.4) 20.6 (11.8)
Median [Min, Max] 20.0 [1.00, 60.0] 20.0 [1.00, 80.0] 20.0 [1.00, 80.0]
Years of Smoking
Mean (SD) 22.5 (11.1) 27.3 (12.8) 24.9 (12.2)
Median [Min, Max] 22.0 [1.00, 64.0] 27.0 [1.00, 60.0] 24.0 [1.00, 64.0]
Quit Smoking Between 1971 and 1982
FALSE 639 (77.0%) 562 (70.3%) 1201 (73.7%)
TRUE 191 (23.0%) 237 (29.7%) 428 (26.3%)
Died Between 1971 and 1992
FALSE 707 (85.2%) 604 (75.6%) 1311 (80.5%)
TRUE 123 (14.8%) 195 (24.4%) 318 (19.5%)
# subset who have died 
nhefs_death <- nhefs_life %>% filter(death==TRUE)

library(table1)


label(nhefs_death$lifespan) <- "Lifespan"
label(nhefs_death$lived_to_65) <- "Lived to 65"
label(nhefs_death$lived_to_60) <- "Lived to 60"
label(nhefs_death$wt71) <- "Weight (KG)"
label(nhefs_death$qsmk) <- "Quit Smoking Between 1971 and 1982"



Epi_Table <- 
table1::table1(
  ~  lifespan + lived_to_60 + lived_to_65 + wt71 +qsmk | sex,
  data = nhefs_death,
  overall= "Total",
)
Epi_Table
Female
(N=123)
Male
(N=195)
Total
(N=318)
Lifespan
Mean (SD) 71.7 (10.8) 72.2 (10.5) 72.0 (10.6)
Median [Min, Max] 72.0 [45.0, 91.0] 73.0 [38.0, 93.0] 73.0 [38.0, 93.0]
Lived to 60
FALSE 18 (14.6%) 26 (13.3%) 44 (13.8%)
TRUE 105 (85.4%) 169 (86.7%) 274 (86.2%)
Lived to 65
FALSE 26 (21.1%) 43 (22.1%) 69 (21.7%)
TRUE 97 (78.9%) 152 (77.9%) 249 (78.3%)
Weight (KG)
Mean (SD) 67.1 (16.1) 76.0 (16.0) 72.6 (16.6)
Median [Min, Max] 64.4 [39.6, 127] 74.4 [42.4, 169] 71.4 [39.6, 169]
Quit Smoking Between 1971 and 1982
FALSE 89 (72.4%) 127 (65.1%) 216 (67.9%)
TRUE 34 (27.6%) 68 (34.9%) 102 (32.1%)
# model training subset 

#making subset with equal lived_to_60

set.seed(123)  # for reproducibility

nhefs_death_true <- nhefs_death[nhefs_death$lived_to_60 == TRUE, ]
nhefs_death_false <- nhefs_death[nhefs_death$lived_to_60 == FALSE, ]

# number of samples to take from each subset
n_samples <- min(nrow(nhefs_death_true), nrow(nhefs_death_false))

# Sample from each subset
sample_true <- nhefs_death_true[sample(nrow(nhefs_death_true), n_samples), ]
sample_false <- nhefs_death_false[sample(nrow(nhefs_death_false), n_samples), ]

# Combined samples
nhefs_death_s <- rbind(sample_true, sample_false)



library(table1)


label(nhefs_death_s$lifespan) <- "Lifespan"
label(nhefs_death_s$lived_to_60) <- "Lived to 60"
label(nhefs_death_s$wt71) <- "Weight(KG)"
label(nhefs_death_s$qsmk) <- "Quit Smoking Between 1971 and 1982"



Epi_Table <- 
table1::table1(
  ~  lifespan + lived_to_60 + wt71 +qsmk | sex,
  data = nhefs_death_s,
  overall= "Total",
)
Epi_Table
Female
(N=34)
Male
(N=54)
Total
(N=88)
Lifespan
Mean (SD) 65.1 (14.7) 64.3 (13.0) 64.6 (13.6)
Median [Min, Max] 59.0 [45.0, 91.0] 62.0 [38.0, 91.0] 60.0 [38.0, 91.0]
Lived to 60
FALSE 18 (52.9%) 26 (48.1%) 44 (50.0%)
TRUE 16 (47.1%) 28 (51.9%) 44 (50.0%)
Weight(KG)
Mean (SD) 67.9 (19.3) 78.7 (19.2) 74.5 (19.8)
Median [Min, Max] 65.1 [43.0, 122] 76.9 [47.0, 169] 73.8 [43.0, 169]
Quit Smoking Between 1971 and 1982
FALSE 28 (82.4%) 37 (68.5%) 65 (73.9%)
TRUE 6 (17.6%) 17 (31.5%) 23 (26.1%)

Exploratory Data Analysis

ggplot(nhefs_death, aes(x=smokeintensity, color=lived_to_60)) +
  geom_density()

ggplot(nhefs_death, aes(x=wt71, color=lived_to_60)) +
  geom_density()

ggplot(nhefs_death, aes(x=lifespan, color=qsmk)) +
  geom_density()

Lostistic Regression Models

#Defining model_1
model_1 <- glm(lived_to_60 ~ smokeintensity, data = nhefs_death_s, family = binomial())
summary(model_1)
## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.56694  -1.22235   0.05162   1.13313   1.70535  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     0.96715    0.44872   2.155   0.0311 *
## smokeintensity -0.04310    0.01754  -2.458   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 115.47  on 86  degrees of freedom
## AIC: 119.47
## 
## Number of Fisher Scoring iterations: 4
#model_1Adding fitted values back into nhefs_death
nhefs_death_s$predicted_1 <- fitted(model_1)


# model_1: Plotting data with logistic trend line
ggplot(nhefs_death_s, aes(x = smokeintensity, y = (as.numeric(lived_to_60)) - 1)) +
  geom_point(alpha = 0.5) +
  geom_jitter(width = 0.1, height = 0.1)+
  geom_line(aes(y= predicted_1), color = "red") +
  labs(x = "Smoke Intensity (Cigarettes/Day)", 
       y = "Probability of Living to 60", 
       title = "Smoke Intensity and Living to 60") +
      scale_y_continuous(breaks = seq(0, 1, by = 0.2))

########


#Defining model_2


model_2 <- glm(lived_to_60 ~ smokeintensity + wt71, data=nhefs_death_s, family=binomial())

#model_2 Adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_2 <- fitted(model_2)


summary(model_2)
## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + wt71, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.68390  -1.10882   0.01504   1.03713   1.74698  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     2.41695    1.04097   2.322  0.02024 * 
## smokeintensity -0.04671    0.01787  -2.614  0.00895 **
## wt71           -0.01835    0.01179  -1.556  0.11962   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 112.88  on 85  degrees of freedom
## AIC: 118.88
## 
## Number of Fisher Scoring iterations: 4
########


#Defining model_3


model_3 <- glm(lived_to_60 ~ smokeintensity + qsmk + wt71, data=nhefs_death_s, family=binomial())

#model_3 adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_3 <- fitted(model_3)



model_4 <- glm(lived_to_60 ~ smokeintensity * qsmk + wt71, data=nhefs_death_s, family=binomial())

#model_4 adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_4 <- fitted(model_4)

########

summary(model_1)
## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.56694  -1.22235   0.05162   1.13313   1.70535  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     0.96715    0.44872   2.155   0.0311 *
## smokeintensity -0.04310    0.01754  -2.458   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 115.47  on 86  degrees of freedom
## AIC: 119.47
## 
## Number of Fisher Scoring iterations: 4
summary(model_2)
## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + wt71, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.68390  -1.10882   0.01504   1.03713   1.74698  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     2.41695    1.04097   2.322  0.02024 * 
## smokeintensity -0.04671    0.01787  -2.614  0.00895 **
## wt71           -0.01835    0.01179  -1.556  0.11962   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 112.88  on 85  degrees of freedom
## AIC: 118.88
## 
## Number of Fisher Scoring iterations: 4
summary(model_3)
## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + qsmk + wt71, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.76937  -1.02657  -0.08342   1.04835   1.85629  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     1.83905    1.06527   1.726   0.0843 .
## smokeintensity -0.04717    0.01873  -2.518   0.0118 *
## qsmkTRUE        1.33835    0.56818   2.356   0.0185 *
## wt71           -0.01483    0.01185  -1.252   0.2107  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 106.79  on 84  degrees of freedom
## AIC: 114.79
## 
## Number of Fisher Scoring iterations: 4
summary(model_4)
## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity * qsmk + wt71, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7787  -1.0238  -0.1003   1.0573   1.8440  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)  
## (Intercept)              1.808730   1.088486   1.662   0.0966 .
## smokeintensity          -0.045568   0.022129  -2.059   0.0395 *
## qsmkTRUE                 1.470958   1.147166   1.282   0.1998  
## wt71                    -0.014893   0.011874  -1.254   0.2097  
## smokeintensity:qsmkTRUE -0.005483   0.040911  -0.134   0.8934  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 106.77  on 83  degrees of freedom
## AIC: 116.77
## 
## Number of Fisher Scoring iterations: 4
#mod 3 suggest quitting by 1982 made it more likely to live to 60... but probably is also survivor bias (people who died young may not have been able to report that they quit, and had less time to quit)

Diagnostic and Model Performance/Comparison

library(yardstick)
## 
## Attaching package: 'yardstick'
## The following object is masked from 'package:readr':
## 
##     spec
# model_1 diagnostics  
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_1), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)

confusion_1 <- conf_mat(outcomes)

sumcon1 <- summary(confusion_1,event_level="second")
autoplot(confusion_1)

# model_2 diagnostics  
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_2), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)

confusion_2 <- conf_mat(outcomes)

sumcon2 <- summary(confusion_2,event_level="second")
autoplot(confusion_2)

# model_3 diagnostics  
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_3), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)

confusion_3 <- conf_mat(outcomes)

sumcon3 <- summary(confusion_3,event_level="second")
autoplot(confusion_3)

# model_4 diagnostics  
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_4), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)

confusion_4 <- conf_mat(outcomes)

sumcon4 <- summary(confusion_4,event_level="second")
autoplot(confusion_4)

mod1nostic <- sumcon1 %>% select(3) %>% slice (1,3,4) %>%
  rename("Model_1"= .estimate)
mod2nostic <- sumcon2 %>% select(3) %>% slice (1,3,4) %>%
  rename("Model_2"= .estimate)
mod3nostic <- sumcon3 %>% select(3) %>% slice (1,3,4) %>%
  rename("Model_3"= .estimate)
mod4nostic <- sumcon4 %>% select(3) %>% slice (1,3,4) %>%
  rename("Model_4"= .estimate)

combined_nostic <- t(data.frame(mod1nostic,mod2nostic,mod3nostic,mod4nostic))

colnames(combined_nostic) <- c("Accuracy","Sensitivity","Specificity")

data.frame(combined_nostic)
##          Accuracy Sensitivity Specificity
## Model_1 0.6022727   0.7272727   0.4772727
## Model_2 0.6363636   0.7045455   0.5681818
## Model_3 0.6704545   0.6363636   0.7045455
## Model_4 0.6590909   0.6136364   0.7045455
#model 3 wins!

Binned Residuals

library(arm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: lme4
## 
## arm (Version 1.13-1, built: 2022-8-25)
## Working directory is /Users/scottweiner/Biostats Labs
binnedplot(model_4$fitted, model_1$residuals,main ="Model 1 Binned Residual")

binnedplot(model_4$fitted, model_2$residuals,main ="Model 2 Binned Residual")

binnedplot(model_4$fitted, model_3$residuals,main ="Model 3 Binned Residual")

binnedplot(model_4$fitted, model_4$residuals,main ="Model 4 Binned Residual")