library(readr)
library(tidyverse)
library(dplyr)
library(broom)
nhefs <- read_csv("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv",
col_types = cols(qsmk = col_factor(levels = c("0",
"1")), death = col_factor(levels = c("0",
"1")), sex = col_factor(levels = c("0",
"1")), race = col_factor(levels = c("0",
"1")), marital = col_skip(), sbp= col_skip(), race= col_skip(),
education = col_factor(levels = c("1", "2", "3", "4", "5")),
dbp= col_skip(), school = col_skip(), ht= col_skip(), birthplace= col_skip(),
asthma= col_skip(),bronch= col_skip(),tb= col_skip(),hf= col_skip(),hbp= col_skip(),pepticulcer= col_skip(),colitis= col_skip(),hepatitis= col_skip(),
hayfever= col_skip(),diabetes= col_skip(),polio= col_skip(),tumor= col_skip(),
nervousbreak= col_skip(),alcoholpy= col_skip(), alcoholtype= col_skip(),
alcoholhowmuch= col_skip(), pica= col_skip(), otherpain= col_skip(),
weakheart= col_skip(), allergies= col_skip(), nerves= col_skip(),
lackpep= col_skip(), hbpmed= col_skip(), boweltrouble= col_skip(),
infection= col_skip(), headache= col_skip(), birthcontrol= col_skip(),
pregnancies= col_skip(), cholesterol= col_skip(),
hightax82= col_skip(), price71= col_skip(), price82= col_skip(),tax71= col_skip(),
tax82= col_skip(), tax71_82= col_skip(), price71_82= col_skip(),
dadth= col_skip(), modth= col_skip(), active= col_skip()))
nhefs2 <- nhefs %>%
mutate(qsmk = case_match(qsmk,
"0" ~ "FALSE",
"1" ~ "TRUE",
.ptype = factor(qsmk, levels=c("FALSE","TRUE")))) %>%
mutate(exercise = case_match(exercise,
0 ~ "Much Exercise",
1 ~ "Moderate Exercise",
2 ~ "Little or No Exercise",
.ptype = factor(exercise, levels = c("Moderate Exercise","Much Exercise", "Little or No Exercise")))) %>%
mutate(sex = case_match(sex,
"0" ~ "Male",
"1" ~ "Female",
.ptype = factor(sex, levels=c("Female","Male")))) %>%
mutate(death = case_match(death,
"0" ~ "FALSE",
"1" ~ "TRUE",
.ptype = factor(death, levels=c("FALSE","TRUE"))))
nhefs_life <- nhefs2%>%
mutate(birth_year=1971-age,
year_of_death=yrdth+1900,
lifespan=year_of_death-birth_year,
lived_to_60 = factor(ifelse(lifespan >= 60, TRUE, FALSE)),
lived_to_65 = factor(ifelse(lifespan >= 65, TRUE, FALSE)),
cig_years = smokeintensity * smokeyrs)
library(dagitty)
dag1 <- dagitty('dag {
"Quitting Smoking * Smoking Intensity" [pos="-0.65,-0.439"]
"Smoking Intensity" [exposure,pos="-1.790,-0.219"]
"Quitting Smoking" [exposure,pos="-0.65,-0.914"]
"Lifespan" [outcome,pos="0.15,-0.207"]
Age [pos="-0.65,0.276"]
Weight [pos="0.15,0.987"]
"Sex/Race" [pos="-0.65,0.987"]
"Quitting Smoking * Smoking Intensity" -> "Lifespan"
"Quitting Smoking" -> "Quitting Smoking * Smoking Intensity"
"Smoking Intensity" -> "Quitting Smoking * Smoking Intensity"
Age -> "Smoking Intensity"
Age -> "Lifespan"
Weight -> "Lifespan"
"Sex/Race" -> "Smoking Intensity"
"Sex/Race" -> "Lifespan"
"Sex/Race" -> "Weight"
}')
plot(dag1)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
label(nhefs_life$age) <- "Age"
label(nhefs_life$wt71) <- "Weight (KG)"
label(nhefs_life$exercise) <- "Exercise Level"
label(nhefs_life$smokeyrs) <- "Years of Smoking"
label(nhefs_life$smokeintensity) <- "Daily Cigarettes Smoked"
label(nhefs_life$qsmk) <- "Quit Smoking Between 1971 and 1982"
label(nhefs_life$death) <- "Died Between 1971 and 1992"
Epi_Table <-
table1::table1(
~ age + wt71 + exercise+ smokeintensity + smokeyrs + qsmk+ death| sex,
data = nhefs_life,
overall= "Total",
)
Epi_Table
| Female (N=830) |
Male (N=799) |
Total (N=1629) |
|
|---|---|---|---|
| Age | |||
| Mean (SD) | 43.3 (11.9) | 44.6 (12.5) | 43.9 (12.2) |
| Median [Min, Max] | 43.0 [25.0, 74.0] | 45.0 [25.0, 74.0] | 44.0 [25.0, 74.0] |
| Weight (KG) | |||
| Mean (SD) | 64.9 (14.7) | 77.5 (14.1) | 71.1 (15.7) |
| Median [Min, Max] | 61.6 [36.2, 152] | 76.2 [42.4, 169] | 69.4 [36.2, 169] |
| Exercise Level | |||
| Moderate Exercise | 349 (42.0%) | 328 (41.1%) | 677 (41.6%) |
| Much Exercise | 115 (13.9%) | 202 (25.3%) | 317 (19.5%) |
| Little or No Exercise | 366 (44.1%) | 269 (33.7%) | 635 (39.0%) |
| Daily Cigarettes Smoked | |||
| Mean (SD) | 18.0 (10.6) | 23.2 (12.4) | 20.6 (11.8) |
| Median [Min, Max] | 20.0 [1.00, 60.0] | 20.0 [1.00, 80.0] | 20.0 [1.00, 80.0] |
| Years of Smoking | |||
| Mean (SD) | 22.5 (11.1) | 27.3 (12.8) | 24.9 (12.2) |
| Median [Min, Max] | 22.0 [1.00, 64.0] | 27.0 [1.00, 60.0] | 24.0 [1.00, 64.0] |
| Quit Smoking Between 1971 and 1982 | |||
| FALSE | 639 (77.0%) | 562 (70.3%) | 1201 (73.7%) |
| TRUE | 191 (23.0%) | 237 (29.7%) | 428 (26.3%) |
| Died Between 1971 and 1992 | |||
| FALSE | 707 (85.2%) | 604 (75.6%) | 1311 (80.5%) |
| TRUE | 123 (14.8%) | 195 (24.4%) | 318 (19.5%) |
# subset who have died
nhefs_death <- nhefs_life %>% filter(death==TRUE)
library(table1)
label(nhefs_death$lifespan) <- "Lifespan"
label(nhefs_death$lived_to_65) <- "Lived to 65"
label(nhefs_death$lived_to_60) <- "Lived to 60"
label(nhefs_death$wt71) <- "Weight (KG)"
label(nhefs_death$qsmk) <- "Quit Smoking Between 1971 and 1982"
Epi_Table <-
table1::table1(
~ lifespan + lived_to_60 + lived_to_65 + wt71 +qsmk | sex,
data = nhefs_death,
overall= "Total",
)
Epi_Table
| Female (N=123) |
Male (N=195) |
Total (N=318) |
|
|---|---|---|---|
| Lifespan | |||
| Mean (SD) | 71.7 (10.8) | 72.2 (10.5) | 72.0 (10.6) |
| Median [Min, Max] | 72.0 [45.0, 91.0] | 73.0 [38.0, 93.0] | 73.0 [38.0, 93.0] |
| Lived to 60 | |||
| FALSE | 18 (14.6%) | 26 (13.3%) | 44 (13.8%) |
| TRUE | 105 (85.4%) | 169 (86.7%) | 274 (86.2%) |
| Lived to 65 | |||
| FALSE | 26 (21.1%) | 43 (22.1%) | 69 (21.7%) |
| TRUE | 97 (78.9%) | 152 (77.9%) | 249 (78.3%) |
| Weight (KG) | |||
| Mean (SD) | 67.1 (16.1) | 76.0 (16.0) | 72.6 (16.6) |
| Median [Min, Max] | 64.4 [39.6, 127] | 74.4 [42.4, 169] | 71.4 [39.6, 169] |
| Quit Smoking Between 1971 and 1982 | |||
| FALSE | 89 (72.4%) | 127 (65.1%) | 216 (67.9%) |
| TRUE | 34 (27.6%) | 68 (34.9%) | 102 (32.1%) |
# model training subset
#making subset with equal lived_to_60
set.seed(123) # for reproducibility
nhefs_death_true <- nhefs_death[nhefs_death$lived_to_60 == TRUE, ]
nhefs_death_false <- nhefs_death[nhefs_death$lived_to_60 == FALSE, ]
# number of samples to take from each subset
n_samples <- min(nrow(nhefs_death_true), nrow(nhefs_death_false))
# Sample from each subset
sample_true <- nhefs_death_true[sample(nrow(nhefs_death_true), n_samples), ]
sample_false <- nhefs_death_false[sample(nrow(nhefs_death_false), n_samples), ]
# Combined samples
nhefs_death_s <- rbind(sample_true, sample_false)
library(table1)
label(nhefs_death_s$lifespan) <- "Lifespan"
label(nhefs_death_s$lived_to_60) <- "Lived to 60"
label(nhefs_death_s$wt71) <- "Weight(KG)"
label(nhefs_death_s$qsmk) <- "Quit Smoking Between 1971 and 1982"
Epi_Table <-
table1::table1(
~ lifespan + lived_to_60 + wt71 +qsmk | sex,
data = nhefs_death_s,
overall= "Total",
)
Epi_Table
| Female (N=34) |
Male (N=54) |
Total (N=88) |
|
|---|---|---|---|
| Lifespan | |||
| Mean (SD) | 65.1 (14.7) | 64.3 (13.0) | 64.6 (13.6) |
| Median [Min, Max] | 59.0 [45.0, 91.0] | 62.0 [38.0, 91.0] | 60.0 [38.0, 91.0] |
| Lived to 60 | |||
| FALSE | 18 (52.9%) | 26 (48.1%) | 44 (50.0%) |
| TRUE | 16 (47.1%) | 28 (51.9%) | 44 (50.0%) |
| Weight(KG) | |||
| Mean (SD) | 67.9 (19.3) | 78.7 (19.2) | 74.5 (19.8) |
| Median [Min, Max] | 65.1 [43.0, 122] | 76.9 [47.0, 169] | 73.8 [43.0, 169] |
| Quit Smoking Between 1971 and 1982 | |||
| FALSE | 28 (82.4%) | 37 (68.5%) | 65 (73.9%) |
| TRUE | 6 (17.6%) | 17 (31.5%) | 23 (26.1%) |
ggplot(nhefs_death, aes(x=smokeintensity, color=lived_to_60)) +
geom_density()
ggplot(nhefs_death, aes(x=wt71, color=lived_to_60)) +
geom_density()
ggplot(nhefs_death, aes(x=lifespan, color=qsmk)) +
geom_density()
#Defining model_1
model_1 <- glm(lived_to_60 ~ smokeintensity, data = nhefs_death_s, family = binomial())
summary(model_1)
##
## Call:
## glm(formula = lived_to_60 ~ smokeintensity, family = binomial(),
## data = nhefs_death_s)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.56694 -1.22235 0.05162 1.13313 1.70535
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.96715 0.44872 2.155 0.0311 *
## smokeintensity -0.04310 0.01754 -2.458 0.0140 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 121.99 on 87 degrees of freedom
## Residual deviance: 115.47 on 86 degrees of freedom
## AIC: 119.47
##
## Number of Fisher Scoring iterations: 4
#model_1Adding fitted values back into nhefs_death
nhefs_death_s$predicted_1 <- fitted(model_1)
# model_1: Plotting data with logistic trend line
ggplot(nhefs_death_s, aes(x = smokeintensity, y = (as.numeric(lived_to_60)) - 1)) +
geom_point(alpha = 0.5) +
geom_jitter(width = 0.1, height = 0.1)+
geom_line(aes(y= predicted_1), color = "red") +
labs(x = "Smoke Intensity (Cigarettes/Day)",
y = "Probability of Living to 60",
title = "Smoke Intensity and Living to 60") +
scale_y_continuous(breaks = seq(0, 1, by = 0.2))
########
#Defining model_2
model_2 <- glm(lived_to_60 ~ smokeintensity + wt71, data=nhefs_death_s, family=binomial())
#model_2 Adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_2 <- fitted(model_2)
summary(model_2)
##
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + wt71, family = binomial(),
## data = nhefs_death_s)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.68390 -1.10882 0.01504 1.03713 1.74698
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.41695 1.04097 2.322 0.02024 *
## smokeintensity -0.04671 0.01787 -2.614 0.00895 **
## wt71 -0.01835 0.01179 -1.556 0.11962
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 121.99 on 87 degrees of freedom
## Residual deviance: 112.88 on 85 degrees of freedom
## AIC: 118.88
##
## Number of Fisher Scoring iterations: 4
########
#Defining model_3
model_3 <- glm(lived_to_60 ~ smokeintensity + qsmk + wt71, data=nhefs_death_s, family=binomial())
#model_3 adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_3 <- fitted(model_3)
model_4 <- glm(lived_to_60 ~ smokeintensity * qsmk + wt71, data=nhefs_death_s, family=binomial())
#model_4 adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_4 <- fitted(model_4)
########
summary(model_1)
##
## Call:
## glm(formula = lived_to_60 ~ smokeintensity, family = binomial(),
## data = nhefs_death_s)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.56694 -1.22235 0.05162 1.13313 1.70535
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.96715 0.44872 2.155 0.0311 *
## smokeintensity -0.04310 0.01754 -2.458 0.0140 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 121.99 on 87 degrees of freedom
## Residual deviance: 115.47 on 86 degrees of freedom
## AIC: 119.47
##
## Number of Fisher Scoring iterations: 4
summary(model_2)
##
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + wt71, family = binomial(),
## data = nhefs_death_s)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.68390 -1.10882 0.01504 1.03713 1.74698
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.41695 1.04097 2.322 0.02024 *
## smokeintensity -0.04671 0.01787 -2.614 0.00895 **
## wt71 -0.01835 0.01179 -1.556 0.11962
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 121.99 on 87 degrees of freedom
## Residual deviance: 112.88 on 85 degrees of freedom
## AIC: 118.88
##
## Number of Fisher Scoring iterations: 4
summary(model_3)
##
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + qsmk + wt71, family = binomial(),
## data = nhefs_death_s)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.76937 -1.02657 -0.08342 1.04835 1.85629
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.83905 1.06527 1.726 0.0843 .
## smokeintensity -0.04717 0.01873 -2.518 0.0118 *
## qsmkTRUE 1.33835 0.56818 2.356 0.0185 *
## wt71 -0.01483 0.01185 -1.252 0.2107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 121.99 on 87 degrees of freedom
## Residual deviance: 106.79 on 84 degrees of freedom
## AIC: 114.79
##
## Number of Fisher Scoring iterations: 4
summary(model_4)
##
## Call:
## glm(formula = lived_to_60 ~ smokeintensity * qsmk + wt71, family = binomial(),
## data = nhefs_death_s)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7787 -1.0238 -0.1003 1.0573 1.8440
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.808730 1.088486 1.662 0.0966 .
## smokeintensity -0.045568 0.022129 -2.059 0.0395 *
## qsmkTRUE 1.470958 1.147166 1.282 0.1998
## wt71 -0.014893 0.011874 -1.254 0.2097
## smokeintensity:qsmkTRUE -0.005483 0.040911 -0.134 0.8934
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 121.99 on 87 degrees of freedom
## Residual deviance: 106.77 on 83 degrees of freedom
## AIC: 116.77
##
## Number of Fisher Scoring iterations: 4
#mod 3 suggest quitting by 1982 made it more likely to live to 60... but probably is also survivor bias (people who died young may not have been able to report that they quit, and had less time to quit)
library(yardstick)
##
## Attaching package: 'yardstick'
## The following object is masked from 'package:readr':
##
## spec
# model_1 diagnostics
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_1), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)
confusion_1 <- conf_mat(outcomes)
sumcon1 <- summary(confusion_1,event_level="second")
autoplot(confusion_1)
# model_2 diagnostics
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_2), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)
confusion_2 <- conf_mat(outcomes)
sumcon2 <- summary(confusion_2,event_level="second")
autoplot(confusion_2)
# model_3 diagnostics
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_3), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)
confusion_3 <- conf_mat(outcomes)
sumcon3 <- summary(confusion_3,event_level="second")
autoplot(confusion_3)
# model_4 diagnostics
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_4), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)
confusion_4 <- conf_mat(outcomes)
sumcon4 <- summary(confusion_4,event_level="second")
autoplot(confusion_4)
mod1nostic <- sumcon1 %>% select(3) %>% slice (1,3,4) %>%
rename("Model_1"= .estimate)
mod2nostic <- sumcon2 %>% select(3) %>% slice (1,3,4) %>%
rename("Model_2"= .estimate)
mod3nostic <- sumcon3 %>% select(3) %>% slice (1,3,4) %>%
rename("Model_3"= .estimate)
mod4nostic <- sumcon4 %>% select(3) %>% slice (1,3,4) %>%
rename("Model_4"= .estimate)
combined_nostic <- t(data.frame(mod1nostic,mod2nostic,mod3nostic,mod4nostic))
colnames(combined_nostic) <- c("Accuracy","Sensitivity","Specificity")
data.frame(combined_nostic)
## Accuracy Sensitivity Specificity
## Model_1 0.6022727 0.7272727 0.4772727
## Model_2 0.6363636 0.7045455 0.5681818
## Model_3 0.6704545 0.6363636 0.7045455
## Model_4 0.6590909 0.6136364 0.7045455
#model 3 wins!
library(arm)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: lme4
##
## arm (Version 1.13-1, built: 2022-8-25)
## Working directory is /Users/scottweiner/Biostats Labs
binnedplot(model_4$fitted, model_1$residuals,main ="Model 1 Binned Residual")
binnedplot(model_4$fitted, model_2$residuals,main ="Model 2 Binned Residual")
binnedplot(model_4$fitted, model_3$residuals,main ="Model 3 Binned Residual")
binnedplot(model_4$fitted, model_4$residuals,main ="Model 4 Binned Residual")