R Code for Biostats Research Report

https://rpubs.com/ScottWeiner19/1065230

library(readr)
library(tidyverse)
library(dplyr)
library(broom)

Importing NHEFS dataset

nhefs <- read_csv("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv", 
     col_types = cols(qsmk = col_factor(levels = c("0", 
         "1")), death = col_factor(levels = c("0", 
         "1")), sex = col_factor(levels = c("0", 
         "1")), race = col_factor(levels = c("0", 
         "1")), marital = col_skip(), sbp= col_skip(), race= col_skip(),
         education = col_factor(levels = c("1", "2", "3", "4", "5")),
         dbp= col_skip(), school = col_skip(), ht= col_skip(), birthplace= col_skip(),
         asthma= col_skip(),bronch= col_skip(),tb= col_skip(),hf= col_skip(),hbp= col_skip(),pepticulcer= col_skip(),colitis= col_skip(),hepatitis= col_skip(),
         hayfever= col_skip(),diabetes= col_skip(),polio= col_skip(),tumor= col_skip(),
         nervousbreak= col_skip(),alcoholpy= col_skip(), alcoholtype= col_skip(),
         alcoholhowmuch= col_skip(), pica= col_skip(), otherpain= col_skip(),
         weakheart= col_skip(), allergies= col_skip(), nerves= col_skip(), 
         lackpep= col_skip(), hbpmed= col_skip(), boweltrouble= col_skip(), 
         infection= col_skip(), headache= col_skip(), birthcontrol= col_skip(), 
         pregnancies= col_skip(), cholesterol= col_skip(),
         hightax82= col_skip(), price71= col_skip(), price82= col_skip(),tax71= col_skip(),
         tax82= col_skip(), tax71_82= col_skip(), price71_82= col_skip(), 
        dadth= col_skip(), modth= col_skip(), active= col_skip()))

Recoding

nhefs2 <-  nhefs %>%

   mutate(qsmk = case_match(qsmk,
                          "0" ~ "FALSE",
                          "1" ~ "TRUE",
                          .ptype = factor(qsmk, levels=c("FALSE","TRUE")))) %>%
  

  mutate(exercise = case_match(exercise, 
                           0 ~ "Much Exercise",
                           1 ~ "Moderate Exercise", 
                           2 ~ "Little or No Exercise", 
                           .ptype = factor(exercise, levels = c("Moderate Exercise","Much Exercise", "Little or No Exercise"))))  %>%
 
  mutate(sex = case_match(sex,
                          "0" ~ "Male",
                          "1" ~ "Female",
                          .ptype = factor(sex, levels=c("Female","Male")))) %>%
  
   mutate(death = case_match(death,
                          "0" ~ "FALSE",
                          "1" ~ "TRUE",
                          .ptype = factor(death, levels=c("FALSE","TRUE"))))

Creating New Columns

nhefs_life <- nhefs2%>%
  mutate(birth_year=1971-age,
         year_of_death=yrdth+1900,
         lifespan=year_of_death-birth_year,
         lived_to_60 = factor(ifelse(lifespan >= 60, TRUE, FALSE)),
       lived_to_65 = factor(ifelse(lifespan >= 65, TRUE, FALSE)),
       cig_years = smokeintensity * smokeyrs)

DAG

library(dagitty)
 

dag1 <- dagitty('dag {
"Quitting Smoking * Smoking Intensity" [pos="-0.65,-0.439"]
"Smoking Intensity" [exposure,pos="-1.790,-0.219"]
"Quitting Smoking" [exposure,pos="-0.65,-0.914"]
"Lifespan" [outcome,pos="0.15,-0.207"]
Age [pos="-0.65,0.276"]
Weight [pos="0.15,0.987"]
"Sex/Race" [pos="-0.65,0.987"]
"Quitting Smoking * Smoking Intensity" -> "Lifespan"
"Quitting Smoking" -> "Quitting Smoking * Smoking Intensity"
"Smoking Intensity" -> "Quitting Smoking * Smoking Intensity"
Age -> "Smoking Intensity"
Age -> "Lifespan"
Weight -> "Lifespan"
"Sex/Race" -> "Smoking Intensity"
"Sex/Race" -> "Lifespan"
"Sex/Race" -> "Weight"

}')

plot(dag1)

Table 1

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

label(nhefs_life$age)       <- "Age"
label(nhefs_life$wt71) <- "Weight (KG)"
label(nhefs_life$exercise) <- "Exercise Level"
label(nhefs_life$smokeyrs) <- "Years of Smoking"
label(nhefs_life$smokeintensity) <- "Daily Cigarettes Smoked"
label(nhefs_life$qsmk) <- "Quit Smoking Between 1971 and 1982"
label(nhefs_life$death) <- "Died Between 1971 and 1992"



Epi_Table <- 
table1::table1(
  ~  age + wt71 + exercise+ smokeintensity + smokeyrs + qsmk+ death| sex,
  data = nhefs_life,
  overall= "Total",
)
Epi_Table

	Female (N=830)	Male (N=799)	Total (N=1629)
Age
Mean (SD)	43.3 (11.9)	44.6 (12.5)	43.9 (12.2)
Median [Min, Max]	43.0 [25.0, 74.0]	45.0 [25.0, 74.0]	44.0 [25.0, 74.0]
Weight (KG)
Mean (SD)	64.9 (14.7)	77.5 (14.1)	71.1 (15.7)
Median [Min, Max]	61.6 [36.2, 152]	76.2 [42.4, 169]	69.4 [36.2, 169]
Exercise Level
Moderate Exercise	349 (42.0%)	328 (41.1%)	677 (41.6%)
Much Exercise	115 (13.9%)	202 (25.3%)	317 (19.5%)
Little or No Exercise	366 (44.1%)	269 (33.7%)	635 (39.0%)
Daily Cigarettes Smoked
Mean (SD)	18.0 (10.6)	23.2 (12.4)	20.6 (11.8)
Median [Min, Max]	20.0 [1.00, 60.0]	20.0 [1.00, 80.0]	20.0 [1.00, 80.0]
Years of Smoking
Mean (SD)	22.5 (11.1)	27.3 (12.8)	24.9 (12.2)
Median [Min, Max]	22.0 [1.00, 64.0]	27.0 [1.00, 60.0]	24.0 [1.00, 64.0]
Quit Smoking Between 1971 and 1982
FALSE	639 (77.0%)	562 (70.3%)	1201 (73.7%)
TRUE	191 (23.0%)	237 (29.7%)	428 (26.3%)
Died Between 1971 and 1992
FALSE	707 (85.2%)	604 (75.6%)	1311 (80.5%)
TRUE	123 (14.8%)	195 (24.4%)	318 (19.5%)

# subset who have died 
nhefs_death <- nhefs_life %>% filter(death==TRUE)

library(table1)


label(nhefs_death$lifespan) <- "Lifespan"
label(nhefs_death$lived_to_65) <- "Lived to 65"
label(nhefs_death$lived_to_60) <- "Lived to 60"
label(nhefs_death$wt71) <- "Weight (KG)"
label(nhefs_death$qsmk) <- "Quit Smoking Between 1971 and 1982"



Epi_Table <- 
table1::table1(
  ~  lifespan + lived_to_60 + lived_to_65 + wt71 +qsmk | sex,
  data = nhefs_death,
  overall= "Total",
)
Epi_Table

	Female (N=123)	Male (N=195)	Total (N=318)
Lifespan
Mean (SD)	71.7 (10.8)	72.2 (10.5)	72.0 (10.6)
Median [Min, Max]	72.0 [45.0, 91.0]	73.0 [38.0, 93.0]	73.0 [38.0, 93.0]
Lived to 60
FALSE	18 (14.6%)	26 (13.3%)	44 (13.8%)
TRUE	105 (85.4%)	169 (86.7%)	274 (86.2%)
Lived to 65
FALSE	26 (21.1%)	43 (22.1%)	69 (21.7%)
TRUE	97 (78.9%)	152 (77.9%)	249 (78.3%)
Weight (KG)
Mean (SD)	67.1 (16.1)	76.0 (16.0)	72.6 (16.6)
Median [Min, Max]	64.4 [39.6, 127]	74.4 [42.4, 169]	71.4 [39.6, 169]
Quit Smoking Between 1971 and 1982
FALSE	89 (72.4%)	127 (65.1%)	216 (67.9%)
TRUE	34 (27.6%)	68 (34.9%)	102 (32.1%)

# model training subset 

#making subset with equal lived_to_60

set.seed(123)  # for reproducibility

nhefs_death_true <- nhefs_death[nhefs_death$lived_to_60 == TRUE, ]
nhefs_death_false <- nhefs_death[nhefs_death$lived_to_60 == FALSE, ]

# number of samples to take from each subset
n_samples <- min(nrow(nhefs_death_true), nrow(nhefs_death_false))

# Sample from each subset
sample_true <- nhefs_death_true[sample(nrow(nhefs_death_true), n_samples), ]
sample_false <- nhefs_death_false[sample(nrow(nhefs_death_false), n_samples), ]

# Combined samples
nhefs_death_s <- rbind(sample_true, sample_false)



library(table1)


label(nhefs_death_s$lifespan) <- "Lifespan"
label(nhefs_death_s$lived_to_60) <- "Lived to 60"
label(nhefs_death_s$wt71) <- "Weight(KG)"
label(nhefs_death_s$qsmk) <- "Quit Smoking Between 1971 and 1982"



Epi_Table <- 
table1::table1(
  ~  lifespan + lived_to_60 + wt71 +qsmk | sex,
  data = nhefs_death_s,
  overall= "Total",
)
Epi_Table

	Female (N=34)	Male (N=54)	Total (N=88)
Lifespan
Mean (SD)	65.1 (14.7)	64.3 (13.0)	64.6 (13.6)
Median [Min, Max]	59.0 [45.0, 91.0]	62.0 [38.0, 91.0]	60.0 [38.0, 91.0]
Lived to 60
FALSE	18 (52.9%)	26 (48.1%)	44 (50.0%)
TRUE	16 (47.1%)	28 (51.9%)	44 (50.0%)
Weight(KG)
Mean (SD)	67.9 (19.3)	78.7 (19.2)	74.5 (19.8)
Median [Min, Max]	65.1 [43.0, 122]	76.9 [47.0, 169]	73.8 [43.0, 169]
Quit Smoking Between 1971 and 1982
FALSE	28 (82.4%)	37 (68.5%)	65 (73.9%)
TRUE	6 (17.6%)	17 (31.5%)	23 (26.1%)

Exploratory Data Analysis

ggplot(nhefs_death, aes(x=smokeintensity, color=lived_to_60)) +
  geom_density()

ggplot(nhefs_death, aes(x=wt71, color=lived_to_60)) +
  geom_density()

ggplot(nhefs_death, aes(x=lifespan, color=qsmk)) +
  geom_density()

Lostistic Regression Models

#Defining model_1
model_1 <- glm(lived_to_60 ~ smokeintensity, data = nhefs_death_s, family = binomial())
summary(model_1)

## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.56694  -1.22235   0.05162   1.13313   1.70535  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     0.96715    0.44872   2.155   0.0311 *
## smokeintensity -0.04310    0.01754  -2.458   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 115.47  on 86  degrees of freedom
## AIC: 119.47
## 
## Number of Fisher Scoring iterations: 4

#model_1Adding fitted values back into nhefs_death
nhefs_death_s$predicted_1 <- fitted(model_1)


# model_1: Plotting data with logistic trend line
ggplot(nhefs_death_s, aes(x = smokeintensity, y = (as.numeric(lived_to_60)) - 1)) +
  geom_point(alpha = 0.5) +
  geom_jitter(width = 0.1, height = 0.1)+
  geom_line(aes(y= predicted_1), color = "red") +
  labs(x = "Smoke Intensity (Cigarettes/Day)", 
       y = "Probability of Living to 60", 
       title = "Smoke Intensity and Living to 60") +
      scale_y_continuous(breaks = seq(0, 1, by = 0.2))

########


#Defining model_2


model_2 <- glm(lived_to_60 ~ smokeintensity + wt71, data=nhefs_death_s, family=binomial())

#model_2 Adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_2 <- fitted(model_2)


summary(model_2)

## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + wt71, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.68390  -1.10882   0.01504   1.03713   1.74698  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     2.41695    1.04097   2.322  0.02024 * 
## smokeintensity -0.04671    0.01787  -2.614  0.00895 **
## wt71           -0.01835    0.01179  -1.556  0.11962   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 112.88  on 85  degrees of freedom
## AIC: 118.88
## 
## Number of Fisher Scoring iterations: 4

########


#Defining model_3


model_3 <- glm(lived_to_60 ~ smokeintensity + qsmk + wt71, data=nhefs_death_s, family=binomial())

#model_3 adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_3 <- fitted(model_3)



model_4 <- glm(lived_to_60 ~ smokeintensity * qsmk + wt71, data=nhefs_death_s, family=binomial())

#model_4 adding fitted values back into nhefs_death_s
nhefs_death_s$predicted_4 <- fitted(model_4)

########

summary(model_1)

## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.56694  -1.22235   0.05162   1.13313   1.70535  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     0.96715    0.44872   2.155   0.0311 *
## smokeintensity -0.04310    0.01754  -2.458   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 115.47  on 86  degrees of freedom
## AIC: 119.47
## 
## Number of Fisher Scoring iterations: 4

summary(model_2)

## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + wt71, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.68390  -1.10882   0.01504   1.03713   1.74698  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     2.41695    1.04097   2.322  0.02024 * 
## smokeintensity -0.04671    0.01787  -2.614  0.00895 **
## wt71           -0.01835    0.01179  -1.556  0.11962   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 112.88  on 85  degrees of freedom
## AIC: 118.88
## 
## Number of Fisher Scoring iterations: 4

summary(model_3)

## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity + qsmk + wt71, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.76937  -1.02657  -0.08342   1.04835   1.85629  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     1.83905    1.06527   1.726   0.0843 .
## smokeintensity -0.04717    0.01873  -2.518   0.0118 *
## qsmkTRUE        1.33835    0.56818   2.356   0.0185 *
## wt71           -0.01483    0.01185  -1.252   0.2107  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 106.79  on 84  degrees of freedom
## AIC: 114.79
## 
## Number of Fisher Scoring iterations: 4

summary(model_4)

## 
## Call:
## glm(formula = lived_to_60 ~ smokeintensity * qsmk + wt71, family = binomial(), 
##     data = nhefs_death_s)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7787  -1.0238  -0.1003   1.0573   1.8440  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)  
## (Intercept)              1.808730   1.088486   1.662   0.0966 .
## smokeintensity          -0.045568   0.022129  -2.059   0.0395 *
## qsmkTRUE                 1.470958   1.147166   1.282   0.1998  
## wt71                    -0.014893   0.011874  -1.254   0.2097  
## smokeintensity:qsmkTRUE -0.005483   0.040911  -0.134   0.8934  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.99  on 87  degrees of freedom
## Residual deviance: 106.77  on 83  degrees of freedom
## AIC: 116.77
## 
## Number of Fisher Scoring iterations: 4

#mod 3 suggest quitting by 1982 made it more likely to live to 60... but probably is also survivor bias (people who died young may not have been able to report that they quit, and had less time to quit)

Diagnostic and Model Performance/Comparison

library(yardstick)

## 
## Attaching package: 'yardstick'

## The following object is masked from 'package:readr':
## 
##     spec

# model_1 diagnostics  
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_1), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)

confusion_1 <- conf_mat(outcomes)

sumcon1 <- summary(confusion_1,event_level="second")
autoplot(confusion_1)

# model_2 diagnostics  
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_2), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)

confusion_2 <- conf_mat(outcomes)

sumcon2 <- summary(confusion_2,event_level="second")
autoplot(confusion_2)

# model_3 diagnostics  
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_3), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)

confusion_3 <- conf_mat(outcomes)

sumcon3 <- summary(confusion_3,event_level="second")
autoplot(confusion_3)

# model_4 diagnostics  
outcomes <- table("predicted values"=round(nhefs_death_s$predicted_4), "actual values"=as.numeric(nhefs_death_s$lived_to_60)-1)

confusion_4 <- conf_mat(outcomes)

sumcon4 <- summary(confusion_4,event_level="second")
autoplot(confusion_4)

mod1nostic <- sumcon1 %>% select(3) %>% slice (1,3,4) %>%
  rename("Model_1"= .estimate)
mod2nostic <- sumcon2 %>% select(3) %>% slice (1,3,4) %>%
  rename("Model_2"= .estimate)
mod3nostic <- sumcon3 %>% select(3) %>% slice (1,3,4) %>%
  rename("Model_3"= .estimate)
mod4nostic <- sumcon4 %>% select(3) %>% slice (1,3,4) %>%
  rename("Model_4"= .estimate)

combined_nostic <- t(data.frame(mod1nostic,mod2nostic,mod3nostic,mod4nostic))

colnames(combined_nostic) <- c("Accuracy","Sensitivity","Specificity")

data.frame(combined_nostic)

##          Accuracy Sensitivity Specificity
## Model_1 0.6022727   0.7272727   0.4772727
## Model_2 0.6363636   0.7045455   0.5681818
## Model_3 0.6704545   0.6363636   0.7045455
## Model_4 0.6590909   0.6136364   0.7045455

#model 3 wins!

Binned Residuals

library(arm)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: lme4

## 
## arm (Version 1.13-1, built: 2022-8-25)

## Working directory is /Users/scottweiner/Biostats Labs

binnedplot(model_4$fitted, model_1$residuals,main ="Model 1 Binned Residual")

binnedplot(model_4$fitted, model_2$residuals,main ="Model 2 Binned Residual")

binnedplot(model_4$fitted, model_3$residuals,main ="Model 3 Binned Residual")

binnedplot(model_4$fitted, model_4$residuals,main ="Model 4 Binned Residual")

Biostats: Research Report

Scott Weiner

2023-07-21

R Code for Biostats Research Report

https://rpubs.com/ScottWeiner19/1065230

Importing NHEFS dataset

Recoding

Creating New Columns

DAG

Table 1

Exploratory Data Analysis

Lostistic Regression Models

Diagnostic and Model Performance/Comparison

Binned Residuals