1. Create a contingency table of exposed vs unexposed and outcome vs no outcome
# Number of participants
nrow(data)
## [1] 117
# Describe data
library(Hmisc)
Hmisc::describe(data)
## data
##
## 20 Variables 117 Observations
## ---------------------------------------------------------------------------
## duration (seconds)
## n missing distinct Info Mean Gmd .05 .10
## 117 0 75 1 430.2 700.9 55.6 57.0
## .25 .50 .75 .90 .95
## 67.0 88.0 118.0 176.0 240.2
##
## Value 50 100 150 200 300 350 700 2150 13100 23200
## Frequency 41 49 15 6 1 1 1 1 1 1
## Proportion 0.350 0.419 0.128 0.051 0.009 0.009 0.009 0.009 0.009 0.009
## ---------------------------------------------------------------------------
## q1
## n missing distinct Info Mean Gmd
## 117 0 4 0.589 1.803 0.4515
##
## Value 1 2 3 4
## Frequency 28 86 1 2
## Proportion 0.239 0.735 0.009 0.017
## ---------------------------------------------------------------------------
## q2
## n missing distinct Info Mean Gmd
## 117 0 5 0.933 2021 1.332
##
## Value 2019 2020 2021 2022 2023
## Frequency 1 38 25 25 28
## Proportion 0.009 0.325 0.214 0.214 0.239
## ---------------------------------------------------------------------------
## q3
## n missing distinct
## 117 0 15
##
## lowest : Biochemistry Biology Biomedical Engineering Chemistry Comparative Literature
## highest: Philosophy Political Science Psychology Sociology Studio Art
## ---------------------------------------------------------------------------
## stem1
## n missing distinct Info Sum Mean Gmd
## 117 0 2 0.169 110 0.9402 0.1135
##
## ---------------------------------------------------------------------------
## q5
## n missing distinct
## 30 87 18
##
## lowest : Anthropology Asian Studies Biology Chemistry Communications
## highest: Music Neuroscience Peace War and Defense Political science Psychology
## ---------------------------------------------------------------------------
## stem2
## n missing distinct Info Sum Mean Gmd
## 30 87 2 0.697 11 0.3667 0.4805
##
## ---------------------------------------------------------------------------
## q6
## n missing distinct Info Sum Mean Gmd
## 117 0 2 0.746 63 0.5385 0.5013
##
## ---------------------------------------------------------------------------
## q7
## n missing distinct Info Sum Mean Gmd
## 117 0 2 0.316 14 0.1197 0.2125
##
## ---------------------------------------------------------------------------
## q8
## n missing distinct Info Mean Gmd .05 .10
## 117 0 32 0.991 18.22 13 4 6
## .25 .50 .75 .90 .95
## 10 15 25 35 40
##
## lowest : 0 1 2 3 4, highest: 35 40 42 50 63
## ---------------------------------------------------------------------------
## q9
## n missing distinct Info Mean Gmd .05 .10
## 117 0 15 0.876 1.794 2.579 0.0 0.0
## .25 .50 .75 .90 .95
## 0.0 0.2 3.0 5.0 8.0
##
## Value 0.0 0.2 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0
## Frequency 58 2 4 13 1 4 1 8 1 6
## Proportion 0.496 0.017 0.034 0.111 0.009 0.034 0.009 0.068 0.009 0.051
##
## Value 5.0 6.0 8.0 10.0 15.0
## Frequency 10 2 3 3 1
## Proportion 0.085 0.017 0.026 0.026 0.009
## ---------------------------------------------------------------------------
## q10
## n missing distinct Info Mean Gmd .05 .10
## 117 0 12 0.945 2.406 2.752 0.0 0.0
## .25 .50 .75 .90 .95
## 0.0 2.0 4.0 6.0 6.2
##
## Value 0.0 0.5 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0
## Frequency 43 1 12 10 11 14 12 8 2 2
## Proportion 0.368 0.009 0.103 0.085 0.094 0.120 0.103 0.068 0.017 0.017
##
## Value 10.0 12.0
## Frequency 1 1
## Proportion 0.009 0.009
## ---------------------------------------------------------------------------
## q11
## n missing distinct Info Mean Gmd
## 117 0 5 0.931 3.385 1.286
##
## Value 1 2 3 4 5
## Frequency 7 23 25 42 20
## Proportion 0.060 0.197 0.214 0.359 0.171
## ---------------------------------------------------------------------------
## q12
## n missing distinct Info Mean Gmd
## 117 0 5 0.914 3.641 1.133
##
## Value 1 2 3 4 5
## Frequency 3 15 28 46 25
## Proportion 0.026 0.128 0.239 0.393 0.214
## ---------------------------------------------------------------------------
## q13
## n missing distinct Info Mean Gmd
## 117 0 4 0.904 3.65 1.022
##
## Value 2 3 4 5
## Frequency 15 33 47 22
## Proportion 0.128 0.282 0.402 0.188
## ---------------------------------------------------------------------------
## q14
## n missing distinct Info Mean Gmd
## 117 0 5 0.856 4.197 0.9425
##
## Value 1 2 3 4 5
## Frequency 1 7 13 43 53
## Proportion 0.009 0.060 0.111 0.368 0.453
## ---------------------------------------------------------------------------
## q15
## n missing distinct Info Mean Gmd
## 117 0 5 0.861 3.521 0.9304
##
## Value 1 2 3 4 5
## Frequency 1 16 32 57 11
## Proportion 0.009 0.137 0.274 0.487 0.094
## ---------------------------------------------------------------------------
## q15reversed
## n missing distinct Info Mean Gmd
## 117 0 5 0.861 2.479 0.9304
##
## Value 1 2 3 4 5
## Frequency 11 57 32 16 1
## Proportion 0.094 0.487 0.274 0.137 0.009
## ---------------------------------------------------------------------------
## q16
## n missing distinct Info Mean Gmd
## 117 0 5 0.934 2.513 1.549
##
## Value 1 2 3 4 5
## Frequency 35 37 10 20 15
## Proportion 0.299 0.316 0.085 0.171 0.128
## ---------------------------------------------------------------------------
## q17
## n missing distinct Info Mean Gmd
## 117 0 5 0.916 3.778 1.336
##
## Value 1 2 3 4 5
## Frequency 9 12 16 39 41
## Proportion 0.077 0.103 0.137 0.333 0.350
## ---------------------------------------------------------------------------
# Making dichotomous variables for exposure and outcome
library(dplyr)
# Exposure
# We will calculate the mean stress score for all participants using the aforementioned scale. Then, if a participant falls above one standard deviation of the mean among all participants, we will classify them as having “high stress” and they will be coded as a 1. If they fall below the mean, they will be categorized as “low stress” and coded as a 0.
# Two data points are missing for two individuals. I will use a random number generator to fill in these numbers
# Q11-Q17 (Q15 will be reversed)
# Making a total stress scale score
data = data %>% mutate(stress = q11 + q12 + q13 + q14 + q15reversed + q16 + q17)
# mean stress score
mean(data$stress)
## [1] 23.64103
# One standard deviation
sqrt(var(data$stress))
## [1] 4.756958
# Creating a stress factor variable based off of whether participants are above/below the mean + 1 SD
data$stress_f = ifelse(data$stress > (mean(data$stress)+sqrt(var(data$stress))) , 1, 0)
# stress_f table
table(data$stress_f)
##
## 0 1
## 98 19
# Outcome
# Our outcome of interest is binge drinking, or the consumption of 4 or more alcoholic beverages by a female and 5 or more by a male in a 2 hour time period.
# Based off of q1 and q10
data$binge_drinking = ifelse(data$q1 == 1 & data$q10 >= 5, 1, ifelse(data$q1 == 2 & data$q10 >= 4,1,0))
# binge_drinking table
table(data$binge_drinking)
##
## 0 1
## 81 36
# Contingency Table
disease_exp = sum(ifelse(data$stress_f == 1 & data$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data$stress_f == 1 & data$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data$stress_f == 0 & data$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data$stress_f == 0 & data$binge_drinking == 0, 1, 0), na.rm = T)
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 7 12
## Not Stressed 29 69
2. Prevalence/Prevalence Ratio/Prevalence Odds Ratio analysis
# Prevalence in exposed pop
prev_exp = table[1,1]/sum(table[1,])
prev_exp
## [1] 0.3684211
# Prevalence in unexposed pop
prev_unexp = table[2,1]/sum(table[2,])
prev_unexp
## [1] 0.2959184
# Prevalence in total cohort
prev_tot = sum(table[,1])/sum(table)
prev_tot
## [1] 0.3076923
# Prevalence ratio
prev_ratio = (prev_exp/ prev_unexp)
prev_ratio
## [1] 1.245009
exp(log(prev_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,1]))))
## [1] 0.5453937
exp(log(prev_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,1]))))
## [1] 2.842071
# confidence interval crosses the null value!
# Prevalence Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 1.387931
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.4964221
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 3.880473
# confidence interval crosses the null value!
a) Stem Majors
# subsetting data
data_stem = data %>% filter(stem1 == 1)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_stem$stress_f == 1 & data_stem$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_stem$stress_f == 1 & data_stem$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_stem$stress_f == 0 & data_stem$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_stem$stress_f == 0 & data_stem$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 7 11
## Not Stressed 25 67
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 1.705455
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.5949506
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 4.888768
# non-stem majors
data_nonstem = data %>% filter(stem1 == 0)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_nonstem$stress_f == 1 & data_nonstem$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_nonstem$stress_f == 1 & data_nonstem$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_nonstem$stress_f == 0 & data_nonstem$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_nonstem$stress_f == 0 & data_nonstem$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 0 1
## Not Stressed 4 2
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 0
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] NaN
# Because there are no non-stem majors that are academically stressed and binge drink, we are unable to do this analysis
# percent change with modeling approach
m_crude_or = glm(binge_drinking ~ stress_f, family = binomial("logit"), data = data)
m_crude_or_results = cbind(broom::tidy(m_crude_or), broom::confint_tidy(m_crude_or)) %>%
mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))
m_crude_or_results
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 0.4202899 0.2213040 -3.9168324 8.972008e-05 0.2684130
## 2 stress_f 1.3879310 0.5245626 0.6249286 5.320179e-01 0.4749929
## conf.high
## 1 0.6413797
## 2 3.8178005
m_adj_or = glm(binge_drinking ~ stress_f + stem1, family = binomial("logit"), data = data)
m_adj_or_results = cbind(broom::tidy(m_adj_or), broom::confint_tidy(m_adj_or)) %>%
mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))
m_adj_or_results
## term estimate std.error statistic p.value conf.low conf.high
## 1 (Intercept) 1.2707355 0.7683818 0.3118187 0.7551783 0.2773065 6.494869
## 2 stress_f 1.4131672 0.5297752 0.6527928 0.5138899 0.4788882 3.928208
## 3 stem1 0.3040961 0.7938509 -1.4995404 0.1337335 0.0569942 1.457051
# percent difference calculation
(abs((m_crude_or_results$estimate[2] - m_adj_or_results$estimate[2]))/m_crude_or_results$estimate[2])*100
## [1] 1.818261
# Less than 10%, therefore not a confounder.
b) Two Majors
# subsetting data
data_two_majors = data %>% filter(!is.na(data$stem2))
# inserting a two majors dichotomous variable into the original dataset
data$two_majors = ifelse(!is.na(data$stem2),1,0)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_two_majors$stress_f == 1 & data_two_majors$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_two_majors$stress_f == 1 & data_two_majors$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_two_majors$stress_f == 0 & data_two_majors$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_two_majors$stress_f == 0 & data_two_majors$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 1 5
## Not Stressed 7 17
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 0.4857143
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.04770927
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 4.944917
# one major
data_one_major = data %>% filter(is.na(data$stem2))
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_one_major$stress_f == 1 & data_one_major$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_one_major$stress_f == 1 & data_one_major$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_one_major$stress_f == 0 & data_one_major$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_one_major$stress_f == 0 & data_one_major$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 6 7
## Not Stressed 22 52
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 2.025974
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.6108322
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 6.719638
# Because the stratified estimates fall on either side of the crude estimate, having two majors serves as an effect measure modifier.
# percent change with modeling approach
m_adj_or = glm(binge_drinking ~ stress_f + two_majors, family = binomial("logit"), data = data)
m_adj_or_results = cbind(broom::tidy(m_adj_or), broom::confint_tidy(m_adj_or)) %>%
mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))
m_adj_or_results
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 0.4493323 0.2454759 -3.2589463 0.001118268 0.2732728
## 2 stress_f 1.4173617 0.5267888 0.6621196 0.507894560 0.4834509
## 3 two_majors 0.7514332 0.4744156 -0.6023683 0.546928974 0.2828000
## conf.high
## 1 0.7185785
## 2 3.9209564
## 3 1.8517556
# percent difference calculation
(abs((m_crude_or_results$estimate[2] - m_adj_or_results$estimate[2]))/m_crude_or_results$estimate[2])*100
## [1] 2.120473
# Less than 10%, therefore not a confounder.
c) Living on/off campus
# subsetting data
data_on_campus = data %>% filter(data$q6 == 1)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_on_campus$stress_f == 1 & data_on_campus$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_on_campus$stress_f == 1 & data_on_campus$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_on_campus$stress_f == 0 & data_on_campus$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_on_campus$stress_f == 0 & data_on_campus$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 2 9
## Not Stressed 13 39
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 0.6666667
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.1272936
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 3.491492
# live off campus
data_off_campus = data %>% filter(data$q6 == 0)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_off_campus$stress_f == 1 & data_off_campus$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_off_campus$stress_f == 1 & data_off_campus$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_off_campus$stress_f == 0 & data_off_campus$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_off_campus$stress_f == 0 & data_off_campus$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 5 3
## Not Stressed 16 30
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 3.125
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.660183
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 14.7923
# Because the stratified estimates fall on either side of the crude estimate, living on/off campus serves as an effect measure modifier.
# percent change with modeling approach
m_adj_or = glm(binge_drinking ~ stress_f + q6, family = binomial("logit"), data = data)
m_adj_or_results = cbind(broom::tidy(m_adj_or), broom::confint_tidy(m_adj_or)) %>%
mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))
m_adj_or_results
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 0.6011445 0.2918416 -1.7438228 0.08119000 0.3339018
## 2 stress_f 1.4527207 0.5333983 0.7001112 0.48385786 0.4899876
## 3 q6 0.4845195 0.4083236 -1.7745672 0.07596938 0.2143978
## conf.high
## 1 1.055898
## 2 4.079001
## 3 1.071045
# percent difference calculation
(abs((m_crude_or_results$estimate[2] - m_adj_or_results$estimate[2]))/m_crude_or_results$estimate[2])*100
## [1] 4.668073
# Less than 10%, therefore living on/off campus is not a confounder.
d) Involvement in Greek Life
# subsetting data
data_greek = data %>% filter(data$q7 == 1)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_greek$stress_f == 1 & data_greek$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_greek$stress_f == 1 & data_greek$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_greek$stress_f == 0 & data_greek$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_greek$stress_f == 0 & data_greek$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 1 3
## Not Stressed 6 4
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 0.2222222
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.0166239
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 2.970585
# not involved in greek life
data_non_greek = data %>% filter(data$q7 == 0)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_non_greek$stress_f == 1 & data_non_greek$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_non_greek$stress_f == 1 & data_non_greek$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_non_greek$stress_f == 0 & data_non_greek$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_non_greek$stress_f == 0 & data_non_greek$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 6 9
## Not Stressed 23 65
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 1.884058
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.6042431
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 5.87458
# Because the stratified estimates fall on either side of the crude estimate, being in/out of greek life serves as an effect measure modifier.
# percent change with modeling approach
m_adj_or = glm(binge_drinking ~ stress_f + q7, family = binomial("logit"), data = data)
m_adj_or_results = cbind(broom::tidy(m_adj_or), broom::confint_tidy(m_adj_or)) %>%
mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))
m_adj_or_results
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 0.3785166 0.2349692 -4.1345657 3.556265e-05 0.2346493
## 2 stress_f 1.2572868 0.5367035 0.4265969 6.696729e-01 0.4182929
## 3 q7 2.4748179 0.5824019 1.5559133 1.197287e-01 0.7756704
## conf.high
## 1 0.5918922
## 2 3.5265009
## 3 7.9081584
# percent difference calculation
(abs((m_crude_or_results$estimate[2] - m_adj_or_results$estimate[2]))/m_crude_or_results$estimate[2])*100
## [1] 9.412878
# Less than 10%, therefore involvement in greek life is not a confounder.
e) Year in School
# create an upper classman (junior/senior) (coded as 1)
# versus lower classman (freshman/sophomore) (coded as 0)
data$class = ifelse(data$q2 > 2021, 0, 1)
# upperclassman
data_upperclassman = data %>% filter(data$class == 1)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_upperclassman$stress_f == 1 & data_upperclassman$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_upperclassman$stress_f == 1 & data_upperclassman$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_upperclassman$stress_f == 0 & data_upperclassman$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_upperclassman$stress_f == 0 & data_upperclassman$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 4 6
## Not Stressed 14 40
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 1.904762
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.4678516
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 7.754847
# lowerclassman
data_lowerclassman = data %>% filter(data$class == 0)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_lowerclassman$stress_f == 1 & data_lowerclassman$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_lowerclassman$stress_f == 1 & data_lowerclassman$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_lowerclassman$stress_f == 0 & data_lowerclassman$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_lowerclassman$stress_f == 0 & data_lowerclassman$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 3 6
## Not Stressed 15 29
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 0.9666667
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.2114928
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 4.418328
# Because the stratified estimates fall on either side of the crude estimate, being an lower/upperclassman serves as an effect measure modifier.
# percent change with modeling approach
m_adj_or = glm(binge_drinking ~ stress_f + class, family = binomial("logit"), data = data)
m_adj_or_results = cbind(broom::tidy(m_adj_or), broom::confint_tidy(m_adj_or)) %>%
mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))
m_adj_or_results
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 0.4858342 0.3059936 -2.3591595 0.01831638 0.2606412
## 2 stress_f 1.3804848 0.5256871 0.6133587 0.53963919 0.4714151
## 3 class 0.7636566 0.4024577 -0.6699762 0.50287298 0.3452862
## conf.high
## 1 0.8721452
## 2 3.8055889
## 3 1.6847882
# percent difference calculation
(abs((m_crude_or_results$estimate[2] - m_adj_or_results$estimate[2]))/m_crude_or_results$estimate[2])*100
## [1] 0.5364971
# less than 10%, therefore being a lower/upper classman is not a confounder.
f) Hourse Studied per Week
# stratifying by median (16 hours)
# Those that study 16 or more hours are coded as 1
data$studyhours = ifelse(data$q8 >= 16, 1, 0)
# committed students
data_committed = data %>% filter(data$studyhours == 1)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_committed$stress_f == 1 & data_committed$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_committed$stress_f == 1 & data_committed$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_committed$stress_f == 0 & data_committed$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_committed$stress_f == 0 & data_committed$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 3 7
## Not Stressed 10 36
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 1.542857
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.3363558
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 7.077054
# non committed students
data_noncommitted = data %>% filter(data$studyhours == 0)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_noncommitted$stress_f == 1 & data_noncommitted$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_noncommitted$stress_f == 1 & data_noncommitted$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_noncommitted$stress_f == 0 & data_noncommitted$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_noncommitted$stress_f == 0 & data_noncommitted$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 4 5
## Not Stressed 19 33
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 1.389474
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.3322321
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 5.811109
# I'm not entirley sure how to interpret this one. Because the stratified estimates fall on either side of the crude estimate (one of which is very close), being a committed student could serve as both a confounder and an effect measure modifier.
# percent change with modeling approach
m_adj_or = glm(binge_drinking ~ stress_f + studyhours, family = binomial("logit"), data = data)
m_adj_or_results = cbind(broom::tidy(m_adj_or), broom::confint_tidy(m_adj_or)) %>%
mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))
m_adj_or_results
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 0.5713834 0.2776038 -2.0161646 0.04378277 0.3265159
## 2 stress_f 1.4590712 0.5332366 0.7085037 0.47863253 0.4923834
## 3 studyhours 0.4918705 0.4140616 -1.7136091 0.08660053 0.2138743
## conf.high
## 1 0.9755478
## 2 4.0968751
## 3 1.0940928
# percent difference calculation
(abs((m_crude_or_results$estimate[2] - m_adj_or_results$estimate[2]))/m_crude_or_results$estimate[2])*100
## [1] 5.12563
# less than 10%, therefore being a committed student is not a confounder.
g) Sex
# males
data_male = data %>% filter(data$q1 == 1)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_male$stress_f == 1 & data_male$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_male$stress_f == 1 & data_male$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_male$stress_f == 0 & data_male$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_male$stress_f == 0 & data_male$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 1 2
## Not Stressed 6 19
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 1.583333
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.1211823
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 20.68738
# females
data_female = data %>% filter(data$q1 == 2)
# Find the number of those that are exposed and do/do not have the disease
disease_exp = sum(ifelse(data_female$stress_f == 1 & data_female$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_exp = sum(ifelse(data_female$stress_f == 1 & data_female$binge_drinking == 0, 1, 0), na.rm = T)
disease_noexp = sum(ifelse(data_female$stress_f == 0 & data_female$binge_drinking == 1, 1, 0), na.rm = T)
nodisease_noexp = sum(ifelse(data_female$stress_f == 0 & data_female$binge_drinking == 0, 1, 0), na.rm = T)
# Create table variables
a = disease_exp; b = nodisease_exp
c = disease_noexp; d = nodisease_noexp
# Create table
table = matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = T)
dimnames(table) = list (c("Stressed", "Not Stressed"), c("Binge Drinking", "No Binge Brinking"))
table
## Binge Drinking No Binge Brinking
## Stressed 6 9
## Not Stressed 23 48
# Odds Ratio (cross product)
odds_ratio = (table[1,1]*table[2,2])/(table[2,1]*table[1,2])
odds_ratio
## [1] 1.391304
exp(log(odds_ratio)-(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 0.4421389
exp(log(odds_ratio)+(1.96*sqrt((1/table[1,1])+(1/table[2,2])+(1/table[2,1])+(1/table[1,2]))))
## [1] 4.378098
# Because the stratified estimates fall on either side of the crude estimate, sex serves as an effect measure modifier.
# percent change with modeling approach
# recoding sex as 0 and 1 (excluding those that who put 3 and 4)
data$sex = ifelse(data$q1 == 1, 0, ifelse(data$q1 == 2,1,NA))
m_adj_or = glm(binge_drinking ~ stress_f + sex, family = binomial("logit"), data = data)
m_adj_or_results = cbind(broom::tidy(m_adj_or), broom::confint_tidy(m_adj_or)) %>%
mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))
m_adj_or_results
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 0.320098 0.4419179 -2.5776920 0.009946261 0.1247787
## 2 stress_f 1.421096 0.5348450 0.6570665 0.511138134 0.4781331
## 3 sex 1.490975 0.4944038 0.8079035 0.419146139 0.5853140
## conf.high
## 1 0.7253392
## 2 4.0036940
## 3 4.1643951
# percent difference calculation
(abs((m_crude_or_results$estimate[2] - m_adj_or_results$estimate[2]))/m_crude_or_results$estimate[2])*100
## [1] 2.389561
# less than 10%, therefore sex is not a confounder.
4. Table One
library(tableone)
a = tableone::CreateTableOne(vars = colnames(data), data = data, factorVars = c("q1", "q2", "q3", "q5", "q6", "q7", "stem1", "studyhours", "binge_drinking", "stress_f", "class"))
print(summary(a))
##
## ### Summary of continuous variables ###
##
## strata: Overall
## n miss p.miss mean sd median p25 p75 min max
## duration (seconds) 117 0 0 430.2 2e+03 88.0 67 118 52 23202
## stem2 117 87 74 0.4 5e-01 0.0 0 1 0 1
## q8 117 0 0 18.2 1e+01 15.0 10 25 0 63
## q9 117 0 0 1.8 3e+00 0.2 0 3 0 15
## q10 117 0 0 2.4 3e+00 2.0 0 4 0 12
## q11 117 0 0 3.4 1e+00 4.0 2 4 1 5
## q12 117 0 0 3.6 1e+00 4.0 3 4 1 5
## q13 117 0 0 3.6 9e-01 4.0 3 4 2 5
## q14 117 0 0 4.2 9e-01 4.0 4 5 1 5
## q15 117 0 0 3.5 9e-01 4.0 3 4 1 5
## q15reversed 117 0 0 2.5 9e-01 2.0 2 3 1 5
## q16 117 0 0 2.5 1e+00 2.0 1 4 1 5
## q17 117 0 0 3.8 1e+00 4.0 3 5 1 5
## stress 117 0 0 23.6 5e+00 23.0 20 27 10 35
## two_majors 117 0 0 0.3 4e-01 0.0 0 1 0 1
## sex 117 3 3 0.8 4e-01 1.0 1 1 0 1
## skew kurt
## duration (seconds) 8.29 71.5
## stem2 0.58 -1.8
## q8 1.05 1.0
## q9 2.04 4.9
## q10 0.98 0.8
## q11 -0.36 -0.8
## q12 -0.50 -0.4
## q13 -0.22 -0.8
## q14 -1.14 0.9
## q15 -0.46 -0.3
## q15reversed 0.46 -0.3
## q16 0.54 -1.1
## q17 -0.87 -0.2
## stress -0.07 -0.1
## two_majors 1.13 -0.7
## sex -1.20 -0.6
##
## =======================================================================================
##
## ### Summary of categorical variables ###
##
## strata: Overall
## var n miss p.miss level freq percent
## q1 117 0 0.0 1 28 23.9
## 2 86 73.5
## 3 1 0.9
## 4 2 1.7
##
## q2 117 0 0.0 2019 1 0.9
## 2020 38 32.5
## 2021 25 21.4
## 2022 25 21.4
## 2023 28 23.9
##
## q3 117 0 0.0 Biochemistry 3 2.6
## Biology 93 79.5
## Biomedical Engineering 2 1.7
## Chemistry 2 1.7
## Comparative Literature 1 0.9
## Computer Science 1 0.9
## Excercise and Sport Science 2 1.7
## Neuroscience 4 3.4
## Nursing 1 0.9
## Peace, War, and Defense 1 0.9
## Philosophy 2 1.7
## Political Science 1 0.9
## Psychology 2 1.7
## Sociology 1 0.9
## Studio Art 1 0.9
##
## stem1 117 0 0.0 0 7 6.0
## 1 110 94.0
##
## q5 117 87 74.4 Anthropology 1 3.3
## Asian Studies 1 3.3
## Biology 2 6.7
## Chemistry 3 10.0
## Communications 1 3.3
## English 2 6.7
## Exercise and Sports Science 1 3.3
## German 4 13.3
## Global Studies 1 3.3
## Hispanic Linguistics 1 3.3
## History 3 10.0
## Mathematics 2 6.7
## Medical Anthropology 1 3.3
## Music 2 6.7
## Neuroscience 2 6.7
## Peace War and Defense 1 3.3
## Political science 1 3.3
## Psychology 1 3.3
##
## q6 117 0 0.0 0 54 46.2
## 1 63 53.8
##
## q7 117 0 0.0 0 103 88.0
## 1 14 12.0
##
## stress_f 117 0 0.0 0 98 83.8
## 1 19 16.2
##
## binge_drinking 117 0 0.0 0 81 69.2
## 1 36 30.8
##
## class 117 0 0.0 0 53 45.3
## 1 64 54.7
##
## studyhours 117 0 0.0 0 61 52.1
## 1 56 47.9
##
## cum.percent
## 23.9
## 97.4
## 98.3
## 100.0
##
## 0.9
## 33.3
## 54.7
## 76.1
## 100.0
##
## 2.6
## 82.1
## 83.8
## 85.5
## 86.3
## 87.2
## 88.9
## 92.3
## 93.2
## 94.0
## 95.7
## 96.6
## 98.3
## 99.1
## 100.0
##
## 6.0
## 100.0
##
## 3.3
## 6.7
## 13.3
## 23.3
## 26.7
## 33.3
## 36.7
## 50.0
## 53.3
## 56.7
## 66.7
## 73.3
## 76.7
## 83.3
## 90.0
## 93.3
## 96.7
## 100.0
##
## 46.2
## 100.0
##
## 88.0
## 100.0
##
## 83.8
## 100.0
##
## 69.2
## 100.0
##
## 45.3
## 100.0
##
## 52.1
## 100.0
##
## NULL
5. Crude versus adjusted model
library(broom)
library(purrr)
# Writing a function for model results
model_results = function(m){bind_cols(tidy(m), confint_tidy(m)) %>% filter(term == "stress_f") %>% mutate(estimate = exp(estimate), conf.low = exp(conf.low), conf.high = exp(conf.high))}
m1 = glm(binge_drinking ~ stress_f, family = binomial("logit"), data = data)
m2 = glm(binge_drinking ~ stress_f + q7, family = binomial("logit"), data = data)
# using dplyr and purrr functions
model_df = tibble(models = list(m1, m2),
model_name = c("Crude", "Adjusted for Greek Life"),
results = map(models, model_results),
estimate = map_dbl(results, "estimate"))
results_df = bind_rows(model_df$results) %>%
mutate(model_name = model_df$model_name)
# plotting all the model estimates and confidence intervals
ggplot(results_df, aes(model_name, color=std.error)) +
geom_point(aes(y=estimate)) +
geom_linerange(aes(ymin = conf.low, ymax=conf.high)) +
ylab("Estimate") + xlab("Models") + geom_hline(yintercept=1, linetype="dashed", color = "black") +
ggtitle("Crude and Adjusted Models")
