library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dbplyr)
##
## Attaching package: 'dbplyr'
##
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(knitr)
library(kableExtra)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 2 > 1' in coercion to 'logical(1)'
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(cowplot)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
proj_data <- readRDS("/cloud/project/proj_data.rds")
proj_data <- proj_data %>% filter(!is.na(WTMEC2YR))
dsn <- svydesign(ids=~ SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=T, data=proj_data)
# The distribution of the exposure variable: Beta-carotene (mcg).
p <- ggplot(proj_data, aes(DR1TBCAR))
p <- p + geom_histogram(bins=30,color = "#000000", fill = "#5191A9", alpha=0.6)+labs(x='Beta-carotene (mcg)')
p
## Warning: Removed 30982 rows containing non-finite values (`stat_bin()`).
c <- ggplot(proj_data, aes(DR1TKCAL))
c <- c + geom_histogram(bins=30,color = "#000000", fill = "#5191A9", alpha=0.6)+labs(x='Beta-carotene (mcg)')
proj_data <- proj_data %>% mutate(energy_logged = log(DR1TKCAL+ .001)) # add 0.001 for values that are 0
# since we are conducting linear regression model, non-linearity of the exposure variable violates the assumption of the linear regression model. one way to deal with this is by log transforming the variable.
proj_data <- proj_data %>% mutate(Beta_logged = log(DR1TBCAR+ .001)) # add 0.001 for values that are 0
plog <- ggplot(proj_data, aes(Beta_logged))
plog <- plog + geom_histogram(bins=30,color = "#000000", fill = "#5191A9", alpha=0.6)+labs(x='Natural log transformed Beta-carotene (mcg)')
plog
## Warning: Removed 30982 rows containing non-finite values (`stat_bin()`).
I will adjust the model for age and sex, as well for known inflammatory states (obesity, diabetes), I will also control for caloric intake (confounder of the level of beta Beta-carotene).
# diabetes variable
proj_data <- proj_data %>% mutate(diabetic = ifelse(LBXGLU > 126, 1, 0))
dsn <- svydesign(ids=~ SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=T, data=proj_data)
# Running the model
main_model <- svyglm(LBXCRP~ +RIAGENDR+RIDAGEYR+Beta_logged+BMXBMI+diabetic+ energy_logged ,dsn)
summary(main_model)
##
## Call:
## svyglm(formula = LBXCRP ~ +RIAGENDR + RIDAGEYR + Beta_logged +
## BMXBMI + diabetic + energy_logged, design = dsn)
##
## Survey design:
## svydesign(ids = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR,
## nest = T, data = proj_data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3267353 0.1635362 -1.998 0.049555 *
## RIAGENDR 0.1163656 0.0163101 7.135 6.65e-10 ***
## RIDAGEYR 0.0017279 0.0005266 3.281 0.001605 **
## Beta_logged -0.0201028 0.0057815 -3.477 0.000869 ***
## BMXBMI 0.0287853 0.0014326 20.092 < 2e-16 ***
## diabetic 0.0631019 0.0424416 1.487 0.141496
## energy_logged -0.0275649 0.0194573 -1.417 0.160947
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.6775547)
##
## Number of Fisher Scoring iterations: 2
nutrient_1 <- svyglm(LBXP1~ +RIAGENDR+RIDAGEYR+Beta_logged+BMXBMI+diabetic+ energy_logged ,dsn)
summary(nutrient_1 )
##
## Call:
## svyglm(formula = LBXP1 ~ +RIAGENDR + RIDAGEYR + Beta_logged +
## BMXBMI + diabetic + energy_logged, design = dsn)
##
## Survey design:
## svydesign(ids = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR,
## nest = T, data = proj_data)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.822606 0.818757 -1.005 0.318405
## RIDAGEYR 0.063744 0.005128 12.431 < 2e-16 ***
## Beta_logged -0.028638 0.020933 -1.368 0.175533
## BMXBMI -0.019250 0.004833 -3.983 0.000161 ***
## diabetic -0.329536 0.088278 -3.733 0.000375 ***
## energy_logged -0.053862 0.102647 -0.525 0.601383
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 6.596179)
##
## Number of Fisher Scoring iterations: 2
nutrient_1$call
## svyglm(formula = LBXP1 ~ +RIAGENDR + RIDAGEYR + Beta_logged +
## BMXBMI + diabetic + energy_logged, design = dsn)
main_model$call
## svyglm(formula = LBXCRP ~ +RIAGENDR + RIDAGEYR + Beta_logged +
## BMXBMI + diabetic + energy_logged, design = dsn)