Guided Project

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(dbplyr)

## 
## Attaching package: 'dbplyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql

library(knitr)
library(kableExtra)

## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 2 > 1' in coercion to 'logical(1)'

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(cowplot)
library(survey)

## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart

proj_data <- readRDS("/cloud/project/proj_data.rds")

proj_data <- proj_data %>% filter(!is.na(WTMEC2YR))
dsn <- svydesign(ids=~ SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=T, data=proj_data)

# The distribution of the exposure variable: Beta-carotene (mcg).
p <- ggplot(proj_data, aes(DR1TBCAR))
p <- p + geom_histogram(bins=30,color = "#000000", fill = "#5191A9", alpha=0.6)+labs(x='Beta-carotene (mcg)')

p

## Warning: Removed 30982 rows containing non-finite values (`stat_bin()`).

c <- ggplot(proj_data, aes(DR1TKCAL))
c <- c + geom_histogram(bins=30,color = "#000000", fill = "#5191A9", alpha=0.6)+labs(x='Beta-carotene (mcg)')

proj_data <- proj_data %>% mutate(energy_logged = log(DR1TKCAL+ .001)) # add 0.001 for values that are 0

# since we are conducting linear regression model, non-linearity of the exposure variable violates the assumption of the linear regression model. one way to deal with this is by log transforming the variable.

proj_data <- proj_data %>% mutate(Beta_logged = log(DR1TBCAR+ .001)) # add 0.001 for values that are 0

plog <- ggplot(proj_data, aes(Beta_logged))
plog <- plog + geom_histogram(bins=30,color = "#000000", fill = "#5191A9", alpha=0.6)+labs(x='Natural log transformed Beta-carotene (mcg)')

plog

## Warning: Removed 30982 rows containing non-finite values (`stat_bin()`).

I will adjust the model for age and sex, as well for known inflammatory states (obesity, diabetes), I will also control for caloric intake (confounder of the level of beta Beta-carotene).

# diabetes variable 

proj_data <- proj_data %>% mutate(diabetic = ifelse(LBXGLU > 126, 1, 0))

dsn <- svydesign(ids=~ SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=T, data=proj_data)

# Running the model 
main_model <- svyglm(LBXCRP~ +RIAGENDR+RIDAGEYR+Beta_logged+BMXBMI+diabetic+ energy_logged ,dsn)

summary(main_model)

## 
## Call:
## svyglm(formula = LBXCRP ~ +RIAGENDR + RIDAGEYR + Beta_logged + 
##     BMXBMI + diabetic + energy_logged, design = dsn)
## 
## Survey design:
## svydesign(ids = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     nest = T, data = proj_data)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.3267353  0.1635362  -1.998 0.049555 *  
## RIAGENDR       0.1163656  0.0163101   7.135 6.65e-10 ***
## RIDAGEYR       0.0017279  0.0005266   3.281 0.001605 ** 
## Beta_logged   -0.0201028  0.0057815  -3.477 0.000869 ***
## BMXBMI         0.0287853  0.0014326  20.092  < 2e-16 ***
## diabetic       0.0631019  0.0424416   1.487 0.141496    
## energy_logged -0.0275649  0.0194573  -1.417 0.160947    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.6775547)
## 
## Number of Fisher Scoring iterations: 2

nutrient_1 <-  svyglm(LBXP1~ +RIAGENDR+RIDAGEYR+Beta_logged+BMXBMI+diabetic+ energy_logged ,dsn)
summary(nutrient_1 )

## 
## Call:
## svyglm(formula = LBXP1 ~ +RIAGENDR + RIDAGEYR + Beta_logged + 
##     BMXBMI + diabetic + energy_logged, design = dsn)
## 
## Survey design:
## svydesign(ids = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     nest = T, data = proj_data)
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.822606   0.818757  -1.005 0.318405    
## RIDAGEYR       0.063744   0.005128  12.431  < 2e-16 ***
## Beta_logged   -0.028638   0.020933  -1.368 0.175533    
## BMXBMI        -0.019250   0.004833  -3.983 0.000161 ***
## diabetic      -0.329536   0.088278  -3.733 0.000375 ***
## energy_logged -0.053862   0.102647  -0.525 0.601383    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 6.596179)
## 
## Number of Fisher Scoring iterations: 2

nutrient_1$call

## svyglm(formula = LBXP1 ~ +RIAGENDR + RIDAGEYR + Beta_logged + 
##     BMXBMI + diabetic + energy_logged, design = dsn)

main_model$call

## svyglm(formula = LBXCRP ~ +RIAGENDR + RIDAGEYR + Beta_logged + 
##     BMXBMI + diabetic + energy_logged, design = dsn)

Guided Project

Aya Awwad