About

This is the R notebook with detailed statistical output for the study:

Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, rms, haven, readxl, osfr)
theme_set(theme_bw())

Data

VES

Read

#data file from VES data paper
d = read_rds("data/VES_dataset.rds")
d_variables = read_csv("data/VES_dataset_variables.csv")
## Parsed with column specification:
## cols(
##   number = col_double(),
##   code = col_character(),
##   name = col_character()
## )
#other file
ncv = read_sav("data/nerve conduction_workfile_3.sav")
ncv_variables = df_var_table(ncv)

#occupation data
occupation_data = read_excel("data/Emil_VES occupation subset.xlsx")
## New names:
## * EDUC -> EDUC...19
## * EDUC -> EDUC...34
d$occupational_status = occupation_data$JULIEVOC * -1

Recode

d %<>% mutate(
  #gout
  # gout = MH05064B != 1,
  # diabetes = MH05064C != 1,
  
  #occupation status
  # occupation_status = OCCSTAT,
  
  #general health
  general_health = MH01009 %>% ordered(),
  general_health2 = HLTHSTAT %>% ordered()
)

#medical history diagnoses
#loop and add
mh_variables = d %>% select(MH05064A:MH05066T) %>% names()
for (v in mh_variables) {
  #find nice name
  v_name = d_variables$name[d_variables$code == v] %>% str_to_lower()
  
  #save with better name, recoded
  d[[v_name]] = d[[v]] != 1
}

  
#main variables
main_vars = c("gout", "g", "age", "BMI", "income", "education", "occupational_status")

#standardized
d_std = df_standardize(d, messages = F)

NHANES

Read

#NHANES 2017
#https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=2017
NHANES_2017_demo = read_xpt("data/NHANES2017/DEMO_J.XPT")
NHANES_2017_biopro = read_xpt("data/NHANES2017/BIOPRO_J.XPT")
NHANES_2017_mcq = read_xpt("data/NHANES2017/MCQ_J.XPT")

#merge
NHANES_2017 = full_join(NHANES_2017_demo, NHANES_2017_biopro, by = "SEQN") %>% 
  full_join(NHANES_2017_mcq, by = "SEQN")

Recode

NHANES_2017 %<>% 
  mutate(
    sex = RIAGENDR %>% mapvalues(c(1, 2), c("Male", "Female")),
    age = RIDAGEYR,
    age_z = age %>% standardize(),
    race = RIDRETH3 %>% mapvalues(c(1, 2, 3, 4, 6, 7), c("Mexican American", "Other Hispanic", "White", "Black", "Asian", "Other/mixed")) %>% fct_relevel("White"),
    gout = MCQ160N %>% mapvalues(c(1, 2, 9), c(T, F, NA)),
    uric_acid = LBDSUASI,
    uric_acid_z = uric_acid %>% standardize()
  )

Descriptives

#simple
d$race %>% table2()
d$gout %>% table2()
#a table
d %>% 
  select(!!main_vars) %>% 
  describe(check = F) %>% 
  as.data.frame() %>% 
  select(-vars, -se, -trimmed, -range)

Results

#correlation matrix
d %>% 
  select(!!main_vars) %>% 
  # map_df(as.numeric) %>% 
  #correlations
  mixedCor() ->
  # hetcor() ->
  main_cors
main_cors
## Call: mixedCor(data = .)
##                     gout  g     age   BMI   incom edctn occp_
## gout                 1.00                                    
## g                   -0.05  1.00                              
## age                  0.10  0.08  1.00                        
## BMI                  0.17 -0.05  0.06  1.00                  
## income               0.03  0.41  0.18  0.02  1.00            
## education           -0.06  0.55  0.16 -0.02  0.36  1.00      
## occupational_status -0.04  0.38  0.16  0.00  0.39  0.45  1.00
#boxplots
d_std %>% 
  filter(!is.na(gout)) %>% 
  select(AOP_ID, !!main_vars) %>% 
  gather(value = value, key = variable, g:occupational_status) %>% 
  ggplot(aes(variable, value, fill = gout)) +
  geom_boxplot() +
  scale_y_continuous("Value (z-score)") +
  scale_fill_discrete("Gout\ndiagnosis", labels = c("No", "Yes"))
## Warning: Removed 90 rows containing non-finite values (stat_boxplot).

GG_save("figs/boxplots.png")
## Warning: Removed 90 rows containing non-finite values (stat_boxplot).
#violin
d_std %>% 
  filter(!is.na(gout)) %>% 
  select(AOP_ID, !!main_vars) %>% 
  gather(value = value, key = variable, g:occupational_status) %>% 
  ggplot(aes(variable, value, fill = gout)) +
  geom_violin() +
  stat_summary(fun.y=mean,
               geom="point",
               shape=23,
               size=3,
               position = position_dodge(0.9)) +
  scale_y_continuous("Value (z-score)") +
  scale_fill_discrete("Gout\ndiagnosis", labels = c("No", "Yes"))
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 90 rows containing non-finite values (stat_ydensity).
## Warning: Removed 90 rows containing non-finite values (stat_summary).

GG_save("figs/violin.png")
## Warning: Removed 90 rows containing non-finite values (stat_ydensity).

## Warning: Removed 90 rows containing non-finite values (stat_summary).
#cohen's d
cohen.d(d %>% select(!!main_vars), "gout")
## Call: cohen.d(x = d %>% select(!!main_vars), group = "gout")
## Cohen d statistic of difference between two means
##                     lower effect upper
## g                   -0.35  -0.13  0.08
## age                  0.04   0.26  0.47
## BMI                  0.22   0.43  0.65
## income              -0.15   0.07  0.28
## education           -0.36  -0.14  0.07
## occupational_status -0.32  -0.10  0.11
## 
## Multivariate (Mahalanobis) distance between groups
## [1] 0.54
## r equivalent of difference between two means
##                   g                 age                 BMI              income 
##               -0.02                0.04                0.06                0.01 
##           education occupational_status 
##               -0.02               -0.01
#regressions
#income
list(
  ols(income ~ gout, data = d_std),
  ols(income ~ gout + age + g + BMI + race, data = d_std),
  ols(income ~ gout + age + g + BMI + race + gout*g, data = d_std)
) %>% summarize_models()
#education
list(
  ols(education ~ gout, data = d_std),
  ols(education ~ gout + age + g + BMI + race, data = d_std),
  ols(education ~ gout + age + g + BMI + race + gout*g, data = d_std)
) %>% summarize_models()
#occupation
list(
  ols(occupational_status ~ gout, data = d_std),
  ols(occupational_status ~ gout + age + g + BMI + race, data = d_std),
  ols(occupational_status ~ gout + age + g + BMI + race + gout*g, data = d_std)
) %>% summarize_models()
#medical history overlap
d %>% 
  select(arthritis:melioidosis) %>% 
  map_df(as.numeric) %>%
  #remove those without variance, there is one
  {
    .[map_lgl(., ~n_distinct(., na.rm = T) > 1)]
  } %>% 
  mixedCor() ->
  # hetcor() ->
  mh_cors
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
#plot
mh_cors$rho %>% 
  GG_heatmap(color_label = "Latent\ncorrelation",
             font_size = 1) +
  theme(axis.text.x = element_text(size = 7)) +
  ggtitle("Medical history heatmap")

GG_save("figs/medical_history_heatmap.png", width = 20)

#strongest relations
#make into long format
mh_cors$rho %>% 
  #remove values in upper and diag
  {
    diag(.) = NA
    .[upper.tri(.)] = NA
    .
  } %>% 
  reshape2::melt() %>% 
  filter(!is.na(value)) ->
  mh_cors_long

#inspect strongest correlations
mh_cors_long %>% 
  arrange(-abs(value))

Gout and uric acid

#models for uric acid
list(
  simple = ols(uric_acid_z ~ gout, data = NHANES_2017),
  demographics = ols(uric_acid_z ~ sex * rcs(age) + race, data = NHANES_2017),
  basic = ols(uric_acid_z ~ gout + sex * rcs(age) + race, data = NHANES_2017),
  basic_men = ols(uric_acid_z ~ gout + rcs(age) + race, data = NHANES_2017 %>% filter(sex == "Male")),
  basic_women = ols(uric_acid_z ~ gout + rcs(age) + race, data = NHANES_2017 %>% filter(sex == "Female")),
  whites = ols(uric_acid_z ~ gout + sex * rcs(age), data = NHANES_2017 %>% filter(race == "White")),
  white_men = ols(uric_acid_z ~ gout + rcs(age), data = NHANES_2017 %>% filter(sex == "Male", race == "White")),
  white_women = ols(uric_acid_z ~ gout + rcs(age), data = NHANES_2017 %>% filter(sex == "Female", race == "White"))
) %>% 
  summarize_models()

Meta

#data out
d %>% write_rds("data/data_out.rds", compress = "xz")

#versions
write_sessioninfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] osfr_0.2.8            readxl_1.3.1          haven_2.3.1          
##  [4] rms_6.0-1             SparseM_1.78          kirkegaard_2020-09-08
##  [7] metafor_2.4-0         Matrix_1.2-18         psych_2.0.8          
## [10] magrittr_1.5          assertthat_0.2.1      weights_1.0.1        
## [13] mice_3.11.0           gdata_2.18.0          Hmisc_4.4-1          
## [16] Formula_1.2-3         survival_3.1-12       lattice_0.20-41      
## [19] forcats_0.5.0         stringr_1.4.0         dplyr_1.0.2          
## [22] purrr_0.3.4           readr_1.3.1           tidyr_1.1.2          
## [25] tibble_3.0.3          ggplot2_3.3.2         tidyverse_1.3.0      
## [28] pacman_0.5.1         
## 
## loaded via a namespace (and not attached):
##  [1] TH.data_1.0-10      colorspace_1.4-1    ellipsis_0.3.1     
##  [4] htmlTable_2.0.1     base64enc_0.1-3     fs_1.5.0           
##  [7] rstudioapi_0.11     httpcode_0.3.0      farver_2.0.3       
## [10] MatrixModels_0.4-1  fansi_0.4.1         mvtnorm_1.1-1      
## [13] lubridate_1.7.9     xml2_1.3.2          codetools_0.2-16   
## [16] splines_4.0.2       mnormt_2.0.2        knitr_1.29         
## [19] jsonlite_1.7.1      broom_0.5.6         cluster_2.1.0      
## [22] dbplyr_1.4.4        png_0.1-7           compiler_4.0.2     
## [25] httr_1.4.2          backports_1.1.9     cli_2.0.2          
## [28] htmltools_0.5.0     quantreg_5.61       tools_4.0.2        
## [31] gtable_0.3.0        glue_1.4.2          reshape2_1.4.4     
## [34] Rcpp_1.0.5          cellranger_1.1.0    vctrs_0.3.4        
## [37] crul_1.0.0          nlme_3.1-149        conquer_1.0.2      
## [40] xfun_0.16           rvest_0.3.6         lifecycle_0.2.0    
## [43] gtools_3.8.2        polspline_1.1.19    MASS_7.3-52        
## [46] zoo_1.8-8           scales_1.1.1        hms_0.5.3          
## [49] parallel_4.0.2      sandwich_2.5-1      RColorBrewer_1.1-2 
## [52] yaml_2.2.1          curl_4.3            memoise_1.1.0      
## [55] gridExtra_2.3       rpart_4.1-15        latticeExtra_0.6-29
## [58] stringi_1.4.6       checkmate_2.0.0     rlang_0.4.7        
## [61] pkgconfig_2.0.3     matrixStats_0.56.0  evaluate_0.14      
## [64] labeling_0.3        htmlwidgets_1.5.1   tidyselect_1.1.0   
## [67] plyr_1.8.6          R6_2.4.1            generics_0.0.2     
## [70] multcomp_1.4-13     DBI_1.1.0           pillar_1.4.6       
## [73] foreign_0.8-79      withr_2.2.0         nnet_7.3-14        
## [76] modelr_0.1.8        crayon_1.3.4        tmvnsim_1.0-2      
## [79] rmarkdown_2.3       jpeg_0.1-8.1        grid_4.0.2         
## [82] data.table_1.13.0   blob_1.2.1          reprex_0.3.0       
## [85] digest_0.6.25       munsell_0.5.0
#upload to OSF
#auth
osf_auth(read_lines("~/.config/osf_token"))
## Registered PAT from the provided token
#project
osf_proj = osf_retrieve_node("https://osf.io/k57ca/")

#upload all files in project
#overwrite existing (versioning)
osf_upload(osf_proj, path = list.files(), conflicts = "overwrite", recurse = T)
## Searching for conflicting files on OSF
## Updating 12 existing file(s) on OSF
## Uploading 66 new file(s) to OSF