This is the R notebook with detailed statistical output for the study:
options(digits = 3)
library(pacman)
p_load(kirkegaard, rms, haven, readxl, osfr)
theme_set(theme_bw())
#data file from VES data paper
d = read_rds("data/VES_dataset.rds")
d_variables = read_csv("data/VES_dataset_variables.csv")
## Parsed with column specification:
## cols(
## number = col_double(),
## code = col_character(),
## name = col_character()
## )
#other file
ncv = read_sav("data/nerve conduction_workfile_3.sav")
ncv_variables = df_var_table(ncv)
#occupation data
occupation_data = read_excel("data/Emil_VES occupation subset.xlsx")
## New names:
## * EDUC -> EDUC...19
## * EDUC -> EDUC...34
d$occupational_status = occupation_data$JULIEVOC * -1
d %<>% mutate(
#gout
# gout = MH05064B != 1,
# diabetes = MH05064C != 1,
#occupation status
# occupation_status = OCCSTAT,
#general health
general_health = MH01009 %>% ordered(),
general_health2 = HLTHSTAT %>% ordered()
)
#medical history diagnoses
#loop and add
mh_variables = d %>% select(MH05064A:MH05066T) %>% names()
for (v in mh_variables) {
#find nice name
v_name = d_variables$name[d_variables$code == v] %>% str_to_lower()
#save with better name, recoded
d[[v_name]] = d[[v]] != 1
}
#main variables
main_vars = c("gout", "g", "age", "BMI", "income", "education", "occupational_status")
#standardized
d_std = df_standardize(d, messages = F)
#NHANES 2017
#https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=2017
NHANES_2017_demo = read_xpt("data/NHANES2017/DEMO_J.XPT")
NHANES_2017_biopro = read_xpt("data/NHANES2017/BIOPRO_J.XPT")
NHANES_2017_mcq = read_xpt("data/NHANES2017/MCQ_J.XPT")
#merge
NHANES_2017 = full_join(NHANES_2017_demo, NHANES_2017_biopro, by = "SEQN") %>%
full_join(NHANES_2017_mcq, by = "SEQN")
NHANES_2017 %<>%
mutate(
sex = RIAGENDR %>% mapvalues(c(1, 2), c("Male", "Female")),
age = RIDAGEYR,
age_z = age %>% standardize(),
race = RIDRETH3 %>% mapvalues(c(1, 2, 3, 4, 6, 7), c("Mexican American", "Other Hispanic", "White", "Black", "Asian", "Other/mixed")) %>% fct_relevel("White"),
gout = MCQ160N %>% mapvalues(c(1, 2, 9), c(T, F, NA)),
uric_acid = LBDSUASI,
uric_acid_z = uric_acid %>% standardize()
)
#simple
d$race %>% table2()
d$gout %>% table2()
#a table
d %>%
select(!!main_vars) %>%
describe(check = F) %>%
as.data.frame() %>%
select(-vars, -se, -trimmed, -range)
#correlation matrix
d %>%
select(!!main_vars) %>%
# map_df(as.numeric) %>%
#correlations
mixedCor() ->
# hetcor() ->
main_cors
main_cors
## Call: mixedCor(data = .)
## gout g age BMI incom edctn occp_
## gout 1.00
## g -0.05 1.00
## age 0.10 0.08 1.00
## BMI 0.17 -0.05 0.06 1.00
## income 0.03 0.41 0.18 0.02 1.00
## education -0.06 0.55 0.16 -0.02 0.36 1.00
## occupational_status -0.04 0.38 0.16 0.00 0.39 0.45 1.00
#boxplots
d_std %>%
filter(!is.na(gout)) %>%
select(AOP_ID, !!main_vars) %>%
gather(value = value, key = variable, g:occupational_status) %>%
ggplot(aes(variable, value, fill = gout)) +
geom_boxplot() +
scale_y_continuous("Value (z-score)") +
scale_fill_discrete("Gout\ndiagnosis", labels = c("No", "Yes"))
## Warning: Removed 90 rows containing non-finite values (stat_boxplot).
GG_save("figs/boxplots.png")
## Warning: Removed 90 rows containing non-finite values (stat_boxplot).
#violin
d_std %>%
filter(!is.na(gout)) %>%
select(AOP_ID, !!main_vars) %>%
gather(value = value, key = variable, g:occupational_status) %>%
ggplot(aes(variable, value, fill = gout)) +
geom_violin() +
stat_summary(fun.y=mean,
geom="point",
shape=23,
size=3,
position = position_dodge(0.9)) +
scale_y_continuous("Value (z-score)") +
scale_fill_discrete("Gout\ndiagnosis", labels = c("No", "Yes"))
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 90 rows containing non-finite values (stat_ydensity).
## Warning: Removed 90 rows containing non-finite values (stat_summary).
GG_save("figs/violin.png")
## Warning: Removed 90 rows containing non-finite values (stat_ydensity).
## Warning: Removed 90 rows containing non-finite values (stat_summary).
#cohen's d
cohen.d(d %>% select(!!main_vars), "gout")
## Call: cohen.d(x = d %>% select(!!main_vars), group = "gout")
## Cohen d statistic of difference between two means
## lower effect upper
## g -0.35 -0.13 0.08
## age 0.04 0.26 0.47
## BMI 0.22 0.43 0.65
## income -0.15 0.07 0.28
## education -0.36 -0.14 0.07
## occupational_status -0.32 -0.10 0.11
##
## Multivariate (Mahalanobis) distance between groups
## [1] 0.54
## r equivalent of difference between two means
## g age BMI income
## -0.02 0.04 0.06 0.01
## education occupational_status
## -0.02 -0.01
#regressions
#income
list(
ols(income ~ gout, data = d_std),
ols(income ~ gout + age + g + BMI + race, data = d_std),
ols(income ~ gout + age + g + BMI + race + gout*g, data = d_std)
) %>% summarize_models()
#education
list(
ols(education ~ gout, data = d_std),
ols(education ~ gout + age + g + BMI + race, data = d_std),
ols(education ~ gout + age + g + BMI + race + gout*g, data = d_std)
) %>% summarize_models()
#occupation
list(
ols(occupational_status ~ gout, data = d_std),
ols(occupational_status ~ gout + age + g + BMI + race, data = d_std),
ols(occupational_status ~ gout + age + g + BMI + race + gout*g, data = d_std)
) %>% summarize_models()
#medical history overlap
d %>%
select(arthritis:melioidosis) %>%
map_df(as.numeric) %>%
#remove those without variance, there is one
{
.[map_lgl(., ~n_distinct(., na.rm = T) > 1)]
} %>%
mixedCor() ->
# hetcor() ->
mh_cors
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
#plot
mh_cors$rho %>%
GG_heatmap(color_label = "Latent\ncorrelation",
font_size = 1) +
theme(axis.text.x = element_text(size = 7)) +
ggtitle("Medical history heatmap")
GG_save("figs/medical_history_heatmap.png", width = 20)
#strongest relations
#make into long format
mh_cors$rho %>%
#remove values in upper and diag
{
diag(.) = NA
.[upper.tri(.)] = NA
.
} %>%
reshape2::melt() %>%
filter(!is.na(value)) ->
mh_cors_long
#inspect strongest correlations
mh_cors_long %>%
arrange(-abs(value))
#models for uric acid
list(
simple = ols(uric_acid_z ~ gout, data = NHANES_2017),
demographics = ols(uric_acid_z ~ sex * rcs(age) + race, data = NHANES_2017),
basic = ols(uric_acid_z ~ gout + sex * rcs(age) + race, data = NHANES_2017),
basic_men = ols(uric_acid_z ~ gout + rcs(age) + race, data = NHANES_2017 %>% filter(sex == "Male")),
basic_women = ols(uric_acid_z ~ gout + rcs(age) + race, data = NHANES_2017 %>% filter(sex == "Female")),
whites = ols(uric_acid_z ~ gout + sex * rcs(age), data = NHANES_2017 %>% filter(race == "White")),
white_men = ols(uric_acid_z ~ gout + rcs(age), data = NHANES_2017 %>% filter(sex == "Male", race == "White")),
white_women = ols(uric_acid_z ~ gout + rcs(age), data = NHANES_2017 %>% filter(sex == "Female", race == "White"))
) %>%
summarize_models()
#data out
d %>% write_rds("data/data_out.rds", compress = "xz")
#versions
write_sessioninfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] osfr_0.2.8 readxl_1.3.1 haven_2.3.1
## [4] rms_6.0-1 SparseM_1.78 kirkegaard_2020-09-08
## [7] metafor_2.4-0 Matrix_1.2-18 psych_2.0.8
## [10] magrittr_1.5 assertthat_0.2.1 weights_1.0.1
## [13] mice_3.11.0 gdata_2.18.0 Hmisc_4.4-1
## [16] Formula_1.2-3 survival_3.1-12 lattice_0.20-41
## [19] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
## [22] purrr_0.3.4 readr_1.3.1 tidyr_1.1.2
## [25] tibble_3.0.3 ggplot2_3.3.2 tidyverse_1.3.0
## [28] pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 colorspace_1.4-1 ellipsis_0.3.1
## [4] htmlTable_2.0.1 base64enc_0.1-3 fs_1.5.0
## [7] rstudioapi_0.11 httpcode_0.3.0 farver_2.0.3
## [10] MatrixModels_0.4-1 fansi_0.4.1 mvtnorm_1.1-1
## [13] lubridate_1.7.9 xml2_1.3.2 codetools_0.2-16
## [16] splines_4.0.2 mnormt_2.0.2 knitr_1.29
## [19] jsonlite_1.7.1 broom_0.5.6 cluster_2.1.0
## [22] dbplyr_1.4.4 png_0.1-7 compiler_4.0.2
## [25] httr_1.4.2 backports_1.1.9 cli_2.0.2
## [28] htmltools_0.5.0 quantreg_5.61 tools_4.0.2
## [31] gtable_0.3.0 glue_1.4.2 reshape2_1.4.4
## [34] Rcpp_1.0.5 cellranger_1.1.0 vctrs_0.3.4
## [37] crul_1.0.0 nlme_3.1-149 conquer_1.0.2
## [40] xfun_0.16 rvest_0.3.6 lifecycle_0.2.0
## [43] gtools_3.8.2 polspline_1.1.19 MASS_7.3-52
## [46] zoo_1.8-8 scales_1.1.1 hms_0.5.3
## [49] parallel_4.0.2 sandwich_2.5-1 RColorBrewer_1.1-2
## [52] yaml_2.2.1 curl_4.3 memoise_1.1.0
## [55] gridExtra_2.3 rpart_4.1-15 latticeExtra_0.6-29
## [58] stringi_1.4.6 checkmate_2.0.0 rlang_0.4.7
## [61] pkgconfig_2.0.3 matrixStats_0.56.0 evaluate_0.14
## [64] labeling_0.3 htmlwidgets_1.5.1 tidyselect_1.1.0
## [67] plyr_1.8.6 R6_2.4.1 generics_0.0.2
## [70] multcomp_1.4-13 DBI_1.1.0 pillar_1.4.6
## [73] foreign_0.8-79 withr_2.2.0 nnet_7.3-14
## [76] modelr_0.1.8 crayon_1.3.4 tmvnsim_1.0-2
## [79] rmarkdown_2.3 jpeg_0.1-8.1 grid_4.0.2
## [82] data.table_1.13.0 blob_1.2.1 reprex_0.3.0
## [85] digest_0.6.25 munsell_0.5.0
#upload to OSF
#auth
osf_auth(read_lines("~/.config/osf_token"))
## Registered PAT from the provided token
#project
osf_proj = osf_retrieve_node("https://osf.io/k57ca/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(osf_proj, path = list.files(), conflicts = "overwrite", recurse = T)
## Searching for conflicting files on OSF
## Updating 12 existing file(s) on OSF
## Uploading 66 new file(s) to OSF