CLARITE Example Analysis

Load Data

data_main_table_over18 = paste(params$data_folder, "MainTable_keepvar_over18.tsv", sep="")
data_main_table = paste(params$data_folder, "MainTable.csv", sep="")
data_var_description = paste(params$data_folder, "VarDescription.csv", sep="")
data_var_categories = paste(params$data_folder, "VarCat_nopf.txt", sep="")

Data of all samples with age >= 18

nhanes <- read.delim(data_main_table_over18, header=TRUE, sep="\t")

Variable Descriptions

descriptions <- read.csv(data_var_description)
descriptions_info <- unique(descriptions[,colnames(descriptions) %in% c("tab_desc","module","var","var_desc")])
head(descriptions_info)
#>                       tab_desc     module    var
#> 1      Hepatitis A, B, C and D laboratory LBXHBC
#> 2      Hepatitis A, B, C and D laboratory LBDHBG
#> 3      Hepatitis A, B, C and D laboratory LBDHCV
#> 4      Hepatitis A, B, C and D laboratory  LBDHD
#> 5 Hepatitis B Surface Antibody laboratory LBXHBS
#> 6                  Hepatitis A laboratory  LBXHA
#>                           var_desc
#> 1        Hepatitis B core antibody
#> 2      Hepatitis B surface antigen
#> 3 Hepatitis C antibody (confirmed)
#> 4           Hepatitis D (anti-HDV)
#> 5     Hepatitis B Surface Antibody
#> 6  Hepatitis A Antibody (Anti-HAV)

Survey Weights, as provided by NHANES

Survey weight information is used so that the results apply to the US civillian non-institutionalized population.

This includes:

SDMVPSU (Cluster ID)
SDMVSTRA (Nested Strata ID)
2-year weights
4-year weights

Different variables require different weights, as many of them were measured on a subset of the full dataset. For example:

WTINT is the survey weight for interview variables.
WTMEC is the survey weight for variables measured in the Mobile Exam Centers (a subset of interviewed samples)

2-year and 4-year weights are provided. It is important to adjust the weights when combining multiple cycles, by computing the weighted average. In this case 4-year weights (covering the first 2 cycles) are provided by NHANES and the replication weights (the 3rd and 4th cycles) were computed from the 2-year weights prior to loading them here.

# Get survey info
survey_design_discovery <- read.csv(paste(params$data_folder, "weights/weights_discovery.txt", sep=""), sep="\t")
colnames(survey_design_discovery)[1] <- "ID"
survey_design_discovery <- colfilter(survey_design_discovery, c("SDDSRVYR"), exclude=TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
head(survey_design_discovery)
#>   ID SDMVPSU SDMVSTRA  WTINT2YR  WTINT4YR WTMEC2YR  WTMEC4YR     WTDRD1
#> 1  1       1        5  9727.079  4291.490 10982.90  4456.207  14809.894
#> 2  2       3        1 26678.636 14203.336 28325.38 15336.200  26899.709
#> 3  3       2        7 43621.681 20123.764 46192.26 21258.467  34388.840
#> 4  4       1        2 10346.119  4582.132 10251.26  4562.389   7159.829
#> 5  5       2        8 91050.847 44161.868 99445.07 45985.968 127746.359
#> 6  6       2        2 36508.250 16850.978 39656.60 18337.313  87007.933
#>     WTDR4YR WTSBA2YR WTSBA4YR BMXWT BMIWT CVDWTIM MSXWTIME WTSVOC4Y
#> 1  6066.129       NA       NA  12.5     3      NA       NA       NA
#> 2 14921.934       NA       NA  75.4    NA      NA     9.25       NA
#> 3 15866.448       NA       NA  32.9    NA      NA       NA       NA
#> 4  3218.100       NA       NA  13.3    NA      NA       NA       NA
#> 5 58973.611 225514.7 62154.68  92.5    NA       2       NA       NA
#> 6 39979.156       NA       NA  59.2    NA       2       NA       NA
#>   WTSHM2YR  WTSHM4YR WTSTH2YR WTSTH4YR WTSPP4YR WTSPP2YR WTSPO4YR WTSPO2YR
#> 1       NA        NA       NA       NA       NA       NA       NA       NA
#> 2       NA        NA 81681.26 46818.92       NA       NA 51106.09 108769.5
#> 3       NA        NA       NA       NA 46902.15 114094.9       NA       NA
#> 4       NA        NA       NA       NA       NA       NA       NA       NA
#> 5 316235.3 138091.51       NA       NA       NA       NA       NA       NA
#> 6 137608.4  61352.78       NA       NA       NA       NA       NA       NA
#>    WTSCY4YR  WTTFA2YR WTSPH2YR WTSAU4YR WTSAF4YR  WTSAF2YR WTSCI4YR
#> 1        NA        NA       NA       NA       NA        NA       NA
#> 2  16401.68  60586.15       NA       NA 33073.27  60586.15       NA
#> 3        NA  46192.26 142734.1       NA 52434.23 121969.84       NA
#> 4        NA        NA       NA       NA       NA        NA       NA
#> 5 184523.27 234895.21       NA       NA 98468.81 234895.21       NA
#> 6        NA        NA       NA       NA       NA        NA       NA
#>   WTSPH4YR WTSCI2YR WTSVOC2Y WTSAU2YR WTUIO2YR
#> 1       NA       NA       NA       NA       NA
#> 2       NA       NA       NA       NA       NA
#> 3 69163.77       NA       NA       NA       NA
#> 4       NA       NA       NA       NA       NA
#> 5       NA       NA       NA       NA       NA
#> 6       NA       NA       NA       NA       NA

survey_design_replication <- read.csv(paste(params$data_folder, "weights/weights_replication_4yr.txt", sep=""), sep="\t")
colnames(survey_design_replication)[1] <- "ID" 
survey_design_replication <- colfilter(survey_design_replication, c("SDDSRVYR"), exclude=TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
head(survey_design_replication)
#>      ID SDMVPSU SDMVSTRA WTINT2YR  WTMEC2YR   WTS_FFQ BMXWT BMIWT CVDWTIM
#> 1 21005       2       39  2756.16  2912.391  2009.015 68.80    NA      NA
#> 2 21006       1       41  2711.07  2782.020  1919.082 27.60    NA       1
#> 3 21007       2       35 19882.09 20295.533 13317.776 23.95    NA       1
#> 4 21008       1       32  2799.75  2848.375        NA 35.00    NA      NA
#> 5 21009       2       31 48796.84 48865.864 33530.095 51.55    NA      NA
#> 6 21010       1       29 19799.68 21643.288 32028.706 33.95    NA      NA
#>   WTSVOC2Y WTSBMEL    WTDR2D  WTSB2YR    WTDRD1    WTSC2YR WTSAF2YR
#> 1       NA      NA  1210.364       NA  1444.934   9472.746   7042.1
#> 2       NA      NA  1156.183 7991.864  1380.252         NA   6540.5
#> 3       NA      NA  7663.560       NA 10655.971         NA       NA
#> 4       NA      NA  9345.278       NA  4639.677   9264.530       NA
#> 5 91449.62      NA 85131.235       NA 28499.297 138102.149       NA
#> 6 46633.43      NA 37648.061       NA 27656.507  68779.743       NA
#>    WTSA2YR WTSCI2YR WTSAU2YR WTAL2YR LBXDWT WTSOG2YR WTSC2YRA WTSPC2YR
#> 1       NA       NA       NA      NA     NA       NA       NA       NA
#> 2       NA       NA       NA      NA     NA       NA       NA       NA
#> 3 66423.54       NA       NA      NA     NA       NA       NA       NA
#> 4       NA       NA       NA      NA     NA       NA       NA       NA
#> 5       NA       NA 93277.11      NA     NA       NA       NA       NA
#> 6       NA       NA 48290.38      NA     NA       NA       NA       NA

# Get weight mapping data
var_weights <- read.csv(paste(params$data_folder, "weights/VarWeights.csv", sep=""), fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)
weights_discovery <- setNames(as.list(var_weights$discovery), as.list(var_weights$variable_name))
weights_replication <- setNames(as.list(var_weights$replication), as.list(var_weights$variable_name))

Survey Year

# Data found in the main table
main_table <- read.csv(data_main_table) 
colnames(main_table)[1] <- "ID" 
survey_year <- colfilter(main_table, c("ID", "SDDSRVYR")) 
#> [1] "Running..."
#> [1] "Finished in 0 secs"
nhanes <- merge_data(survey_year, nhanes, union = FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.233376 secs"

Define the phenotype and covariates

covariates <- c("female", "black", "mexican", "other_hispanic", "other_eth", "SES_LEVEL", "RIDAGEYR", "SDDSRVYR")
phenotype <- c("BMXBMI")

Initial cleanup and variable selection

Remove any samples missing the phenotype or a covariate

nhanes <- remove_incomplete_obs(nhanes, c(covariates, phenotype))
#> [1] "Running..."
#> [1] "3687 of 22624 rows removed due to NA values in the specified columns"
#> [1] "Finished in 0 secs"

Remove variables that aren’t appropriate for analysis

Physical fitness

These are measurements rather than proxies for environmental exposure

no_pf_vars <- c("CVDVOMAX","CVDESVO2","CVDS1HR","CVDS1SY","CVDS1DI","CVDS2HR","CVDS2SY", "CVDS2DI","CVDR1HR","CVDR1SY","CVDR1DI","CVDR2HR","CVDR2SY","CVDR2DI","physical_activity") 
nhanes <- colfilter(d = nhanes, cols = c(no_pf_vars), exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

Lipid Variables

These are likely correlated with BMI in some way


bmi_lipid_cols <- c("LBDHDD","LBDHDL","LBDLDL","LBXSTR","LBXTC","LBXTR")
nhanes <- colfilter(nhanes, cols=bmi_lipid_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

Indeterminate Variables

These variables don’t have clear meanings

others <- c("house_type","hepa","hepb", "house_age", "current_past_smoking")
nhanes <- colfilter(nhanes, cols=others, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

Recode missing values

# The values 7 and/or 9 in some variables need to be defined as 'NA'
nhanes$SMQ077[nhanes$SMQ077==7] <- NA
nhanes$DBD100[nhanes$DBD100==9] <- NA

Split the data into discovery and replication

# The values 7 and/or 9 in some variables need to be defined as 'NA'
nhanes_discovery <- nhanes[nhanes$SDDSRVYR==1|nhanes$SDDSRVYR==2,]
nhanes_replication <- nhanes[nhanes$SDDSRVYR==3|nhanes$SDDSRVYR==4,]

QC

Minimum of 200 non-NA values in each variable

Drop variables that have a small sample size

nhanes_discovery <- min_n(nhanes_discovery, skip=c(covariates, phenotype))
#> [1] "Running..."
#> [1] "302 of 946 columns removed due to < 200 non-NA observations (ignoring 10 columns)"
#> [1] "Finished in 0.034909 secs"
nhanes_replication <- min_n(nhanes_replication, skip=c(covariates, phenotype))
#> [1] "Running..."
#> [1] "225 of 946 columns removed due to < 200 non-NA observations (ignoring 10 columns)"
#> [1] "Finished in 0.049869 secs"

Categorize Variables

This is important, as different variable types must be processed in different ways. The number of unique values for each variable is a good heuristic for determining this. The default settings were used here, but different cutoffs can be specified.

# Discovery
discovery_bin <- get_binary(nhanes_discovery)
#> [1] "Running..."
#> [1] "Finished in 0.076795 secs"
discovery_cat <- get_categorical(nhanes_discovery)
#> [1] "Running..."
#> [1] "Finished in 0.11968 secs"
discovery_cont <- get_continuous(nhanes_discovery)
#> [1] "Running..."
#> [1] "Finished in 0.063829 secs"
discovery_check <- get_check(nhanes_discovery)
#> [1] "Running..."
#> [1] "Finished in 0.116658 secs"

# Replication
replication_bin <- get_binary(nhanes_replication)
#> [1] "Running..."
#> [1] "Finished in 0.07879 secs"
replication_cat <- get_categorical(nhanes_replication)
#> [1] "Running..."
#> [1] "Finished in 0.149597 secs"
replication_cont <- get_continuous(nhanes_replication)
#> [1] "Running..."
#> [1] "Finished in 0.072805 secs"
replication_check <- get_check(nhanes_replication)
#> [1] "Running..."
#> [1] "Finished in 0.149572 secs"

Checking Categorization

Distributions of variables may be plotted using CLARITE

multi_plot(discovery_check, file=paste(params$output_folder, "plots", sep="/"), type="hist-qq")

One variable needed correcting where the heuristic was not correct

replication_cat_to_cont <- c("L_GLUTAMINE_gm")
replication_cont <- merge_data(replication_cont, colfilter(replication_cat, cols = replication_cat_to_cont, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0.002957 secs"
#> [1] "Finished in 0.029917 secs"
replication_cat <- colfilter(replication_cat, cols = replication_cat_to_cont, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

After examining all of the uncategorized variables, they are all continuous

print(paste("Checked: ", paste(names(replication_check), collapse=", "), sep=" "))
#> [1] "Checked:  ID, LBXVCT, LBXV3A, URXUBE, LBXTO2, LBXPFDO, DRD350AQ, DRD350BQ, DRD350DQ, DRD350FQ, DRD350GQ, DRD370AQ, DRD370DQ, DRD370EQ, DRD370FQ, DRD370GQ, DRD370NQ, DRD370RQ, DRD370UQ, ALANINE_mg, ARGININE_mg, BETA_CAROTENE_mg, CAFFEINE_mg, CYSTINE_mg, HISTIDINE_mg, ISOLEUCINE_mg, LEUCINE_mg, LYSINE_mg, PHENYLALANINE_mg, PROLINE_mg, SERINE_mg, THREONINE_mg, TRYPTOPHAN_mg, TYROSINE_mg, VALINE_mg, LBXV2T, LBXV4T, LBXVDM, URXUTM, LBXPFBS"
replication_cont <- merge_data(replication_cont, replication_check, union = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0.030918 secs"

Types should match across discovery/replication

############################################
# Take note of differently-typed variables #
############################################
print(paste("Bin > Cat:", paste(intersect(names(discovery_bin), names(replication_cat)), collapse=", ")))
#> [1] "Bin > Cat: ID, BETA_CAROTENE_mcg, CALCIUM_Unknown, MAGNESIUM_Unknown"
print(paste("Bin > Cont:", paste(intersect(names(discovery_bin), names(replication_cont)), collapse=", ")))
#> [1] "Bin > Cont: ID, LBXPFDO"
print(paste("Cat > Bin:", paste(intersect(names(discovery_cat), names(replication_bin)), collapse=", ")))
#> [1] "Cat > Bin: ID, VITAMIN_B_12_Unknown"
print(paste("Cat > Cont:", paste(intersect(names(discovery_cat), names(replication_cont)), collapse=", ")))
#> [1] "Cat > Cont: ID, DRD350AQ, DRD350DQ, DRD350GQ, L_GLUTAMINE_gm"
print(paste("Cont > Bin:", paste(intersect(names(discovery_cont), names(replication_bin)), collapse=", ")))
#> [1] "Cont > Bin: ID"
print(paste("Cont > Cont:", paste(intersect(names(discovery_cont), names(replication_cat)), collapse=", ")))
#> [1] "Cont > Cont: ID"

# Fix based on the above

# Binary in discovery that should be categorical
move_cols <- c("BETA_CAROTENE_mcg", "CALCIUM_Unknown", "MAGNESIUM_Unknown")
discovery_cat <- merge_data(discovery_cat, colfilter(discovery_bin, cols = move_cols, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0.00302 secs"
#> [1] "Finished in 0.017984 secs"
discovery_bin <- colfilter(discovery_bin, cols = move_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

# Binary in discovery that should be continuous
move_cols <- c("LBXPFDO")
discovery_cont <- merge_data(discovery_cont, colfilter(discovery_bin, cols = move_cols, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0.003023 secs"
#> [1] "Finished in 0.02596 secs"
discovery_bin <- colfilter(discovery_bin, cols = move_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

# Categorical in discovery that should be continuous
move_cols <- c("DRD350AQ", "DRD350DQ", "DRD350GQ")
discovery_cont <- merge_data(discovery_cont, colfilter(discovery_cat, cols = move_cols, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0 secs"
#> [1] "Finished in 0.021941 secs"
discovery_cat <- colfilter(discovery_cat, cols = move_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

#Binary in replication that should be categorical
move_cols <- c("VITAMIN_B_12_Unknown")
replication_cat <- merge_data(replication_cat, colfilter(replication_bin, cols = move_cols, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0.002956 secs"
#> [1] "Finished in 0.012964 secs"
replication_bin <- colfilter(replication_bin, cols = move_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

Filtering

Get the types of covariates to exclude them from filtering

# Discovery
discovery_covariates_cat = intersect(names(discovery_cat), covariates)
discovery_covariates_bin = intersect(names(discovery_bin), covariates)
discovery_covariates_catbin <- union(discovery_covariates_cat, discovery_covariates_bin)
discovery_covariates_cont = intersect(names(discovery_cont), covariates)

# Replication
replication_covariates_cat = intersect(names(replication_cat), covariates)
replication_covariates_bin = intersect(names(replication_bin), covariates)
replication_covariates_catbin <- union(replication_covariates_cat, replication_covariates_bin)
replication_covariates_cont = intersect(names(replication_cont), covariates)

These are a standard set of filters with default settings

# Apply the min_cat filter to exclude categorical/binary with low samples of a category
discovery_bin <- min_cat_n(discovery_bin, skip=discovery_covariates_bin)
#> [1] "Running..."
#> [1] "159 of 226 columns removed due to one or more categories having < 200 samples (ignoring 7 columns)"
#> [1] "Finished in 0.109743 secs"
discovery_cat <- min_cat_n(discovery_cat, skip=discovery_covariates_cat)
#> [1] "Running..."
#> [1] "14 of 20 columns removed due to one or more categories having < 200 samples (ignoring 2 columns)"
#> [1] "Finished in 0.035903 secs"

replication_bin <- min_cat_n(replication_bin, skip=replication_covariates_bin)
#> [1] "Running..."
#> [1] "153 of 236 columns removed due to one or more categories having < 200 samples (ignoring 7 columns)"
#> [1] "Finished in 0.132646 secs"
replication_cat <- min_cat_n(replication_cat, skip=replication_covariates_cat)
#> [1] "Running..."
#> [1] "26 of 33 columns removed due to one or more categories having < 200 samples (ignoring 2 columns)"
#> [1] "Finished in 0.082781 secs"

# Calculate percent zero
get0 <- function(x){
  x <- as.numeric(as.character(x))
  N <- sum(!is.na(x))  # Number not NA
  NNZ <- sum(x[!is.na(x)] > 0)  # Number > 0 and not NA
  PZ <- (N-NNZ)/N  # Number zero /  number not NA
  out <- rbind(N=N, NNZ=NNZ, PZ=PZ)
  return(out)
}

# Discovery Percent Zero
zeros_disc <- as.data.frame(t(sapply(discovery_cont[,-1], get0)))
zeros_disc <- cbind(rownames(zeros_disc), data.frame(zeros_disc, row.names = NULL))
names(zeros_disc) <- c("var","N","NNZ","PZ")
# Drop those with percent zero > 90%
before <- length(discovery_cont) -1 # minus 1 for ID
zeros_disc_filtered <- zeros_disc[zeros_disc$PZ >= 0.9, 1, drop=FALSE]
discovery_cont <- colfilter(discovery_cont, cols=zeros_disc_filtered, exclude=TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
# Log
sprintf("Removed %i of %i continuous variables due to the percent zero filter",
        nrow(zeros_disc_filtered), before)
#> [1] "Removed 15 of 340 continuous variables due to the percent zero filter"

# Replication Percent Zero
zeros_repl <- as.data.frame(t(sapply(replication_cont[,-1], get0)))
zeros_repl <- cbind(rownames(zeros_repl), data.frame(zeros_repl, row.names = NULL))
names(zeros_repl) <- c("var","N","NNZ","PZ")
# Drop those with percent zero > 90%
before <- length(replication_cont) -1 # minus 1 for ID
zeros_repl_filtered <- zeros_repl[zeros_repl$PZ >= 0.9, 1, drop=FALSE]
replication_cont <- colfilter(replication_cont, cols=zeros_repl_filtered, exclude=TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
# Log
sprintf("Removed %i of %i continuous variables due to the percent zero filter",
        length(zeros_repl_filtered), before)
#> [1] "Removed 1 of 440 continuous variables due to the percent zero filter"

Only keep those present in both datasets

common_bin <- intersect(names(discovery_bin), names(replication_bin))
discovery_bin_final <- colfilter(discovery_bin, cols=common_bin, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
replication_bin_final <- colfilter(replication_bin, cols=common_bin, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
sprintf("%i Binary Variables in Common", length(common_bin))
#> [1] "55 Binary Variables in Common"

common_cat <- intersect(names(discovery_cat), names(replication_cat))
discovery_cat_final <- colfilter(discovery_cat, cols=common_cat, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
replication_cat_final <- colfilter(replication_cat, cols=common_cat, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
sprintf("%i Categorical Variables in Common", length(common_cat))
#> [1] "6 Categorical Variables in Common"

common_cont <- intersect(names(discovery_cont), names(replication_cont))
discovery_cont_final <- colfilter(discovery_cont, cols=common_cont, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
replication_cont_final <- colfilter(replication_cont, cols=common_cont, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
sprintf("%i Continuous Variables in Common", length(common_cont))
#> [1] "295 Continuous Variables in Common"

Summarize

sprintf("%i Discovery Binary Variables with %i Observations",
        length(discovery_bin_final),
        nrow(discovery_bin_final))
#> [1] "55 Discovery Binary Variables with 9063 Observations"
sprintf("%i Discovery Categorical Variables with %i Observations",
        length(discovery_cat_final),
        nrow(discovery_cat_final))
#> [1] "6 Discovery Categorical Variables with 9063 Observations"
sprintf("%i Discovery Continuous Variables with %i Observations",
        length(discovery_cont_final),
        nrow(discovery_cont_final))
#> [1] "295 Discovery Continuous Variables with 9063 Observations"

sprintf("%i Replication Binary Variables with %i Observations",
        length(replication_bin_final),
        nrow(replication_bin_final))
#> [1] "55 Replication Binary Variables with 9874 Observations"
sprintf("%i Replication Categorical Variables with %i Observations",
        length(replication_cat_final),
        nrow(replication_cat_final))
#> [1] "6 Replication Categorical Variables with 9874 Observations"
sprintf("%i Replication Continuous Variables with %i Observations",
        length(replication_cont_final),
        nrow(replication_cont_final))
#> [1] "295 Replication Continuous Variables with 9874 Observations"

Combine binary and categorical together

discovery_catbin_final <- merge_data(discovery_cat_final, discovery_bin_final, union = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0.009941 secs"
replication_catbin_final <- merge_data(replication_cat_final, replication_bin_final, union = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0.011998 secs"

Get the name of variables that will be run

discovery_catbin_vars = setdiff(names(discovery_catbin_final),
                                c(discovery_covariates_catbin, phenotype, "ID", names(survey_design_discovery), names(survey_design_replication)))
discovery_cont_vars = setdiff(names(discovery_cont_final),
                              c(discovery_covariates_cont, phenotype, "ID", names(survey_design_discovery), names(survey_design_replication)))

Check the phenotype distribution

The phenotype appears to be skewed, so it will need to be corrected. CLARITE makes it easy to plot distributions and to transform variables.

# Discovery

# Make plot to visulize phneotype not log transformed vs. log transfomred.
hist(discovery_cont_final$BMXBMI, breaks = 100)

skewness(discovery_cont_final$BMXBMI)
#> [1] 1.131507
transBMI <- log(discovery_cont_final$BMXBMI)

hist(transBMI, breaks = 100)

skewness(transBMI)
#> [1] 0.390092

# Replace BMI with log BMI
discovery_cont_final$logBMI <- transBMI
discovery_cont_final <- colfilter(discovery_cont_final, cols=c("BMXBMI"),exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

# Replication

# Make plot to visulize phneotype not log transformed vs. log transfomred.
hist(replication_cont_final$BMXBMI, breaks = 100)

skewness(replication_cont_final$BMXBMI)
#> [1] 1.522081
transBMI <- log(replication_cont_final$BMXBMI)

hist(transBMI, breaks = 100)

skewness(transBMI)
#> [1] 0.4349248

# Replace BMI with log BMI
replication_cont_final$logBMI <- transBMI
replication_cont_final <- colfilter(replication_cont_final, cols=c("BMXBMI"),exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"

# Update phenotype name
phenotype <- "logBMI"

Create final dataframes

All variables and survey information are combined into one dataframe

discovery_final <- merge_data(discovery_catbin_final, discovery_cont_final, union = FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.023936 secs"
replication_final <- merge_data(replication_catbin_final, replication_cont_final, union = FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.025931 secs"

discovery_final <- merge_data(discovery_final, survey_design_discovery, union=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.027925 secs"
replication_final <- merge_data(replication_final, survey_design_replication, union=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.02992 secs"

Filter out variables that don’t have a weight both discovery and replication

# Check for weights
catbin_has_weight <- sapply(discovery_catbin_vars,
                     function(v){ifelse(is.null(weights_discovery[[v]]) | is.null(weights_replication[[v]]),
                                        FALSE,
                                        (weights_discovery[[v]] %in% names(discovery_final)) & (weights_replication[[v]] %in% names(replication_final)))}
)
cont_has_weight <- sapply(discovery_cont_vars,
                     function(v){ifelse(is.null(weights_discovery[[v]]) | is.null(weights_replication[[v]]),
                                        FALSE,
                                        (weights_discovery[[v]] %in% names(discovery_final)) & (weights_replication[[v]] %in% names(replication_final)))}
)

# Log
if(sum(catbin_has_weight) < length(catbin_has_weight)){
  print(paste("Skipped", sum(!catbin_has_weight), "cat/bin variable(s) with no weight: ",
              paste(discovery_catbin_vars[!catbin_has_weight], collapse=", ")))}
if(sum(cont_has_weight) < length(cont_has_weight)){
  print(paste("Skipped", sum(!cont_has_weight), "continuous variable(s) with no weight: ",
              paste(discovery_cont_vars[!cont_has_weight], collapse=", ")))}
#> [1] "Skipped 21 continuous variable(s) with no weight:  LBXPFOA, LBXPFOS, LBXPFHS, LBXEPAH, LBXMPAH, LBXPFDE, LBXPFHP, LBXPFNA, LBXPFSA, LBXPFUA, URXP08, URX14D, URX1TB, URX3TB, URXPCP, URXOPP, URXUIO, URXP17, URXP19, DR1TMOIS, LBXPFDO"

# Update
# NOTE: This does it for replication also, since they are copied from these vectors
discovery_catbin_vars <- discovery_catbin_vars[catbin_has_weight]
discovery_cont_vars <- discovery_cont_vars[cont_has_weight]

EWAS

Survey parameters can be passed in along with the other parameters

discovery_results <- ewas(d = discovery_final,
                                   cat_vars=discovery_catbin_vars,
                                   cont_vars=discovery_cont_vars,
                                   y=phenotype,
                                   cat_covars=discovery_covariates_catbin,
                                   cont_covars=discovery_covariates_cont,
                                   regression_family="gaussian",
                                   allowed_nonvarying=c("female", "SDDSRVYR"),
                                   weights=weights_discovery,
                                   id="SDMVPSU",
                                   strat="SDMVSTRA",
                                   nest=TRUE)
#> [1] "Running EWAS with specific weights assigned for each variable"
#> [1] "Processing 52 categorical variables"
#> [1] "    Some covariates don't vary for 'DBD100' but are allowed to do so: SDDSRVYR"
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> [1] "    Some covariates don't vary for 'taking_birth_control' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'RHQ540' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'RHQ510' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'SXQ280' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'LBXBV' but are allowed to do so: SDDSRVYR, female"
#> [1] "    Some covariates don't vary for 'LBXMS1' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'no_salt' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'ordinary_salt' but are allowed to do so: SDDSRVYR"
#> [1] "Processing 271 continuous variables"
#> [1] "    Some covariates don't vary for 'LBXV3A' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXTHG' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'LBXIHG' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'URXUHG' but are allowed to do so: female"
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion

#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> [1] "    Some covariates don't vary for 'LBX028' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'RHQ556' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'how_long_estrogen' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'age_stopped_birth_control' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'age_started_birth_control' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'URXUUR' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXVID' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXMNM' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXMC1' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXMHH' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXMOH' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXMIB' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX189' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXD02' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXF09' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX087' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX110' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX149' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX151' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX194' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX195' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX196' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBD199' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX206' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXALD' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXDIE' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXEND' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP01' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP02' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP20' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP21' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP22' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP24' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXALC' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXBEC' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXCBC' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXCRY' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXLUZ' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXLYC' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TACAR' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TATOC' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TBCAR' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TCRYP' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TFA' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TFDFE' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TFF' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TLYCO' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TLZ' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TRET' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TSUGR' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TVARA' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'DR1TVK' but are allowed to do so: SDDSRVYR"
#> [1] "Finished in 29.149112 secs"

There is a separate function for adding pvalues with multiple-test-correction applied.

discovery_results <- ewas_pval_adjust(discovery_results)
#> [1] "Finished in 0.009973 secs"

Saving results is straightforward

write.table(discovery_results,
            file = paste(params$output_folder, "BMI_Discovery_Results.txt", sep="/"),
            sep = "\t", quote = FALSE, row.names = FALSE)

Replication

Selecting top results

Variables with an FDR less than 0.1 were selected

significant_discovery_vars <- discovery_results[discovery_results$pvalue_FDR < 0.1, "Variable"]

replication_catbin_vars <- intersect(discovery_catbin_vars, significant_discovery_vars)
replication_cont_vars <- intersect(discovery_cont_vars, significant_discovery_vars)

Run the replication

The variables with low FDR in the discovery dataset were analyzed in the replication dataset

replication_results <- ewas(d = replication_final,
                                     cat_vars=replication_catbin_vars,
                                     cont_vars=replication_cont_vars,
                                     y=phenotype,
                                     cat_covars=replication_covariates_catbin,
                                     cont_covars=replication_covariates_cont,
                                     regression_family="gaussian",
                                     allowed_nonvarying=c("female", "SDDSRVYR"),
                                     weights=weights_replication,
                                     id="SDMVPSU",
                                     strat="SDMVSTRA",
                                     nest=TRUE)
#> [1] "Running EWAS with specific weights assigned for each variable"
#> [1] "Processing 14 categorical variables"
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> [1] "    Some covariates don't vary for 'DUQ100' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'ever_loud_noise_gt3' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'ever_loud_noise_gt3_2' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'SMQ210' but are allowed to do so: SDDSRVYR"
#> [1] "Processing 84 continuous variables"
#> [1] "    Some covariates don't vary for 'LBXME' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXOP1' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXOP3' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX118' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX156' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXD05' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXF04' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXHXC' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX099' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX153' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX170' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX180' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXODT' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXHPE' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXMIR' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP03' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP11' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP15' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBXIRN' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'DUQ110' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'age_stopped_birth_control' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'age_started_birth_control' but are allowed to do so: female"
#> [1] "    Some covariates don't vary for 'LBX194' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX196' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBD199' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'LBX206' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP20' but are allowed to do so: SDDSRVYR"
#> [1] "    Some covariates don't vary for 'URXP24' but are allowed to do so: SDDSRVYR"
#> [1] "Finished in 9.48369 secs"

replication_results <- ewas_pval_adjust(replication_results)
#> [1] "Finished in 0.000999 secs"

Combine results

results <- merge(discovery_results, replication_results, by="Variable", suffixes = c("_Dis", "_Rep"))
results <- results[order(results$pvalue_FDR_Rep),order(names(results))]

not_NA <- !is.na(results$pval_Dis) & !is.na(results$pval_Rep)

sig_FDR_results <- not_NA & (results$pvalue_FDR_Dis < 0.1) & (results$pvalue_FDR_Rep < 0.1) 
write.table(results[sig_FDR_results, c("Variable", "pvalue_FDR_Dis","pvalue_FDR_Rep")],
            file = paste(params$output_folder, "Significant_Results_FDR_0.1.txt", sep="/"),
            sep = "\t", quote = FALSE, row.names = FALSE)

sig_Bonf_results05 <- not_NA & (results$pvalue_Bonf_Dis < 0.05) & (results$pvalue_Bonf_Rep < 0.05)
write.table(results[sig_Bonf_results05, c("Variable", "pvalue_Bonf_Dis", "pvalue_Bonf_Rep")],
            file = paste(params$output_folder, "Significant_Results_Bonferroni_0.05.txt", sep="/"),
            sep = "\t", quote = FALSE, row.names = FALSE)

sig_Bonf_results01 <- not_NA & (results$pvalue_Bonf_Dis < 0.01) & (results$pvalue_Bonf_Rep < 0.01)
write.table(results[sig_Bonf_results01, c("Variable", "pvalue_Bonf_Dis", "pvalue_Bonf_Rep")],
            file = paste(params$output_folder, "Significant_Results_Bonferroni_0.01.txt", sep="/"),
            sep = "\t", quote = FALSE, row.names = FALSE)

# Record significant variables for testing with possible confounding variables
significant_in_both <- results[sig_FDR_results, "Variable"]

Manhattan Plot

CLARITE provides functionality for generating highly customizable Manhattan plots from EWAS results

# Keep only p-values for each variable
mpdata_discovery <- discovery_results[c("Variable", "pval")]
mpdata_replication <- replication_results[c("Variable", "pval")]

# Rename the pvalue columns
names(mpdata_discovery)[2] <- "pvalue"
names(mpdata_replication)[2] <- "pvalue"

# Add "Shape"
mpdata_discovery$Shape <- "Discovery"
mpdata_replication$Shape <- "Replication"

# Merge
mpdata <- rbind(mpdata_discovery, mpdata_replication)

# Load catagory info
var_categories <- read.delim(data_var_categories)

# Plot
eman(mpdata, ewas = FALSE,
     groups = var_categories,
     line = 0.05/nrow(mpdata_discovery),
     title = "EWAS Results: Discovery vs Replication",
     morecolors = FALSE,
     file = paste(params$output_folder, "BMI_Manhattan_Plot_D_R", sep="/"),
     hgt = 7, wi = 12, res = 300)
#> [1] "Running..."
#> [1] "Saving plot to ./example_output/BMI_Manhattan_Plot_D_R.png"

#> [1] "Finished in 0.690846 secs"