library(clarite)
library(moments) # Skewness
library(ggplot2)
packageVersion("clarite")
#> [1] '1.4.1'CLARITE facilitates the quality control and analysis process for EWAS of metabolic-related traits
Data from NHANES was used in an EWAS analysis including utilizing the provided survey weight information. The first two cycles of NHANES (1999-2000 and 2001-2002) are assigned to a ‘discovery’ dataset and the next two cycles (2003-2004 and 2005-2006) are assigned to a ‘replication’ datset.
data_main_table_over18 = paste(params$data_folder, "MainTable_keepvar_over18.tsv", sep="")
data_main_table = paste(params$data_folder, "MainTable.csv", sep="")
data_var_description = paste(params$data_folder, "VarDescription.csv", sep="")
data_var_categories = paste(params$data_folder, "VarCat_nopf.txt", sep="")descriptions <- read.csv(data_var_description)
descriptions_info <- unique(descriptions[,colnames(descriptions) %in% c("tab_desc","module","var","var_desc")])
head(descriptions_info)
#> tab_desc module var
#> 1 Hepatitis A, B, C and D laboratory LBXHBC
#> 2 Hepatitis A, B, C and D laboratory LBDHBG
#> 3 Hepatitis A, B, C and D laboratory LBDHCV
#> 4 Hepatitis A, B, C and D laboratory LBDHD
#> 5 Hepatitis B Surface Antibody laboratory LBXHBS
#> 6 Hepatitis A laboratory LBXHA
#> var_desc
#> 1 Hepatitis B core antibody
#> 2 Hepatitis B surface antigen
#> 3 Hepatitis C antibody (confirmed)
#> 4 Hepatitis D (anti-HDV)
#> 5 Hepatitis B Surface Antibody
#> 6 Hepatitis A Antibody (Anti-HAV)Survey weight information is used so that the results apply to the US civillian non-institutionalized population.
This includes:
Different variables require different weights, as many of them were measured on a subset of the full dataset. For example:
2-year and 4-year weights are provided. It is important to adjust the weights when combining multiple cycles, by computing the weighted average. In this case 4-year weights (covering the first 2 cycles) are provided by NHANES and the replication weights (the 3rd and 4th cycles) were computed from the 2-year weights prior to loading them here.
# Get survey info
survey_design_discovery <- read.csv(paste(params$data_folder, "weights/weights_discovery.txt", sep=""), sep="\t")
colnames(survey_design_discovery)[1] <- "ID"
survey_design_discovery <- colfilter(survey_design_discovery, c("SDDSRVYR"), exclude=TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
head(survey_design_discovery)
#> ID SDMVPSU SDMVSTRA WTINT2YR WTINT4YR WTMEC2YR WTMEC4YR WTDRD1
#> 1 1 1 5 9727.079 4291.490 10982.90 4456.207 14809.894
#> 2 2 3 1 26678.636 14203.336 28325.38 15336.200 26899.709
#> 3 3 2 7 43621.681 20123.764 46192.26 21258.467 34388.840
#> 4 4 1 2 10346.119 4582.132 10251.26 4562.389 7159.829
#> 5 5 2 8 91050.847 44161.868 99445.07 45985.968 127746.359
#> 6 6 2 2 36508.250 16850.978 39656.60 18337.313 87007.933
#> WTDR4YR WTSBA2YR WTSBA4YR BMXWT BMIWT CVDWTIM MSXWTIME WTSVOC4Y
#> 1 6066.129 NA NA 12.5 3 NA NA NA
#> 2 14921.934 NA NA 75.4 NA NA 9.25 NA
#> 3 15866.448 NA NA 32.9 NA NA NA NA
#> 4 3218.100 NA NA 13.3 NA NA NA NA
#> 5 58973.611 225514.7 62154.68 92.5 NA 2 NA NA
#> 6 39979.156 NA NA 59.2 NA 2 NA NA
#> WTSHM2YR WTSHM4YR WTSTH2YR WTSTH4YR WTSPP4YR WTSPP2YR WTSPO4YR WTSPO2YR
#> 1 NA NA NA NA NA NA NA NA
#> 2 NA NA 81681.26 46818.92 NA NA 51106.09 108769.5
#> 3 NA NA NA NA 46902.15 114094.9 NA NA
#> 4 NA NA NA NA NA NA NA NA
#> 5 316235.3 138091.51 NA NA NA NA NA NA
#> 6 137608.4 61352.78 NA NA NA NA NA NA
#> WTSCY4YR WTTFA2YR WTSPH2YR WTSAU4YR WTSAF4YR WTSAF2YR WTSCI4YR
#> 1 NA NA NA NA NA NA NA
#> 2 16401.68 60586.15 NA NA 33073.27 60586.15 NA
#> 3 NA 46192.26 142734.1 NA 52434.23 121969.84 NA
#> 4 NA NA NA NA NA NA NA
#> 5 184523.27 234895.21 NA NA 98468.81 234895.21 NA
#> 6 NA NA NA NA NA NA NA
#> WTSPH4YR WTSCI2YR WTSVOC2Y WTSAU2YR WTUIO2YR
#> 1 NA NA NA NA NA
#> 2 NA NA NA NA NA
#> 3 69163.77 NA NA NA NA
#> 4 NA NA NA NA NA
#> 5 NA NA NA NA NA
#> 6 NA NA NA NA NAsurvey_design_replication <- read.csv(paste(params$data_folder, "weights/weights_replication_4yr.txt", sep=""), sep="\t")
colnames(survey_design_replication)[1] <- "ID"
survey_design_replication <- colfilter(survey_design_replication, c("SDDSRVYR"), exclude=TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
head(survey_design_replication)
#> ID SDMVPSU SDMVSTRA WTINT2YR WTMEC2YR WTS_FFQ BMXWT BMIWT CVDWTIM
#> 1 21005 2 39 2756.16 2912.391 2009.015 68.80 NA NA
#> 2 21006 1 41 2711.07 2782.020 1919.082 27.60 NA 1
#> 3 21007 2 35 19882.09 20295.533 13317.776 23.95 NA 1
#> 4 21008 1 32 2799.75 2848.375 NA 35.00 NA NA
#> 5 21009 2 31 48796.84 48865.864 33530.095 51.55 NA NA
#> 6 21010 1 29 19799.68 21643.288 32028.706 33.95 NA NA
#> WTSVOC2Y WTSBMEL WTDR2D WTSB2YR WTDRD1 WTSC2YR WTSAF2YR
#> 1 NA NA 1210.364 NA 1444.934 9472.746 7042.1
#> 2 NA NA 1156.183 7991.864 1380.252 NA 6540.5
#> 3 NA NA 7663.560 NA 10655.971 NA NA
#> 4 NA NA 9345.278 NA 4639.677 9264.530 NA
#> 5 91449.62 NA 85131.235 NA 28499.297 138102.149 NA
#> 6 46633.43 NA 37648.061 NA 27656.507 68779.743 NA
#> WTSA2YR WTSCI2YR WTSAU2YR WTAL2YR LBXDWT WTSOG2YR WTSC2YRA WTSPC2YR
#> 1 NA NA NA NA NA NA NA NA
#> 2 NA NA NA NA NA NA NA NA
#> 3 66423.54 NA NA NA NA NA NA NA
#> 4 NA NA NA NA NA NA NA NA
#> 5 NA NA 93277.11 NA NA NA NA NA
#> 6 NA NA 48290.38 NA NA NA NA NA# Get weight mapping data
var_weights <- read.csv(paste(params$data_folder, "weights/VarWeights.csv", sep=""), fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)
weights_discovery <- setNames(as.list(var_weights$discovery), as.list(var_weights$variable_name))
weights_replication <- setNames(as.list(var_weights$replication), as.list(var_weights$variable_name))# Data found in the main table
main_table <- read.csv(data_main_table)
colnames(main_table)[1] <- "ID"
survey_year <- colfilter(main_table, c("ID", "SDDSRVYR"))
#> [1] "Running..."
#> [1] "Finished in 0 secs"
nhanes <- merge_data(survey_year, nhanes, union = FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.233376 secs"These are measurements rather than proxies for environmental exposure
These are likely correlated with BMI in some way
These variables don’t have clear meanings
Drop variables that have a small sample size
nhanes_discovery <- min_n(nhanes_discovery, skip=c(covariates, phenotype))
#> [1] "Running..."
#> [1] "302 of 946 columns removed due to < 200 non-NA observations (ignoring 10 columns)"
#> [1] "Finished in 0.034909 secs"
nhanes_replication <- min_n(nhanes_replication, skip=c(covariates, phenotype))
#> [1] "Running..."
#> [1] "225 of 946 columns removed due to < 200 non-NA observations (ignoring 10 columns)"
#> [1] "Finished in 0.049869 secs"This is important, as different variable types must be processed in different ways. The number of unique values for each variable is a good heuristic for determining this. The default settings were used here, but different cutoffs can be specified.
# Discovery
discovery_bin <- get_binary(nhanes_discovery)
#> [1] "Running..."
#> [1] "Finished in 0.076795 secs"
discovery_cat <- get_categorical(nhanes_discovery)
#> [1] "Running..."
#> [1] "Finished in 0.11968 secs"
discovery_cont <- get_continuous(nhanes_discovery)
#> [1] "Running..."
#> [1] "Finished in 0.063829 secs"
discovery_check <- get_check(nhanes_discovery)
#> [1] "Running..."
#> [1] "Finished in 0.116658 secs"
# Replication
replication_bin <- get_binary(nhanes_replication)
#> [1] "Running..."
#> [1] "Finished in 0.07879 secs"
replication_cat <- get_categorical(nhanes_replication)
#> [1] "Running..."
#> [1] "Finished in 0.149597 secs"
replication_cont <- get_continuous(nhanes_replication)
#> [1] "Running..."
#> [1] "Finished in 0.072805 secs"
replication_check <- get_check(nhanes_replication)
#> [1] "Running..."
#> [1] "Finished in 0.149572 secs"Distributions of variables may be plotted using CLARITE
One variable needed correcting where the heuristic was not correct
replication_cat_to_cont <- c("L_GLUTAMINE_gm")
replication_cont <- merge_data(replication_cont, colfilter(replication_cat, cols = replication_cat_to_cont, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0.002957 secs"
#> [1] "Finished in 0.029917 secs"
replication_cat <- colfilter(replication_cat, cols = replication_cat_to_cont, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"After examining all of the uncategorized variables, they are all continuous
print(paste("Checked: ", paste(names(replication_check), collapse=", "), sep=" "))
#> [1] "Checked: ID, LBXVCT, LBXV3A, URXUBE, LBXTO2, LBXPFDO, DRD350AQ, DRD350BQ, DRD350DQ, DRD350FQ, DRD350GQ, DRD370AQ, DRD370DQ, DRD370EQ, DRD370FQ, DRD370GQ, DRD370NQ, DRD370RQ, DRD370UQ, ALANINE_mg, ARGININE_mg, BETA_CAROTENE_mg, CAFFEINE_mg, CYSTINE_mg, HISTIDINE_mg, ISOLEUCINE_mg, LEUCINE_mg, LYSINE_mg, PHENYLALANINE_mg, PROLINE_mg, SERINE_mg, THREONINE_mg, TRYPTOPHAN_mg, TYROSINE_mg, VALINE_mg, LBXV2T, LBXV4T, LBXVDM, URXUTM, LBXPFBS"
replication_cont <- merge_data(replication_cont, replication_check, union = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0.030918 secs"Types should match across discovery/replication
############################################
# Take note of differently-typed variables #
############################################
print(paste("Bin > Cat:", paste(intersect(names(discovery_bin), names(replication_cat)), collapse=", ")))
#> [1] "Bin > Cat: ID, BETA_CAROTENE_mcg, CALCIUM_Unknown, MAGNESIUM_Unknown"
print(paste("Bin > Cont:", paste(intersect(names(discovery_bin), names(replication_cont)), collapse=", ")))
#> [1] "Bin > Cont: ID, LBXPFDO"
print(paste("Cat > Bin:", paste(intersect(names(discovery_cat), names(replication_bin)), collapse=", ")))
#> [1] "Cat > Bin: ID, VITAMIN_B_12_Unknown"
print(paste("Cat > Cont:", paste(intersect(names(discovery_cat), names(replication_cont)), collapse=", ")))
#> [1] "Cat > Cont: ID, DRD350AQ, DRD350DQ, DRD350GQ, L_GLUTAMINE_gm"
print(paste("Cont > Bin:", paste(intersect(names(discovery_cont), names(replication_bin)), collapse=", ")))
#> [1] "Cont > Bin: ID"
print(paste("Cont > Cont:", paste(intersect(names(discovery_cont), names(replication_cat)), collapse=", ")))
#> [1] "Cont > Cont: ID"
# Fix based on the above
# Binary in discovery that should be categorical
move_cols <- c("BETA_CAROTENE_mcg", "CALCIUM_Unknown", "MAGNESIUM_Unknown")
discovery_cat <- merge_data(discovery_cat, colfilter(discovery_bin, cols = move_cols, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0.00302 secs"
#> [1] "Finished in 0.017984 secs"
discovery_bin <- colfilter(discovery_bin, cols = move_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
# Binary in discovery that should be continuous
move_cols <- c("LBXPFDO")
discovery_cont <- merge_data(discovery_cont, colfilter(discovery_bin, cols = move_cols, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0.003023 secs"
#> [1] "Finished in 0.02596 secs"
discovery_bin <- colfilter(discovery_bin, cols = move_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
# Categorical in discovery that should be continuous
move_cols <- c("DRD350AQ", "DRD350DQ", "DRD350GQ")
discovery_cont <- merge_data(discovery_cont, colfilter(discovery_cat, cols = move_cols, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0 secs"
#> [1] "Finished in 0.021941 secs"
discovery_cat <- colfilter(discovery_cat, cols = move_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
#Binary in replication that should be categorical
move_cols <- c("VITAMIN_B_12_Unknown")
replication_cat <- merge_data(replication_cat, colfilter(replication_bin, cols = move_cols, exclude = FALSE))
#> [1] "Running..."
#> [1] "Running..."
#> [1] "Finished in 0.002956 secs"
#> [1] "Finished in 0.012964 secs"
replication_bin <- colfilter(replication_bin, cols = move_cols, exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"Get the types of covariates to exclude them from filtering
# Discovery
discovery_covariates_cat = intersect(names(discovery_cat), covariates)
discovery_covariates_bin = intersect(names(discovery_bin), covariates)
discovery_covariates_catbin <- union(discovery_covariates_cat, discovery_covariates_bin)
discovery_covariates_cont = intersect(names(discovery_cont), covariates)
# Replication
replication_covariates_cat = intersect(names(replication_cat), covariates)
replication_covariates_bin = intersect(names(replication_bin), covariates)
replication_covariates_catbin <- union(replication_covariates_cat, replication_covariates_bin)
replication_covariates_cont = intersect(names(replication_cont), covariates)These are a standard set of filters with default settings
# Apply the min_cat filter to exclude categorical/binary with low samples of a category
discovery_bin <- min_cat_n(discovery_bin, skip=discovery_covariates_bin)
#> [1] "Running..."
#> [1] "159 of 226 columns removed due to one or more categories having < 200 samples (ignoring 7 columns)"
#> [1] "Finished in 0.109743 secs"
discovery_cat <- min_cat_n(discovery_cat, skip=discovery_covariates_cat)
#> [1] "Running..."
#> [1] "14 of 20 columns removed due to one or more categories having < 200 samples (ignoring 2 columns)"
#> [1] "Finished in 0.035903 secs"
replication_bin <- min_cat_n(replication_bin, skip=replication_covariates_bin)
#> [1] "Running..."
#> [1] "153 of 236 columns removed due to one or more categories having < 200 samples (ignoring 7 columns)"
#> [1] "Finished in 0.132646 secs"
replication_cat <- min_cat_n(replication_cat, skip=replication_covariates_cat)
#> [1] "Running..."
#> [1] "26 of 33 columns removed due to one or more categories having < 200 samples (ignoring 2 columns)"
#> [1] "Finished in 0.082781 secs"# Calculate percent zero
get0 <- function(x){
x <- as.numeric(as.character(x))
N <- sum(!is.na(x)) # Number not NA
NNZ <- sum(x[!is.na(x)] > 0) # Number > 0 and not NA
PZ <- (N-NNZ)/N # Number zero / number not NA
out <- rbind(N=N, NNZ=NNZ, PZ=PZ)
return(out)
}
# Discovery Percent Zero
zeros_disc <- as.data.frame(t(sapply(discovery_cont[,-1], get0)))
zeros_disc <- cbind(rownames(zeros_disc), data.frame(zeros_disc, row.names = NULL))
names(zeros_disc) <- c("var","N","NNZ","PZ")
# Drop those with percent zero > 90%
before <- length(discovery_cont) -1 # minus 1 for ID
zeros_disc_filtered <- zeros_disc[zeros_disc$PZ >= 0.9, 1, drop=FALSE]
discovery_cont <- colfilter(discovery_cont, cols=zeros_disc_filtered, exclude=TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
# Log
sprintf("Removed %i of %i continuous variables due to the percent zero filter",
nrow(zeros_disc_filtered), before)
#> [1] "Removed 15 of 340 continuous variables due to the percent zero filter"
# Replication Percent Zero
zeros_repl <- as.data.frame(t(sapply(replication_cont[,-1], get0)))
zeros_repl <- cbind(rownames(zeros_repl), data.frame(zeros_repl, row.names = NULL))
names(zeros_repl) <- c("var","N","NNZ","PZ")
# Drop those with percent zero > 90%
before <- length(replication_cont) -1 # minus 1 for ID
zeros_repl_filtered <- zeros_repl[zeros_repl$PZ >= 0.9, 1, drop=FALSE]
replication_cont <- colfilter(replication_cont, cols=zeros_repl_filtered, exclude=TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
# Log
sprintf("Removed %i of %i continuous variables due to the percent zero filter",
length(zeros_repl_filtered), before)
#> [1] "Removed 1 of 440 continuous variables due to the percent zero filter"Only keep those present in both datasets
common_bin <- intersect(names(discovery_bin), names(replication_bin))
discovery_bin_final <- colfilter(discovery_bin, cols=common_bin, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
replication_bin_final <- colfilter(replication_bin, cols=common_bin, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
sprintf("%i Binary Variables in Common", length(common_bin))
#> [1] "55 Binary Variables in Common"
common_cat <- intersect(names(discovery_cat), names(replication_cat))
discovery_cat_final <- colfilter(discovery_cat, cols=common_cat, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
replication_cat_final <- colfilter(replication_cat, cols=common_cat, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
sprintf("%i Categorical Variables in Common", length(common_cat))
#> [1] "6 Categorical Variables in Common"
common_cont <- intersect(names(discovery_cont), names(replication_cont))
discovery_cont_final <- colfilter(discovery_cont, cols=common_cont, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
replication_cont_final <- colfilter(replication_cont, cols=common_cont, exclude=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
sprintf("%i Continuous Variables in Common", length(common_cont))
#> [1] "295 Continuous Variables in Common"sprintf("%i Discovery Binary Variables with %i Observations",
length(discovery_bin_final),
nrow(discovery_bin_final))
#> [1] "55 Discovery Binary Variables with 9063 Observations"
sprintf("%i Discovery Categorical Variables with %i Observations",
length(discovery_cat_final),
nrow(discovery_cat_final))
#> [1] "6 Discovery Categorical Variables with 9063 Observations"
sprintf("%i Discovery Continuous Variables with %i Observations",
length(discovery_cont_final),
nrow(discovery_cont_final))
#> [1] "295 Discovery Continuous Variables with 9063 Observations"
sprintf("%i Replication Binary Variables with %i Observations",
length(replication_bin_final),
nrow(replication_bin_final))
#> [1] "55 Replication Binary Variables with 9874 Observations"
sprintf("%i Replication Categorical Variables with %i Observations",
length(replication_cat_final),
nrow(replication_cat_final))
#> [1] "6 Replication Categorical Variables with 9874 Observations"
sprintf("%i Replication Continuous Variables with %i Observations",
length(replication_cont_final),
nrow(replication_cont_final))
#> [1] "295 Replication Continuous Variables with 9874 Observations"Combine binary and categorical together
discovery_catbin_final <- merge_data(discovery_cat_final, discovery_bin_final, union = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0.009941 secs"
replication_catbin_final <- merge_data(replication_cat_final, replication_bin_final, union = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0.011998 secs"Get the name of variables that will be run
discovery_catbin_vars = setdiff(names(discovery_catbin_final),
c(discovery_covariates_catbin, phenotype, "ID", names(survey_design_discovery), names(survey_design_replication)))
discovery_cont_vars = setdiff(names(discovery_cont_final),
c(discovery_covariates_cont, phenotype, "ID", names(survey_design_discovery), names(survey_design_replication)))The phenotype appears to be skewed, so it will need to be corrected. CLARITE makes it easy to plot distributions and to transform variables.
# Discovery
# Make plot to visulize phneotype not log transformed vs. log transfomred.
hist(discovery_cont_final$BMXBMI, breaks = 100)skewness(discovery_cont_final$BMXBMI)
#> [1] 1.131507
transBMI <- log(discovery_cont_final$BMXBMI)
hist(transBMI, breaks = 100)skewness(transBMI)
#> [1] 0.390092
# Replace BMI with log BMI
discovery_cont_final$logBMI <- transBMI
discovery_cont_final <- colfilter(discovery_cont_final, cols=c("BMXBMI"),exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
# Replication
# Make plot to visulize phneotype not log transformed vs. log transfomred.
hist(replication_cont_final$BMXBMI, breaks = 100)skewness(replication_cont_final$BMXBMI)
#> [1] 1.522081
transBMI <- log(replication_cont_final$BMXBMI)
hist(transBMI, breaks = 100)skewness(transBMI)
#> [1] 0.4349248
# Replace BMI with log BMI
replication_cont_final$logBMI <- transBMI
replication_cont_final <- colfilter(replication_cont_final, cols=c("BMXBMI"),exclude = TRUE)
#> [1] "Running..."
#> [1] "Finished in 0 secs"
# Update phenotype name
phenotype <- "logBMI"All variables and survey information are combined into one dataframe
discovery_final <- merge_data(discovery_catbin_final, discovery_cont_final, union = FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.023936 secs"
replication_final <- merge_data(replication_catbin_final, replication_cont_final, union = FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.025931 secs"
discovery_final <- merge_data(discovery_final, survey_design_discovery, union=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.027925 secs"
replication_final <- merge_data(replication_final, survey_design_replication, union=FALSE)
#> [1] "Running..."
#> [1] "Finished in 0.02992 secs"Filter out variables that don’t have a weight both discovery and replication
# Check for weights
catbin_has_weight <- sapply(discovery_catbin_vars,
function(v){ifelse(is.null(weights_discovery[[v]]) | is.null(weights_replication[[v]]),
FALSE,
(weights_discovery[[v]] %in% names(discovery_final)) & (weights_replication[[v]] %in% names(replication_final)))}
)
cont_has_weight <- sapply(discovery_cont_vars,
function(v){ifelse(is.null(weights_discovery[[v]]) | is.null(weights_replication[[v]]),
FALSE,
(weights_discovery[[v]] %in% names(discovery_final)) & (weights_replication[[v]] %in% names(replication_final)))}
)
# Log
if(sum(catbin_has_weight) < length(catbin_has_weight)){
print(paste("Skipped", sum(!catbin_has_weight), "cat/bin variable(s) with no weight: ",
paste(discovery_catbin_vars[!catbin_has_weight], collapse=", ")))}
if(sum(cont_has_weight) < length(cont_has_weight)){
print(paste("Skipped", sum(!cont_has_weight), "continuous variable(s) with no weight: ",
paste(discovery_cont_vars[!cont_has_weight], collapse=", ")))}
#> [1] "Skipped 21 continuous variable(s) with no weight: LBXPFOA, LBXPFOS, LBXPFHS, LBXEPAH, LBXMPAH, LBXPFDE, LBXPFHP, LBXPFNA, LBXPFSA, LBXPFUA, URXP08, URX14D, URX1TB, URX3TB, URXPCP, URXOPP, URXUIO, URXP17, URXP19, DR1TMOIS, LBXPFDO"
# Update
# NOTE: This does it for replication also, since they are copied from these vectors
discovery_catbin_vars <- discovery_catbin_vars[catbin_has_weight]
discovery_cont_vars <- discovery_cont_vars[cont_has_weight]Survey parameters can be passed in along with the other parameters
discovery_results <- ewas(d = discovery_final,
cat_vars=discovery_catbin_vars,
cont_vars=discovery_cont_vars,
y=phenotype,
cat_covars=discovery_covariates_catbin,
cont_covars=discovery_covariates_cont,
regression_family="gaussian",
allowed_nonvarying=c("female", "SDDSRVYR"),
weights=weights_discovery,
id="SDMVPSU",
strat="SDMVSTRA",
nest=TRUE)
#> [1] "Running EWAS with specific weights assigned for each variable"
#> [1] "Processing 52 categorical variables"
#> [1] " Some covariates don't vary for 'DBD100' but are allowed to do so: SDDSRVYR"
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> [1] " Some covariates don't vary for 'taking_birth_control' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'RHQ540' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'RHQ510' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'SXQ280' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'LBXBV' but are allowed to do so: SDDSRVYR, female"
#> [1] " Some covariates don't vary for 'LBXMS1' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'no_salt' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'ordinary_salt' but are allowed to do so: SDDSRVYR"
#> [1] "Processing 271 continuous variables"
#> [1] " Some covariates don't vary for 'LBXV3A' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXTHG' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'LBXIHG' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'URXUHG' but are allowed to do so: female"
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> [1] " Some covariates don't vary for 'LBX028' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'RHQ556' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'how_long_estrogen' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'age_stopped_birth_control' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'age_started_birth_control' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'URXUUR' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXVID' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXMNM' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXMC1' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXMHH' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXMOH' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXMIB' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX189' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXD02' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXF09' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX087' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX110' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX149' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX151' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX194' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX195' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX196' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBD199' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX206' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXALD' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXDIE' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXEND' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP01' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP02' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP20' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP21' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP22' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP24' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXALC' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXBEC' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXCBC' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXCRY' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXLUZ' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXLYC' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TACAR' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TATOC' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TBCAR' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TCRYP' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TFA' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TFDFE' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TFF' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TLYCO' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TLZ' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TRET' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TSUGR' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TVARA' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'DR1TVK' but are allowed to do so: SDDSRVYR"
#> [1] "Finished in 29.149112 secs"There is a separate function for adding pvalues with multiple-test-correction applied.
Saving results is straightforward
Variables with an FDR less than 0.1 were selected
The variables with low FDR in the discovery dataset were analyzed in the replication dataset
replication_results <- ewas(d = replication_final,
cat_vars=replication_catbin_vars,
cont_vars=replication_cont_vars,
y=phenotype,
cat_covars=replication_covariates_catbin,
cont_covars=replication_covariates_cont,
regression_family="gaussian",
allowed_nonvarying=c("female", "SDDSRVYR"),
weights=weights_replication,
id="SDMVPSU",
strat="SDMVSTRA",
nest=TRUE)
#> [1] "Running EWAS with specific weights assigned for each variable"
#> [1] "Processing 14 categorical variables"
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> Warning in summary.glm(g): observations with zero weight not used for
#> calculating dispersion
#> Warning in summary.glm(glm.object): observations with zero weight not used
#> for calculating dispersion
#> [1] " Some covariates don't vary for 'DUQ100' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'ever_loud_noise_gt3' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'ever_loud_noise_gt3_2' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'SMQ210' but are allowed to do so: SDDSRVYR"
#> [1] "Processing 84 continuous variables"
#> [1] " Some covariates don't vary for 'LBXME' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXOP1' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXOP3' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX118' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX156' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXD05' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXF04' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXHXC' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX099' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX153' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX170' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX180' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXODT' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXHPE' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXMIR' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP03' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP11' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP15' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBXIRN' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'DUQ110' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'age_stopped_birth_control' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'age_started_birth_control' but are allowed to do so: female"
#> [1] " Some covariates don't vary for 'LBX194' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX196' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBD199' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'LBX206' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP20' but are allowed to do so: SDDSRVYR"
#> [1] " Some covariates don't vary for 'URXP24' but are allowed to do so: SDDSRVYR"
#> [1] "Finished in 9.48369 secs"
replication_results <- ewas_pval_adjust(replication_results)
#> [1] "Finished in 0.000999 secs"results <- merge(discovery_results, replication_results, by="Variable", suffixes = c("_Dis", "_Rep"))
results <- results[order(results$pvalue_FDR_Rep),order(names(results))]
not_NA <- !is.na(results$pval_Dis) & !is.na(results$pval_Rep)
sig_FDR_results <- not_NA & (results$pvalue_FDR_Dis < 0.1) & (results$pvalue_FDR_Rep < 0.1)
write.table(results[sig_FDR_results, c("Variable", "pvalue_FDR_Dis","pvalue_FDR_Rep")],
file = paste(params$output_folder, "Significant_Results_FDR_0.1.txt", sep="/"),
sep = "\t", quote = FALSE, row.names = FALSE)
sig_Bonf_results05 <- not_NA & (results$pvalue_Bonf_Dis < 0.05) & (results$pvalue_Bonf_Rep < 0.05)
write.table(results[sig_Bonf_results05, c("Variable", "pvalue_Bonf_Dis", "pvalue_Bonf_Rep")],
file = paste(params$output_folder, "Significant_Results_Bonferroni_0.05.txt", sep="/"),
sep = "\t", quote = FALSE, row.names = FALSE)
sig_Bonf_results01 <- not_NA & (results$pvalue_Bonf_Dis < 0.01) & (results$pvalue_Bonf_Rep < 0.01)
write.table(results[sig_Bonf_results01, c("Variable", "pvalue_Bonf_Dis", "pvalue_Bonf_Rep")],
file = paste(params$output_folder, "Significant_Results_Bonferroni_0.01.txt", sep="/"),
sep = "\t", quote = FALSE, row.names = FALSE)
# Record significant variables for testing with possible confounding variables
significant_in_both <- results[sig_FDR_results, "Variable"]CLARITE provides functionality for generating highly customizable Manhattan plots from EWAS results
# Keep only p-values for each variable
mpdata_discovery <- discovery_results[c("Variable", "pval")]
mpdata_replication <- replication_results[c("Variable", "pval")]
# Rename the pvalue columns
names(mpdata_discovery)[2] <- "pvalue"
names(mpdata_replication)[2] <- "pvalue"
# Add "Shape"
mpdata_discovery$Shape <- "Discovery"
mpdata_replication$Shape <- "Replication"
# Merge
mpdata <- rbind(mpdata_discovery, mpdata_replication)
# Load catagory info
var_categories <- read.delim(data_var_categories)
# Plot
eman(mpdata, ewas = FALSE,
groups = var_categories,
line = 0.05/nrow(mpdata_discovery),
title = "EWAS Results: Discovery vs Replication",
morecolors = FALSE,
file = paste(params$output_folder, "BMI_Manhattan_Plot_D_R", sep="/"),
hgt = 7, wi = 12, res = 300)
#> [1] "Running..."
#> [1] "Saving plot to ./example_output/BMI_Manhattan_Plot_D_R.png"#> [1] "Finished in 0.690846 secs"