NCI_MD_STUDY

READ ME…

DATA DESCRIPTION: We have data from participants of the National Cancer Institute (NCI) study. The data set includes demographic, ancestral, clinical, and biomarker information on each participant. Here we perform regression, correlation, and paired t-tests.

PURPOSE: The goal of this study is to identify differences in clinical biomarker levels in a normal patient population by self-reported race and ancestry. (DONE) We also want to assess the influence of diabetes status, BMI, and prostate cancer on biomarker levels by self-reported race and ancestry.

If some packages cannot be installed the usual way, use this (it installs ComplexHeatmap through Bioconductor):

# Install ComplexHeatmap from Bioconductor only when it is missing.
# BiocManager itself is installed first if it is not yet available.
# Guarding the install avoids re-running BiocManager::install() (and its
# "package(s) not installed when version(s) same as or greater than current"
# warning) on every execution.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) {
  BiocManager::install("ComplexHeatmap")
}

## Bioconductor version 3.19 (BiocManager 1.30.25), R 4.4.2 (2024-10-31)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'ComplexHeatmap'

load packages

library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(readxl)
library("rstatix")

## 
## Attaching package: 'rstatix'

## The following objects are masked from 'package:plyr':
## 
##     desc, mutate

## The following object is masked from 'package:stats':
## 
##     filter

library("ggplot2")
library("dplyr")
library("ggpubr")

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:plyr':
## 
##     mutate

library("dunn.test")
library("ARTool")
library(openxlsx)
library("rio")
library(corrplot)

## corrplot 0.95 loaded

library(pheatmap)
library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.20.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
##    in the original pheatmap() are identically supported in the new function. You 
##    can still use the original function by explicitly calling pheatmap::pheatmap().

## 
## Attaching package: 'ComplexHeatmap'

## The following object is masked from 'package:pheatmap':
## 
##     pheatmap

library(circlize)

## ========================================
## circlize version 0.4.16
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

set working directory

# Point the R session at the folder holding the patient population data.
# NOTE(review): hard-coded, machine-specific path; the file reads shown in this
# script use absolute paths, so confirm whether this call is still needed.
setwd("/Users/ewamble/Desktop/Tsion/NCI_MD/Data/patient_pop_data")

read in data sets

# Load the original study workbook.
NCI_OG_Study <- read_excel(
  path = "/Users/ewamble/Desktop/Tsion/NCI_MD/Data/patient_pop_data/NCI_MD_OG_data.xlsx"
)
# Load the West African ancestry workbook for the controls.
NCI_Ancestry <- read_excel(
  path = "/Users/ewamble/Desktop/Tsion/NCI_MD/Data/patient_pop_data/WestAfrAncestry_NCIMDcontrols.xlsx"
)

outlier handling from serum proteomics paper (extreme values are capped at the 1st and 99th percentiles rather than removed)

# Cap extreme analyte intensities: within the selected group, values below the
# `tail` quantile are raised to it and values above the (1 - tail) quantile are
# lowered to it. Rows are never dropped. The result is written as a
# tab-delimited text file with a header row.
#
# The workspace is deliberately NOT cleared here: rm(list = ls()) would discard
# the data sets that were read in just before this step.

PROJECT_DIR <- "/Users/ewamble/Desktop/Tsion/NCI_MD/Data/patient_pop_data" # replace this line with your local path

infile <- file.path(PROJECT_DIR, "NCI_MD_OG_data.xlsx")
# NOTE(review): as.matrix() on a data frame that has text columns returns a
# character matrix, and numeric columns go through format() on the way
# (7 significant digits by default) -- confirm that precision is acceptable.
data <- as.matrix(read_excel(infile, sheet = 1))

tail <- 0.01
group <- "controls_only" # one of "cases_only", "controls_only", "all"
# analytes are taken to start at column 21 of the workbook
stopifnot(ncol(data) >= 21)
analyte_cols <- 21:ncol(data)
n_analyte <- length(analyte_cols)

# Row selector for the requested group. as.numeric() tolerates padded text
# such as " 1", and %in% maps a missing case status to FALSE instead of NA.
select <- switch(group,
  cases_only = as.numeric(data[, "case"]) %in% 1,
  controls_only = as.numeric(data[, "case"]) %in% 0,
  all = rep(TRUE, nrow(data)),
  stop("unknown group: ", group, call. = FALSE)
)

for (i_analyte in seq_len(n_analyte)) {
  intensity <- as.numeric(data[select, analyte_cols[i_analyte]])
  # na.rm = TRUE: quantile() stops with an error on missing values otherwise;
  # missing intensities are left as NA by the two assignments below.
  lower_bound <- quantile(intensity, probs = tail, na.rm = TRUE)
  intensity[intensity < lower_bound] <- lower_bound
  upper_bound <- quantile(intensity, probs = 1 - tail, na.rm = TRUE)
  intensity[intensity > upper_bound] <- upper_bound
  data[select, analyte_cols[i_analyte]] <- intensity
}

outfile <- file.path(PROJECT_DIR, "original_data_outliers_removed.txt")
# prepend the column names so the text file carries a header row
output <- rbind(colnames(data), data)
# write() fills by column, hence the transpose to emit one data row per line
write(t(output), ncolumns = ncol(output), file = outfile, sep = "\t")

read in the outlier-adjusted data set

# Load the outlier-adjusted study data.
# NOTE(review): the outlier step above writes
# "original_data_outliers_removed.txt" (tab-delimited), while this reads an
# .xlsx of the same base name -- presumably converted by hand; confirm.
NCI_OG_outrm <- read_excel(
  path = "/Users/ewamble/Desktop/Tsion/NCI_MD/Data/patient_pop_data/original_data_outliers_removed.xlsx"
)

subset data

#TOTAL POPULATION
#analyte columns pulled from the study, defined once so the total, control,
#and cancer tables always carry the same columns.
#"CD70" replaces a second "IL8" entry that sat between "CX3CL1" and "CD5":
#IL8 was listed twice and CD70, which is plotted later, was missing.
analyte_panel <- c(
  "IL8", "TNFRSF9", "IL7", "IL6", "MCP1", "MCP4", "IL18", "CXCL1",
  "MCP2", "PDL1", "CD27", "CX3CL1", "CD70", "CD5", "MMP7", "MMP12",
  "IL12", "CSF1", "ARG1", "IL4", "IL5", "CD28", "NOS3", "CD4",
  "IL10", "PTN", "IL12RB1", "VEGFC", "MCP3", "CXCL5", "CXCL11", "PDL2"
)

#analytes for everyone; rows with any missing analyte are dropped
analytes_tp <- na.omit(NCI_OG_outrm[, analyte_panel])

#split by case status (0 = control, 1 = prostate cancer)
control_pop <- subset(NCI_OG_outrm, case == 0)
cancer_pop <- subset(NCI_OG_outrm, case == 1)

#analytes for controls and for cancer cases, complete rows only
analytes_contp <- na.omit(control_pop[, analyte_panel])
analytes_cancp <- na.omit(cancer_pop[, analyte_panel])

#prostate cancer status (PCa)
#self reported race (European American, African American, and African)
#race_num codes: 1 = European American, 2 = African American, 3 = African
euro_sr <- subset(control_pop, race_num == 1)
afro_am_sr <- subset(control_pop, race_num == 2)
#filter on race_num like the other groups; `race` holds text labels
#("African", ...), so comparing it with 3 never matches any row
afro_sr <- subset(control_pop, race_num == 3)
pca_euro_sr <- subset(cancer_pop, race_num == 1)
pca_afro_am_sr <- subset(cancer_pop, race_num == 2)
pca_afro_sr <- subset(cancer_pop, race_num == 3)

#diabetic status (diabetes: 0 = no, 1 = yes), controls first, then PCa cases
normal <- dplyr::filter(control_pop, diabetes == 0)
diabetic <- dplyr::filter(control_pop, diabetes == 1)
pca_normal <- dplyr::filter(cancer_pop, diabetes == 0)
pca_diabetic <- dplyr::filter(cancer_pop, diabetes == 1)

#bmi status: split at bmi 30 (below 30 vs. 30 and above)
normal_bmi <- dplyr::filter(control_pop, bmi < 30)
obesity_bmi <- dplyr::filter(control_pop, bmi >= 30)
pca_normal_bmi <- dplyr::filter(cancer_pop, bmi < 30)
pca_obesity_bmi <- dplyr::filter(cancer_pop, bmi >= 30)

check data

#generate one boxplot per analyte table to identify outliers
boxplot(analytes_tp, main = "total population boxplot")

boxplot(analytes_contp, main = "control population boxplot")

#title typo fixed ("poopulation" -> "population")
boxplot(analytes_cancp, main = "prostate cancer population boxplot")

race specific subset

#SELF-REPORTED RACE STUDY
#diabetic status within each self-reported race group
#(EA = European American, AA = African American, AFR = African;
# "pca" marks the prostate cancer population)
EA_normal <- dplyr::filter(euro_sr, diabetes == 0)
EA_diabetic <- dplyr::filter(euro_sr, diabetes == 1)
EA_pca_normal <- dplyr::filter(pca_euro_sr, diabetes == 0)
EA_pca_diabetic <- dplyr::filter(pca_euro_sr, diabetes == 1)

AA_normal <- dplyr::filter(afro_am_sr, diabetes == 0)
AA_diabetic <- dplyr::filter(afro_am_sr, diabetes == 1)
AA_pca_normal <- dplyr::filter(pca_afro_am_sr, diabetes == 0)
AA_pca_diabetic <- dplyr::filter(pca_afro_am_sr, diabetes == 1)

AFR_normal <- dplyr::filter(afro_sr, diabetes == 0)
AFR_diabetic <- dplyr::filter(afro_sr, diabetes == 1)
AFR_pca_normal <- dplyr::filter(pca_afro_sr, diabetes == 0)
AFR_pca_diabetic <- dplyr::filter(pca_afro_sr, diabetes == 1)

#bmi status within each self-reported race group, split at bmi 30
EA_normal_bmi <- dplyr::filter(euro_sr, bmi < 30)
EA_obesity_bmi <- dplyr::filter(euro_sr, bmi >= 30)
EA_pca_normal_bmi <- dplyr::filter(pca_euro_sr, bmi < 30)
EA_pca_obesity_bmi <- dplyr::filter(pca_euro_sr, bmi >= 30)

AA_normal_bmi <- dplyr::filter(afro_am_sr, bmi < 30)
AA_obesity_bmi <- dplyr::filter(afro_am_sr, bmi >= 30)
AA_pca_normal_bmi <- dplyr::filter(pca_afro_am_sr, bmi < 30)
AA_pca_obesity_bmi <- dplyr::filter(pca_afro_am_sr, bmi >= 30)

AFR_normal_bmi <- dplyr::filter(afro_sr, bmi < 30)
AFR_obesity_bmi <- dplyr::filter(afro_sr, bmi >= 30)
AFR_pca_normal_bmi <- dplyr::filter(pca_afro_sr, bmi < 30)
AFR_pca_obesity_bmi <- dplyr::filter(pca_afro_sr, bmi >= 30)

establish comparisons for stats test

# Pairwise group contrasts handed to stat_compare_means(); each entry names
# two self-reported race groups.
my_comparisons <- list(
  c("African", "African American"),
  c("African American", "European American"),
  c("African", "European American")
)

Visualization

Here we compare biomarker levels across self-reported race groups in the control population using violin plots.

IL8

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: IL8 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = IL8, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 0, label = "n = 654") +
  annotate("text", x = 2, y = 0, label = "n = 374") +
  annotate("text", x = 3, y = 0, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 25, method = "anova") +
  #remove legend
  theme(legend.position = "none")

TNFRSF9

#group sizes by self-reported race in the control population
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: TNFRSF9 by self-reported race (violin with an inner boxplot)
control_pop %>%
  ggplot(aes(x = factor(race), y = TNFRSF9, fill = race)) +
  geom_violin(trim = FALSE) +
  labs(x = "") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #dashed line at each group's mean
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = .75, linetype = "dashed"
  ) +
  #group sizes printed near the bottom of the panel
  annotate("text", x = 1, y = 0, label = "n = 654") +
  annotate("text", x = 2, y = 0, label = "n = 374") +
  annotate("text", x = 3, y = 0, label = "n = 454") +
  #pairwise t-test p-values for the listed comparisons
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #global one-way ANOVA p-value
  stat_compare_means(label.y = 15, method = "anova") +
  #hide the legend
  theme(legend.position = "none")

IL7

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: IL7 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = IL7, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 0, label = "n = 654") +
  annotate("text", x = 2, y = 0, label = "n = 374") +
  annotate("text", x = 3, y = 0, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 15, method = "anova") +
  #remove legend
  theme(legend.position = "none")

IL6

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: IL6 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = IL6, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 0, label = "n = 654") +
  annotate("text", x = 2, y = 0, label = "n = 374") +
  annotate("text", x = 3, y = 0, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 25, method = "anova") +
  #remove legend
  theme(legend.position = "none")

MCP1

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: MCP1 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = MCP1, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 7, label = "n = 654") +
  annotate("text", x = 2, y = 7, label = "n = 374") +
  annotate("text", x = 3, y = 7, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 20, method = "anova") +
  #remove legend
  theme(legend.position = "none")

IL18

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: IL18 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = IL18, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 5, label = "n = 654") +
  annotate("text", x = 2, y = 5, label = "n = 374") +
  annotate("text", x = 3, y = 5, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 17, method = "anova") +
  #remove legend
  theme(legend.position = "none")

CXCL1

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: CXCL1 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = CXCL1, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 7, label = "n = 654") +
  annotate("text", x = 2, y = 7, label = "n = 374") +
  annotate("text", x = 3, y = 7, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 17, method = "anova") +
  #remove legend
  theme(legend.position = "none")

MCP2

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: MCP2 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = MCP2, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 5, label = "n = 654") +
  annotate("text", x = 2, y = 5, label = "n = 374") +
  annotate("text", x = 3, y = 5, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 16, method = "anova") +
  #remove legend
  theme(legend.position = "none")

PDL1

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: PDL1 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = PDL1, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 3, label = "n = 654") +
  annotate("text", x = 2, y = 3, label = "n = 374") +
  annotate("text", x = 3, y = 3, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 15, method = "anova") +
  #remove legend
  theme(legend.position = "none")

CD27

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: CD27 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = CD27, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 5, label = "n = 654") +
  annotate("text", x = 2, y = 5, label = "n = 374") +
  annotate("text", x = 3, y = 5, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 15, method = "anova") +
  #remove legend
  theme(legend.position = "none")

CX3CL1

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: CX3CL1 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = CX3CL1, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 5, label = "n = 654") +
  annotate("text", x = 2, y = 5, label = "n = 374") +
  annotate("text", x = 3, y = 5, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 13, method = "anova") +
  #remove legend
  theme(legend.position = "none")

CD70

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: CD70 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = CD70, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 1, label = "n = 654") +
  annotate("text", x = 2, y = 1, label = "n = 374") +
  annotate("text", x = 3, y = 1, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 10, method = "anova") +
  #remove legend
  theme(legend.position = "none")

CD5

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: CD5 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = CD5, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 2, label = "n = 654") +
  annotate("text", x = 2, y = 2, label = "n = 374") +
  annotate("text", x = 3, y = 2, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 12, method = "anova") +
  #remove legend
  theme(legend.position = "none")

MMP7

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: MMP7 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = MMP7, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 7, label = "n = 654") +
  annotate("text", x = 2, y = 7, label = "n = 374") +
  annotate("text", x = 3, y = 7, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 17, method = "anova") +
  #remove legend
  theme(legend.position = "none")

MMP12

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: MMP12 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = MMP12, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 0, label = "n = 654") +
  annotate("text", x = 2, y = 0, label = "n = 374") +
  annotate("text", x = 3, y = 0, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 15, method = "anova") +
  #remove legend
  theme(legend.position = "none")

IL12

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: IL12 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = IL12, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 0, label = "n = 654") +
  annotate("text", x = 2, y = 0, label = "n = 374") +
  annotate("text", x = 3, y = 0, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 15, method = "anova") +
  #remove legend
  theme(legend.position = "none")

CSF1

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: CSF1 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = CSF1, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 7, label = "n = 654") +
  annotate("text", x = 2, y = 7, label = "n = 374") +
  annotate("text", x = 3, y = 7, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 11, method = "anova") +
  #remove legend
  theme(legend.position = "none")

ARG1

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: ARG1 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = ARG1, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 3, label = "n = 654") +
  annotate("text", x = 2, y = 3, label = "n = 374") +
  annotate("text", x = 3, y = 3, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 10, method = "anova") +
  #remove legend
  theme(legend.position = "none")

IL4

#number of individuals in each self-reported race group (control population)
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

#plot: IL4 by self-reported race (violin with an inner boxplot)
ggplot(control_pop, aes(x = factor(race), y = IL4, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")) +
  #add a dashed line at the mean of each group
  stat_summary(geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
               width = .75, linetype = "dashed") +
  #add the number of observations (from the count above) near the bottom
  annotate("text", x = 1, y = 0, label = "n = 654") +
  annotate("text", x = 2, y = 0, label = "n = 374") +
  annotate("text", x = 3, y = 0, label = "n = 454") +
  #add pairwise t-test p-values; `method` is the argument that selects the
  #test -- without it stat_compare_means() uses its default (Wilcoxon)
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  #add the global one-way ANOVA p-value
  stat_compare_means(label.y = 8, method = "anova") +
  #remove legend
  theme(legend.position = "none")

IL5

#number of individuals
control_pop %>%
  count(race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of IL5 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = IL5, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 12, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

CD28

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of CD28 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = CD28, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 1,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 6, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

NOS3

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of NOS3 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = NOS3, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 5, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

CD4

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of CD4 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = CD4, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 6, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

IL10

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of IL10 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = IL10, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 12, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

PTN

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of PTN by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = PTN, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 10, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

IL12RB1

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of IL12RB1 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = IL12RB1, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 8, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

VEGFC

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of VEGFC by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = VEGFC, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 8, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

MCP3

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of MCP3 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = MCP3, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 20, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

CXCL5

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of CXCL5 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = CXCL5, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 5,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 22, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

CXCL11

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of CXCL11 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = CXCL11, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 2,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 20, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

PDL2

# Tally the rows of control_pop within each `race` group
count(control_pop, race)

## # A tibble: 3 × 2
##   race                  n
##   <chr>             <int>
## 1 African             654
## 2 African American    374
## 3 European American   454

# Violin + box plot of PDL2 by race group in control_pop
ggplot(control_pop, aes(x = factor(race), y = PDL2, fill = race)) +
  geom_violin(trim = FALSE) +
  xlab("") +
  theme_minimal() +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(
    values = c("cadetblue3", "deepskyblue4", "darkolivegreen2")
  ) +
  # Dashed horizontal line at each group's mean (errorbar with min = max = mean)
  stat_summary(
    geom = "errorbar", fun.min = mean, fun = mean, fun.max = mean,
    width = 0.75, linetype = "dashed"
  ) +
  # Group sizes at the bottom of the panel; values are hard-coded to match the
  # count(race) tally of control_pop and must be updated if the data change
  annotate(
    "text", x = 1:3, y = 0,
    label = c("n = 654", "n = 374", "n = 454")
  ) +
  # Pairwise t-test p-values for each pair in my_comparisons. The test must be
  # requested through `method`; `t.test = "t.test"` is not an argument of
  # stat_compare_means(), so the default (Wilcoxon) test was being drawn.
  stat_compare_means(comparisons = my_comparisons, method = "t.test") +
  # Global one-way ANOVA p-value across all groups
  stat_compare_means(label.y = 12, method = "anova") +
  # Remove legend (fill duplicates the x axis)
  theme(legend.position = "none")

More Visualization

Here we compare biomarker levels across ancestry groups in the control population.

NCI_MD_STUDY

Ezekiel Wamble

2025-02-24