library(omicade4)
## Loading required package: ade4
library(mogsa)
library(RSpectra)
# library(lubridate)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:mogsa':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cowplot)
library(ggplot2)
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/harmonized")
# Read in metadata
meta.2020<-read.table('clinical_metadata.2020.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
meta.2021<-read.table('clinical_metadata.2021.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
files from imputed data is already normalised so we don’t need to normalise them
# imputed_dir is path to local drive where data is saved
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/imputed")
# Import imputed datasets
rnaseq_baseline_mat_imputed_20 <- read.csv('rnaseq_baseline_mat_imputed_20_051022.csv',row.names=1)
cytof_baseline_mat_imputed_20 <- read.csv('cytof_baseline_mat_imputed_20_051022.csv',row.names=1)
olink_baseline_mat_imputed_20 <- read.csv('olink_baseline_mat_imputed_20_051022.csv',row.names=1)
abtiters_baseline_mat_imputed_20 <- read.csv('abtiters_baseline_mat_imputed_20_051022.csv',row.names=1)
rnaseq_baseline_mat_imputed_21 <- read.csv('rnaseq_baseline_mat_imputed_21_051022.csv',row.names=1)
cytof_baseline_mat_imputed_21 <- read.csv('cytof_baseline_mat_imputed_21_051022.csv',row.names=1)
olink_baseline_mat_imputed_21 <- read.csv('olink_baseline_mat_imputed_21_051022.csv',row.names=1)
abtiters_baseline_mat_imputed_21 <- read.csv('abtiters_baseline_mat_imputed_21_051022.csv',row.names=1)
colnames(cytof_baseline_mat_imputed_21)
## [1] "ASCs..Plasmablasts." "Basophils"
## [3] "Bcells" "CD3.Tcells"
## [5] "CD3CD19" "CD3CD19neg"
## [7] "CD4Tcells" "CD8Tcells"
## [9] "Classical_Monocytes" "Intermediate_Monocytes"
## [11] "Monocytes" "NK"
## [13] "NaiveCD4" "NaiveCD8"
## [15] "Non.Classical_Monocytes" "TcmCD4"
## [17] "TcmCD8" "TemCD4"
## [19] "TemCD8" "TemraCD4"
## [21] "TemraCD8" "pDC"
tasks_seq<-c('ENSG00000277632','ENSG00000136244','ENSG00000100906','ENSG00000229807')
names(rnaseq_baseline_mat_imputed_20[tasks_seq])
## [1] "ENSG00000277632" "ENSG00000136244" "ENSG00000100906" "ENSG00000229807"
distPlot <- function(col, df){
p <-
ggplot(df) +
aes_string(col)
if(is.numeric(df[[col]])) {
p <- p + geom_density()
} else {
p <- p + geom_bar()
}
}
distPlots <- lapply(tasks_seq, distPlot, df=rnaseq_baseline_mat_imputed_20)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot_grid(plotlist = distPlots)
# Get age at boost
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:cowplot':
##
## stamp
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
meta.2020$date_of_boost<-parse_date_time(meta.2020$date_of_boost,"ymd")
meta.2020$year_of_birth<-parse_date_time(meta.2020$year_of_birth,"ymd")
meta.2020$age_at_boost<- as.numeric(round(difftime(meta.2020$date_of_boost,
meta.2020$year_of_birth,units="weeks")/52,2))
meta.2021$date_of_boost<-parse_date_time(meta.2021$date_of_boost,"ymd")
meta.2021$year_of_birth<-parse_date_time(meta.2021$year_of_birth,"ymd")
meta.2021$age_at_boost<- as.numeric(round(difftime(meta.2021$date_of_boost,
meta.2021$year_of_birth,units="weeks")/52,2))
meta <- rbind(meta.2020[c("age_at_boost", "infancy_vac", "biological_sex")], meta.2021[c("age_at_boost", "infancy_vac", "biological_sex")])
meta$infancy_vac <- as.numeric(meta$infancy_vac)
meta$biological_sex <- as.numeric(meta$biological_sex)
colnames(meta)
## [1] "age_at_boost" "infancy_vac" "biological_sex"
library(DBI)
library(RPostgreSQL)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(dplyr)
library(readr)
dsn_database = "cmipb_v4_0"
dsn_hostname = "cmi-pb.lji.org"
dsn_port = "5432"
dsn_uid = "cmipb"
dsn_pwd = "b5mq62vW7JE2YUwq"
tryCatch({
drv <- dbDriver("PostgreSQL")
print("Connecting to Database…")
connec <- dbConnect(drv,
dbname = dsn_database,
host = dsn_hostname,
port = dsn_port,
user = dsn_uid,
password = dsn_pwd)
print("Database Connected!")
},
error=function(cond) {
print("Unable to connect to Database.")
})
## [1] "Connecting to Database…"
## [1] "Database Connected!"
# dbListTables(connec)
library(tibble)
library(tidyverse)
## ── Attaching core tidyverse packages ─────────────────── tidyverse 2.0.0.9000 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks mogsa::combine()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ lubridate::stamp() masks cowplot::stamp()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# library(lubridate)
exposDf <- dbGetQuery(connec, "SELECT * FROM immune_exposure")
ifelse(exposDf$event_start==exposDf$event_end,"Yes","No")
## [1] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [13] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [25] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [37] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [49] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [61] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [73] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [85] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [97] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [109] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [121] "Yes"
exposDf %>% count(event_type)
## event_type n
## 1 bronchitis 1
## 2 covid 36
## 3 dtap 1
## 4 hib 2
## 5 hpv 11
## 6 influenza 32
## 7 japanese encephalitis 1
## 8 meningococcal 8
## 9 mononucleosis 1
## 10 pneumococcal 1
## 11 pneumonia 3
## 12 polio 10
## 13 strep throat 4
## 14 tick-borne encephalitis 1
## 15 typhoid 1
## 16 varicella 7
## 17 whooping cough 1
exposDf$exposure_material_type <- as.character(as.numeric(as.factor(exposDf$exposure_material_type)))
exposDf$event_start <- as.character(as.numeric(as.factor(exposDf$event_start)))
#
exposDf <- dplyr::filter(exposDf, event_type %in% c('covid', 'influenza'))
exposDf1 <- exposDf[, c('subject_id', 'event_type','exposure_material_type', 'event_start')] %>%
pivot_wider( names_from = event_type,
values_from = c(event_start, exposure_material_type),
values_fn= toString,
values_fill='Not known'
)
colnames(exposDf1)
## [1] "subject_id" "event_start_influenza"
## [3] "event_start_covid" "exposure_material_type_influenza"
## [5] "exposure_material_type_covid"
typeof(exposDf1)
## [1] "list"
exposDf2 <- as.data.frame(exposDf1)
for (col in colnames(exposDf1)[2:length(colnames(exposDf1))]){
exposDf2 <- exposDf1[!grepl(',', exposDf1[[col]]), ]
}
exposDf2 <- exposDf2%>% mutate(across(colnames(exposDf2), as.factor))
exposDf2 <- exposDf2%>% mutate(across(colnames(exposDf2), as.numeric))
exposDf2
## # A tibble: 51 × 5
## subject_id event_start_influenza event_start_covid exposure_material_type_i…¹
## <dbl> <dbl> <dbl> <dbl>
## 1 3 1 1 1
## 2 38 1 1 1
## 3 41 3 1 1
## 4 46 2 1 1
## 5 44 2 1 1
## 6 2 1 3 1
## 7 4 2 3 1
## 8 5 1 3 1
## 9 6 1 3 1
## 10 7 3 3 1
## # ℹ 41 more rows
## # ℹ abbreviated name: ¹​exposure_material_type_influenza
## # ℹ 1 more variable: exposure_material_type_covid <dbl>
# nullToNA <- function(x) {
# x[sapply(x, is.null)] <- 'Not known'
# return(x)
# }
# z=exposDf3[2][1]
# exposDf3 <- lapply(exposDf2, nullToNA)
# exposDf3.df <- t(do.call(rbind,lapply(exposDf3, rbind)))
#
# # z= dim(exposDf3.df)[1]
# # print(dim(exposDf3.df))
# # print(exposDf3.df[[2]][2])
# # exposDf4 <- as.data.frame(matrix(unlist(exposDf3.df),nrow=dim(exposDf3.df)[1],byrow=TRUE)
# exposDf4 <- as.data.frame(matrix(unlist(exposDf3.df),nrow=dim(exposDf3.df)[1],byrow=FALSE))
# # z= typeof(exposDf4)
# colnames(exposDf4) <- names(exposDf3)
# #
# # z=exposDf3.df$subject_id
# #
# Get the y data
dataY = read.table('/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/yData_task_matrix.common_names.mfi_raw.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
dataY <- dataY[c("IgG.PT.day14", "ENSG00000277632.day3", "Monocytes.day1")]
colnames(dataY) <- c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")
dataY$IgG.PT.day14 <- log2(dataY[,'IgG.PT.day14']+1)
dataY$CCL3.day3 <- log2(dataY[,'CCL3.day3']+1)
dataY$subject_id <- rownames(dataY)
colnames(dataY)
## [1] "IgG.PT.day14" "CCL3.day3" "Monocytes.day1" "subject_id"
distPlot <- function(col, df){
p <-
ggplot(df) +
aes_string(col)
if(is.numeric(df[[col]])) {
p <- p + geom_density()
} else {
p <- p + geom_bar()
}
}
typeof(dataY)
## [1] "list"
#
distPlots <- lapply(c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1"), distPlot, df=dataY)
plot_grid(plotlist = distPlots)
## Warning: Removed 4 rows containing non-finite values (`stat_density()`).
## Warning: Removed 22 rows containing non-finite values (`stat_density()`).
## Warning: Removed 41 rows containing non-finite values (`stat_density()`).
# df = read.table('/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/harmonized/task_matrix.common_names.mfi_normalised.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
#
# distPlot <- function(col, df){
# p <-
# ggplot(df) +
# aes_string(col)
#
# if(is.numeric(df[[col]])) {
# p <- p + geom_density()
#
# } else {
# p <- p + geom_bar()
# }
# }
#
# typeof(df)
# colnames(df)
# #
# distPlots <- lapply(c("IgG.PT_day14", "CCL3_day3", "Monocytes_day1"), distPlot, df=df)
# plot_grid(plotlist = distPlots)
Reading BioAge Vaccine response to Influenza genes
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/")
rnaDf1 <- read.csv('RNAseq_age_influenzaVaccine_markers.csv')
print(tasks_seq[-c(2)])
## [1] "ENSG00000277632" "ENSG00000100906" "ENSG00000229807"
rnaDf2 <- rbind(rnaseq_baseline_mat_imputed_20[tasks_seq[-c(2)]], rnaseq_baseline_mat_imputed_21[tasks_seq[-c(2)]])
rnaDf2["subject_id"] <- row.names(rnaDf2)
colnames(rnaDf2)[which(names(rnaDf2) == tasks_seq[1])] <- "CCL3"
# colnames(rnaDf2)[which(names(rnaDf2) == tasks_seq[2])] <- "IL6"
colnames(rnaDf2)[which(names(rnaDf2) == tasks_seq[3])] <- "NFKBIA"
colnames(rnaDf2)[which(names(rnaDf2) == tasks_seq[4])] <- "XIST"
rnaDf3 <- merge(rnaDf1, rnaDf2, by='subject_id', all=T)
row.names(rnaDf3) <- rnaDf3[,'subject_id']
rnaDf = select(rnaDf3, -c('subject_id'))
colnames(rnaseq_baseline_mat_imputed_20)[which(names(rnaseq_baseline_mat_imputed_20) == "ENSG00000277632")] <- "CCL3"
colnames(rnaseq_baseline_mat_imputed_21)[which(names(rnaseq_baseline_mat_imputed_21) == "ENSG00000277632")] <- "CCL3"
rnaDf <- rbind(rnaseq_baseline_mat_imputed_20["CCL3"], rnaseq_baseline_mat_imputed_21["CCL3"])
abtiterDf <- rbind(abtiters_baseline_mat_imputed_20[c("IgG.FHA", "IgG.PRN", "IgG.PT")], abtiters_baseline_mat_imputed_21[c("IgG.FHA", "IgG.PRN", "IgG.PT")])
cytofDf <- rbind(cytof_baseline_mat_imputed_20["Monocytes"], cytof_baseline_mat_imputed_21["Monocytes"])
dataDf1 <- merge(rnaDf, abtiterDf, by='row.names', all=T)
colnames(dataDf1)[1] <- "subject_id"
dataDf2 <- merge(cytofDf, meta, by='row.names', all=T)
colnames(dataDf2)[1] <- "subject_id"
dataX1 <- merge(dataDf1, dataDf2, by='subject_id', all=T)
dataX <- merge(dataX1, exposDf2, by='subject_id', all=T)
# dataX <- merge(dataDf1, dataDf2, by='subject_id', all=T)
# colnames(dataX)[1] <- "subject_id"
#
dataDf <- merge(dataX, dataY, by='subject_id', all=T)
rownames(dataDf) <- dataDf$subject_id
dataDf
## subject_id CCL3 IgG.FHA IgG.PRN IgG.PT Monocytes
## 1 1 5.333531 5.13138189 1.84893838 2.24397113 7.5260062
## 10 10 4.101398 0.14364571 0.09320618 0.04154169 13.4642547
## 11 11 4.542939 2.47376114 3.18404386 1.77698271 10.7905171
## 12 12 NA NA NA NA NA
## 13 13 3.399718 1.07740175 1.12798355 1.36806068 5.0304685
## 14 14 NA NA NA NA NA
## 15 15 4.835874 1.14771363 1.03058953 0.38100554 15.6373679
## 16 16 NA NA NA NA NA
## 17 17 4.512669 1.78634624 0.48701666 3.45553366 14.4574172
## 18 18 4.083384 3.51822961 1.56843058 1.68734290 0.9568001
## 19 19 4.459300 1.04292223 1.84192155 2.16297720 13.5202756
## 2 2 NA NA NA NA NA
## 20 20 3.963474 1.53996887 0.59107133 2.55022450 21.3991621
## 21 21 3.823138 3.93012107 1.68057253 2.77448207 15.2266453
## 22 22 4.462314 2.14113953 1.30342083 2.16393744 11.9355890
## 23 23 4.421223 1.68946851 0.43547450 0.40194862 11.7041207
## 24 24 3.106516 0.60815892 1.06497238 0.26992483 11.5201600
## 25 25 4.042732 0.34555009 0.80002675 0.74967184 13.4709985
## 26 26 4.735685 0.98270780 1.12156687 0.72685734 5.5320667
## 27 27 3.782618 1.68013778 0.84721352 0.77080873 13.8965682
## 28 28 NA NA NA NA NA
## 29 29 4.447249 2.58070095 2.87788355 0.99688351 16.0947648
## 3 3 4.599972 1.06795335 3.11313957 1.06789053 13.0245725
## 30 30 NA NA NA NA NA
## 31 31 4.693487 0.40767306 2.02769318 0.16545091 4.8790233
## 32 32 5.779549 0.80344564 1.30960268 1.40368966 13.5492381
## 33 33 5.311794 0.66955298 0.24197417 0.63824158 7.6518328
## 34 34 NA NA NA NA NA
## 35 35 5.434762 1.75928512 1.23573946 1.19302502 13.5420630
## 36 36 3.016496 2.00337580 2.33488694 1.04080942 11.2437986
## 37 37 NA NA NA NA NA
## 38 38 3.017922 0.81897225 0.56487492 0.77149215 13.7721006
## 39 39 NA NA NA NA NA
## 4 4 4.094574 1.03441003 2.73777402 1.60723380 5.4908380
## 40 40 NA NA NA NA NA
## 41 41 NA NA NA NA NA
## 42 42 5.230280 0.86883965 2.56022738 1.11654738 10.6095832
## 43 43 3.871351 1.01708739 1.30963816 1.00310977 13.6337391
## 44 44 4.518409 0.52901785 0.72866760 0.92865661 23.1032762
## 45 45 NA NA NA NA NA
## 46 46 NA NA NA NA NA
## 47 47 6.900831 1.56572050 0.54455774 2.65845646 10.1292397
## 48 48 6.731536 0.48194166 1.03140808 0.04154169 14.9597507
## 49 49 NA NA NA NA NA
## 5 5 4.916572 0.11694600 2.64806823 2.26243525 13.1258248
## 50 50 8.091139 0.61246324 0.32074600 0.50769248 9.3613078
## 51 51 NA NA NA NA NA
## 52 52 7.288949 0.11694600 1.56152075 1.97201547 28.5490352
## 53 53 9.867189 1.90423998 0.47059211 4.17711376 12.8294284
## 54 54 NA NA NA NA NA
## 55 55 NA NA NA NA NA
## 56 56 NA NA NA NA NA
## 57 57 NA NA NA NA NA
## 58 58 NA NA NA NA NA
## 59 59 NA NA NA NA NA
## 6 6 4.575554 0.46392532 0.12456085 0.27896948 22.7930283
## 60 60 NA NA NA NA NA
## 61 61 12.042340 3.54131321 0.15131429 1.00000000 12.8005992
## 62 62 6.937333 2.82093049 0.75122163 1.07602222 13.5692650
## 63 63 6.691953 1.95776540 0.11013376 1.79570532 15.3000000
## 64 64 6.760926 2.68882561 0.89982323 2.04705167 12.9000000
## 65 65 7.605079 3.31317657 0.64048964 0.80140188 15.8000000
## 66 66 5.042425 3.74662797 1.43985527 1.32512343 15.8000000
## 67 67 5.889230 2.00974701 0.35723800 0.70618673 34.6000000
## 68 68 5.853946 1.11683339 1.94255138 1.05459788 13.5000000
## 69 69 5.528321 1.65215623 1.21850657 1.07297696 13.1000000
## 7 7 NA NA NA NA NA
## 70 70 5.815166 3.35167692 1.65402214 0.47027882 32.3000000
## 71 71 5.987707 4.65325882 0.87273399 2.90659418 12.2000000
## 72 72 5.880881 0.28729447 2.12336100 2.08840338 19.9000000
## 73 73 4.529196 0.80435967 1.43128987 1.88327381 15.7000000
## 74 74 3.871745 0.62423924 1.60899360 0.52016941 36.3000000
## 75 75 5.678804 3.05443604 1.81011848 0.17736981 14.4559889
## 76 76 4.663800 4.53841221 0.11013376 3.53643892 18.0000000
## 77 77 8.217396 0.06711778 1.06772479 0.69081102 28.8000000
## 78 78 7.151839 0.10084120 0.25375330 0.99519368 29.0000000
## 79 79 6.969300 0.20652117 0.14721857 2.32256773 41.5000000
## 8 8 NA NA NA NA NA
## 80 80 7.178107 0.14117982 1.58147601 0.46841860 21.9000000
## 81 81 10.726589 0.07947224 1.13942560 1.81601157 34.9000000
## 82 82 9.413611 0.96199368 0.94934816 0.76723104 22.5000000
## 83 83 8.588209 1.00000000 2.03087458 1.26356779 32.4000000
## 84 84 7.593122 2.31136374 0.44903137 0.28053529 18.9000000
## 85 85 4.937862 3.32067659 0.92948200 0.33276350 19.7000000
## 86 86 5.362224 0.67511620 0.26301839 0.30403242 23.5000000
## 87 87 5.295650 3.21116798 0.96864376 1.07997538 22.9000000
## 88 88 5.567850 1.48284828 1.33113192 1.04544297 17.6000000
## 89 89 5.130601 0.04523958 0.13834498 0.06807137 15.7000000
## 9 9 4.238481 0.46042437 0.20839284 0.28783862 5.7229578
## 90 90 4.837136 0.74851784 1.63121255 0.33985853 20.6000000
## 91 91 4.140370 1.12281459 0.78254880 0.92749667 25.4000000
## 92 92 4.623047 0.04523958 1.03269446 0.23829881 40.6000000
## 93 93 4.616769 0.12543138 1.60575881 1.57747907 16.3000000
## 94 94 4.785446 0.08254440 1.26047137 2.65824315 26.2000000
## 95 95 4.878235 0.37016694 1.00000000 2.29792327 17.3000000
## 96 96 4.356355 0.19946349 0.92277461 0.76421696 34.2000000
## age_at_boost infancy_vac biological_sex event_start_influenza
## 1 30.80 2 1 4
## 10 34.68 2 1 3
## 11 30.76 2 1 2
## 12 34.68 2 2 4
## 13 19.63 1 2 1
## 14 23.70 2 2 3
## 15 27.71 2 2 3
## 16 29.66 2 1 4
## 17 36.82 2 1 3
## 18 19.73 1 1 3
## 19 22.81 2 2 3
## 2 51.25 2 1 1
## 20 35.78 2 1 3
## 21 33.77 2 2 3
## 22 31.77 2 1 4
## 23 25.82 2 1 4
## 24 24.79 2 1 2
## 25 28.80 2 1 3
## 26 33.85 2 1 4
## 27 19.80 1 1 3
## 28 34.85 2 2 2
## 29 19.80 1 2 1
## 3 33.89 2 1 1
## 30 28.84 2 1 2
## 31 27.83 2 1 3
## 32 19.88 1 2 4
## 33 26.87 2 2 3
## 34 33.93 2 1 4
## 35 25.86 2 2 4
## 36 19.88 1 1 4
## 37 18.91 1 1 2
## 38 19.88 1 1 1
## 39 31.92 2 1 4
## 4 28.76 2 2 2
## 40 22.89 2 1 4
## 41 31.96 2 2 3
## 42 19.92 1 1 4
## 43 18.91 1 1 4
## 44 18.91 1 1 2
## 45 19.98 1 1 4
## 46 18.91 1 1 2
## 47 20.98 1 1 4
## 48 19.11 1 1 4
## 49 20.11 1 1 4
## 5 25.75 2 2 1
## 50 19.98 1 1 4
## 51 19.98 1 2 4
## 52 19.07 1 2 NA
## 53 19.07 1 1 NA
## 54 20.11 1 1 NA
## 55 20.11 1 1 NA
## 56 20.15 1 1 NA
## 57 21.15 1 1 NA
## 58 20.15 1 1 NA
## 59 20.15 1 1 NA
## 6 28.87 2 1 1
## 60 20.15 1 2 NA
## 61 32.38 2 1 NA
## 62 25.99 2 1 NA
## 63 23.98 2 1 NA
## 64 25.99 2 2 NA
## 65 29.02 2 2 NA
## 66 43.07 2 1 NA
## 67 47.24 2 1 NA
## 68 47.24 2 2 NA
## 69 29.17 2 1 NA
## 7 35.97 2 1 3
## 70 21.15 1 2 NA
## 71 21.15 1 1 NA
## 72 28.25 2 1 NA
## 73 24.23 2 1 NA
## 74 24.23 2 1 NA
## 75 21.22 1 1 NA
## 76 21.22 1 1 NA
## 77 31.32 2 2 NA
## 78 26.30 2 1 NA
## 79 32.32 2 2 NA
## 8 34.27 2 1 3
## 80 27.30 2 1 NA
## 81 26.30 2 2 NA
## 82 21.28 1 1 NA
## 83 20.34 1 1 NA
## 84 22.34 1 1 NA
## 85 19.39 1 1 NA
## 86 21.40 1 1 NA
## 87 19.39 1 2 NA
## 88 19.39 1 2 NA
## 89 22.49 1 1 NA
## 9 20.63 1 2 1
## 90 20.49 1 1 NA
## 91 21.49 1 2 NA
## 92 19.54 1 1 NA
## 93 23.56 1 1 NA
## 94 20.55 1 2 NA
## 95 21.55 1 1 NA
## 96 19.54 1 2 NA
## event_start_covid exposure_material_type_influenza
## 1 1 3
## 10 3 1
## 11 3 1
## 12 1 3
## 13 3 1
## 14 3 1
## 15 3 1
## 16 1 3
## 17 3 1
## 18 3 1
## 19 3 1
## 2 3 1
## 20 3 1
## 21 3 1
## 22 1 3
## 23 1 3
## 24 3 1
## 25 3 1
## 26 1 3
## 27 3 1
## 28 3 1
## 29 3 1
## 3 1 1
## 30 3 1
## 31 3 1
## 32 1 3
## 33 3 1
## 34 1 3
## 35 1 3
## 36 2 3
## 37 2 2
## 38 1 1
## 39 1 3
## 4 3 1
## 40 1 3
## 41 1 1
## 42 1 3
## 43 1 3
## 44 1 1
## 45 1 3
## 46 1 1
## 47 1 3
## 48 2 3
## 49 1 3
## 5 3 1
## 50 1 3
## 51 1 3
## 52 NA NA
## 53 NA NA
## 54 NA NA
## 55 NA NA
## 56 NA NA
## 57 NA NA
## 58 NA NA
## 59 NA NA
## 6 3 1
## 60 NA NA
## 61 NA NA
## 62 NA NA
## 63 NA NA
## 64 NA NA
## 65 NA NA
## 66 NA NA
## 67 NA NA
## 68 NA NA
## 69 NA NA
## 7 3 1
## 70 NA NA
## 71 NA NA
## 72 NA NA
## 73 NA NA
## 74 NA NA
## 75 NA NA
## 76 NA NA
## 77 NA NA
## 78 NA NA
## 79 NA NA
## 8 3 1
## 80 NA NA
## 81 NA NA
## 82 NA NA
## 83 NA NA
## 84 NA NA
## 85 NA NA
## 86 NA NA
## 87 NA NA
## 88 NA NA
## 89 NA NA
## 9 3 1
## 90 NA NA
## 91 NA NA
## 92 NA NA
## 93 NA NA
## 94 NA NA
## 95 NA NA
## 96 NA NA
## exposure_material_type_covid IgG.PT.day14 CCL3.day3 Monocytes.day1
## 1 1 7.6475855 8.531381 NA
## 10 2 2.2695086 7.491853 NA
## 11 2 8.6987531 7.179909 7.257095
## 12 1 6.6128578 NA NA
## 13 2 5.8841566 9.832890 NA
## 14 2 8.0768224 NA NA
## 15 2 6.6651860 8.118941 10.585489
## 16 1 6.4541001 NA NA
## 17 2 7.4085495 8.209453 16.401488
## 18 2 7.5228254 6.643856 NA
## 19 2 7.4194018 7.066089 NA
## 2 2 NA NA NA
## 20 2 7.0689888 8.909893 26.605583
## 21 2 5.1846314 7.900867 34.812168
## 22 1 6.5076306 6.375039 NA
## 23 1 7.0576625 7.066089 NA
## 24 2 4.6224481 7.238405 NA
## 25 2 4.5909272 7.562242 NA
## 26 1 5.5468623 7.219169 16.108508
## 27 2 5.4905601 7.592457 NA
## 28 2 6.8762502 NA NA
## 29 2 6.2496495 8.154818 25.083209
## 3 1 7.0245630 7.888743 NA
## 30 2 5.9932153 NA NA
## 31 2 5.0875933 8.897845 8.545243
## 32 1 8.0965425 9.353147 NA
## 33 2 3.8450834 9.550747 17.703064
## 34 1 7.4372481 NA NA
## 35 1 6.2916862 7.774787 NA
## 36 1 5.4060017 7.584963 17.446750
## 37 1 NA NA NA
## 38 1 6.9326678 5.977280 NA
## 39 1 6.1874734 NA NA
## 4 2 7.1886911 7.066089 7.211965
## 40 1 7.1901774 NA NA
## 41 1 7.6409027 NA NA
## 42 1 7.3543048 8.980140 NA
## 43 1 4.7818220 8.317413 NA
## 44 1 8.2039885 7.209453 35.241054
## 45 1 5.3037468 NA 13.545087
## 46 1 7.6599183 NA 30.018793
## 47 1 8.1390023 13.173521 8.663056
## 48 1 0.6191782 11.965063 18.252658
## 49 1 7.1395468 NA 10.347126
## 5 2 6.6256103 7.554589 NA
## 50 1 5.7517331 13.075145 NA
## 51 1 6.2205920 NA NA
## 52 NA 6.7943897 13.313733 23.512649
## 53 NA 7.7831903 12.686938 NA
## 54 NA 7.4376533 NA NA
## 55 NA 6.9266194 NA 15.334381
## 56 NA 7.6690347 NA NA
## 57 NA 6.3923743 NA NA
## 58 NA 7.8093466 NA NA
## 59 NA 8.4674371 NA NA
## 6 2 7.3965736 7.761551 41.380502
## 60 NA 4.4714296 NA NA
## 61 NA 8.2526654 8.717676 NA
## 62 NA 10.5975417 10.582142 NA
## 63 NA 10.9051623 9.266787 18.700000
## 64 NA 9.1278687 9.333155 13.800000
## 65 NA 10.8445096 11.200899 20.400000
## 66 NA 11.0087787 9.951285 13.900000
## 67 NA 11.0562652 8.826548 31.100000
## 68 NA 10.6132806 8.344296 15.500000
## 69 NA 10.1689229 9.631177 18.900000
## 7 2 6.5497867 NA NA
## 70 NA 7.6596483 8.495855 46.100000
## 71 NA 9.8422738 9.881114 18.200000
## 72 NA 10.7625897 8.243174 27.500000
## 73 NA 10.3994888 8.717676 23.400000
## 74 NA 8.5950614 7.700440 50.000000
## 75 NA 7.1006623 7.924813 NA
## 76 NA 10.2854022 7.686501 22.300000
## 77 NA 8.9872640 10.913637 31.000000
## 78 NA 9.7013065 8.290019 35.700000
## 79 NA 9.9947070 11.197831 52.600000
## 8 2 NA NA NA
## 80 NA 7.9628960 11.810973 26.400000
## 81 NA 9.0161118 11.244958 35.600000
## 82 NA NA 12.988152 20.500000
## 83 NA 8.7729745 7.033423 27.700000
## 84 NA 8.8946701 9.219169 25.100000
## 85 NA 7.4072678 8.479780 18.700000
## 86 NA 9.2672550 9.159871 25.200000
## 87 NA NA 10.062046 22.500000
## 88 NA NA 12.066426 20.200000
## 89 NA 7.0762538 9.377211 16.600000
## 9 2 4.2023938 8.388017 NA
## 90 NA 7.8005768 8.781360 21.600000
## 91 NA 8.9320595 7.459432 40.400000
## 92 NA 8.5651021 8.108524 33.000000
## 93 NA 9.3359486 8.189825 15.000000
## 94 NA 9.9581902 7.139551 39.500000
## 95 NA 10.0764114 7.721099 23.500000
## 96 NA 8.5961898 7.475733 35.000000
###Distination plot
distPlot <- function(col, df){
p <-
ggplot(df) +
aes_string(col)
if(is.numeric(df[[col]])) {
p <- p + geom_density()
} else {
p <- p + geom_bar()
}
}
names(dataDf)[2:length(dataDf)]
## [1] "CCL3" "IgG.FHA"
## [3] "IgG.PRN" "IgG.PT"
## [5] "Monocytes" "age_at_boost"
## [7] "infancy_vac" "biological_sex"
## [9] "event_start_influenza" "event_start_covid"
## [11] "exposure_material_type_influenza" "exposure_material_type_covid"
## [13] "IgG.PT.day14" "CCL3.day3"
## [15] "Monocytes.day1"
distPlots <- lapply(names(dataDf)[2:length(dataDf)], distPlot, df=dataDf)
plot_grid(plotlist = distPlots)
## Warning: Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Warning: Removed 45 rows containing non-finite values (`stat_density()`).
## Removed 45 rows containing non-finite values (`stat_density()`).
## Removed 45 rows containing non-finite values (`stat_density()`).
## Removed 45 rows containing non-finite values (`stat_density()`).
## Warning: Removed 6 rows containing non-finite values (`stat_density()`).
## Warning: Removed 24 rows containing non-finite values (`stat_density()`).
## Warning: Removed 43 rows containing non-finite values (`stat_density()`).
options(warn=-1)
# influenzaMarkers <- c("CD4Tcells", "ALB", "ALT", "CREAT", "FERR", "T4", "CHOL", "HDL", "TRIG", "TNF", "IL-6", "IL-10", "IFN")
# xCols = c(c("IgG.FHA", "IgG.PRN", "IgG.PT", "Monocytes", "CD4Tcells", "age_at_boost", "infancy_vac", "biological_sex", "event_start_influenza", "event_start_covid", "exposure_material_type_influenza", "exposure_material_type_covid"), colnames(rnaDf))
# xCols = c("CCL3", "IgG.FHA", "IgG.PRN", "IgG.PT", "Monocytes", "CD4Tcells", "age_at_boost", "infancy_vac", "biological_sex", "event_start_influenza", "event_start_covid", "exposure_material_type_influenza", "exposure_material_type_covid")
xCols = c("CCL3", "IgG.FHA", "IgG.PRN", "IgG.PT", "Monocytes", "age_at_boost", "infancy_vac", "biological_sex", "event_start_influenza", "event_start_covid", "exposure_material_type_influenza", "exposure_material_type_covid")
# xCols = c("CCL3", "IgG.FHA", "IgG.PRN", "IgG.PT", "Monocytes", "age_at_boost", "infancy_vac", "biological_sex")
yCols = c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")
pred_cor <- data.frame(matrix(nrow=length(yCols), ncol=3))
rownames(pred_cor) <- yCols
colnames(pred_cor) <- c('pearson.cor.pred.true', 'spearman.cor.pred.true', 'ranked.spearman.cor.pred.true')
rownames(dataDf) <- attr(dataDf, "row.names")
print(typeof(row.names(dataDf)))
## [1] "character"
#
for (i in 1:length(yCols)){
all_preds <- c()
all_true <- c()
all_reduced <- c()
set.seed(1)
filteredY <- na.omit(dataDf[yCols[i]])
filteredX <- na.omit(dataDf[xCols])
row_int <- intersect(rownames(filteredY), rownames(filteredX))
for (j in 1:length(row_int)){
train <- row_int[-c(j)]
xData <- filteredX[train, xCols]
yData <- filteredY[train,]
a1= nrow(xData[train,])
a2= nrow(xData[train,])-1
allidx = row_int
predidx = setdiff(allidx, train)
# create lasso model
cvfit_out <- cv.glmnet(x=as.matrix(xData), yData, family='gaussian',
alpha=1, nfolds=(nrow(xData[train,])-1))
preds <- suppressWarnings(predict(cvfit_out, newx = as.matrix(data.frame(filteredX[predidx,])), s='lambda.min'))
all_preds <- c(all_preds, preds)
all_true<- c(all_true, filteredY[predidx, yCols[i]])
}
b = data_frame(all_preds, rank(all_preds,na.last="keep",ties.method="min"), all_true, rank(all_true,na.last="keep",ties.method="min"))
pred_cor[yCols[i],'pearson.cor.pred.true'] <- cor(all_preds,all_true)
pred_cor[yCols[i],'spearman.cor.pred.true'] <- cor(all_preds,all_true, method="spearman")
pred_cor[yCols[i],'ranked.spearman.cor.pred.true'] <- cor(rank(all_preds,na.last="keep",ties.method="min"),rank(all_true,na.last="keep",ties.method="min"), method="spearman")
}
pred_cor
## pearson.cor.pred.true spearman.cor.pred.true
## IgG.PT.day14 0.3234688 0.4221543
## CCL3.day3 0.7345652 0.4118906
## Monocytes.day1 0.3876290 0.4750000
## ranked.spearman.cor.pred.true
## IgG.PT.day14 0.4221543
## CCL3.day3 0.4118906
## Monocytes.day1 0.4750000
typeof(pred_cor)
## [1] "list"
Consider only choosing models for follow-on analysis that show good correlation scores
size <- length(yCols)
all_models_coef<-vector(mode='list',length=size)
all_models_names<-vector(mode='list',length=size)
all_models<-vector(mode='list',length=size)
for (i in 1:length(yCols)){
set.seed(1)
filteredY <- na.omit(dataDf[yCols[i]])
filteredX <- na.omit(dataDf[xCols])
row_int <- intersect(rownames(filteredY), rownames(filteredX))
# create lasso model
suppressWarnings(cvfit_out <- cv.glmnet(x=as.matrix(filteredX[row_int,]), as.matrix(filteredY[row_int,]), family='gaussian', alpha=1, nfolds=(nrow(filteredX[row_int,])-1)))
plot(cvfit_out)
all_models_coef[i]=list(coef(cvfit_out, s = 'lambda.min')[coef(cvfit_out, s = 'lambda.min')[,1]!= 0])
all_models_names[i]=list(rownames(coef(cvfit_out, s = 'lambda.min'))[coef(cvfit_out, s = 'lambda.min')[,1]!= 0])
}
## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient
## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient
## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient
names(all_models_coef) <- yCols
names(all_models_names) <- yCols
for (i in 1:size){
all_models[[i]] = data.frame(cbind(all_models_names[[i]],all_models_coef[[i]]))
colnames(all_models[[i]])<-c("Variable","Coefficient")
all_models[[i]]$Coefficient<-as.numeric(all_models[[i]]$Coefficient)
all_models[[i]]$Coefficient=round(all_models[[i]]$Coefficient,3)
all_models[[i]]<-all_models[[i]] %>% arrange(desc(abs(Coefficient)))
}
names(all_models)<-yCols
all_models
## $IgG.PT.day14
## Variable Coefficient
## 1 (Intercept) 6.062
## 2 IgG.PT 0.806
## 3 event_start_covid -0.299
## 4 event_start_influenza -0.148
## 5 IgG.PRN 0.124
##
## $CCL3.day3
## Variable Coefficient
## 1 (Intercept) 4.263
## 2 CCL3 1.092
## 3 infancy_vac -0.764
## 4 event_start_covid 0.152
## 5 IgG.PRN -0.108
##
## $Monocytes.day1
## Variable Coefficient
## 1 (Intercept) 6.227
## 2 Monocytes 0.992
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/rasteh/models")
a= append(list(pred_cor),all_models)
names(a)[1] <- 'pred_cor'
# sink("expanded_allModels_predCor_baseExposure_removedDiplicates_normalized.txt")
# sink("expanded_allModels_predCor_notNormalized.txt")
print(a)
## $pred_cor
## pearson.cor.pred.true spearman.cor.pred.true
## IgG.PT.day14 0.3234688 0.4221543
## CCL3.day3 0.7345652 0.4118906
## Monocytes.day1 0.3876290 0.4750000
## ranked.spearman.cor.pred.true
## IgG.PT.day14 0.4221543
## CCL3.day3 0.4118906
## Monocytes.day1 0.4750000
##
## $IgG.PT.day14
## Variable Coefficient
## 1 (Intercept) 6.062
## 2 IgG.PT 0.806
## 3 event_start_covid -0.299
## 4 event_start_influenza -0.148
## 5 IgG.PRN 0.124
##
## $CCL3.day3
## Variable Coefficient
## 1 (Intercept) 4.263
## 2 CCL3 1.092
## 3 infancy_vac -0.764
## 4 event_start_covid 0.152
## 5 IgG.PRN -0.108
##
## $Monocytes.day1
## Variable Coefficient
## 1 (Intercept) 6.227
## 2 Monocytes 0.992
# sink()
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.