baseModel

library(omicade4)

## Loading required package: ade4

library(mogsa)
library(RSpectra)
# library(lubridate)
library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-7

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:mogsa':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(cowplot)
library(ggplot2)

# Directory holding the harmonized clinical metadata tables. Full paths are
# built with file.path() so the session's working directory is not changed
# just to read two files.
harmonized_dir <- "/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/harmonized"

# Read in metadata (first column becomes the row names; strings are read as
# factors, which later code converts to numeric level codes).
meta.2020 <- read.table(
  file.path(harmonized_dir, "clinical_metadata.2020.tsv"),
  sep = "\t", header = TRUE, stringsAsFactors = TRUE, row.names = 1
)
meta.2021 <- read.table(
  file.path(harmonized_dir, "clinical_metadata.2021.tsv"),
  sep = "\t", header = TRUE, stringsAsFactors = TRUE, row.names = 1
)

The files in the imputed data directory are already normalised, so we don’t need to normalise them again.

# Move into the local folder where the imputed matrices are saved; the
# relative file names below are resolved against it.
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/imputed")

# Imputed baseline matrices from the "_20" files. The first CSV column is
# used as the row names of each table.
rnaseq_baseline_mat_imputed_20 <- read.csv(
  file = "rnaseq_baseline_mat_imputed_20_051022.csv",
  row.names = 1
)
cytof_baseline_mat_imputed_20 <- read.csv(
  file = "cytof_baseline_mat_imputed_20_051022.csv",
  row.names = 1
)
olink_baseline_mat_imputed_20 <- read.csv(
  file = "olink_baseline_mat_imputed_20_051022.csv",
  row.names = 1
)
abtiters_baseline_mat_imputed_20 <- read.csv(
  file = "abtiters_baseline_mat_imputed_20_051022.csv",
  row.names = 1
)

# Same four assays from the "_21" files.
rnaseq_baseline_mat_imputed_21 <- read.csv(
  file = "rnaseq_baseline_mat_imputed_21_051022.csv",
  row.names = 1
)
cytof_baseline_mat_imputed_21 <- read.csv(
  file = "cytof_baseline_mat_imputed_21_051022.csv",
  row.names = 1
)
olink_baseline_mat_imputed_21 <- read.csv(
  file = "olink_baseline_mat_imputed_21_051022.csv",
  row.names = 1
)
abtiters_baseline_mat_imputed_21 <- read.csv(
  file = "abtiters_baseline_mat_imputed_21_051022.csv",
  row.names = 1
)

# List the antibody-titer features available in the "_21" imputed table.
names(abtiters_baseline_mat_imputed_21)

##  [1] "IgG.FHA"     "IgG.PRN"     "IgG.PT"      "IgG1.FHA"    "IgG1.FIM2.3"
##  [6] "IgG1.PRN"    "IgG1.PT"     "IgG2.FHA"    "IgG2.FIM2.3" "IgG2.PRN"   
## [11] "IgG2.PT"     "IgG3.FHA"    "IgG3.FIM2.3" "IgG3.PRN"    "IgG3.PT"    
## [16] "IgG4.FHA"    "IgG4.FIM2.3" "IgG4.PRN"    "IgG4.PT"

# Ensembl gene IDs inspected below; printing the selected column names
# confirms that all of them exist in the "_20" RNA-seq table.
tasks_seq <- c(
  "ENSG00000277632",
  "ENSG00000136244",
  "ENSG00000100906",
  "ENSG00000229807"
)
colnames(rnaseq_baseline_mat_imputed_20[tasks_seq])

## [1] "ENSG00000277632" "ENSG00000136244" "ENSG00000100906" "ENSG00000229807"

# Plot the distribution of a single column of a data frame.
#
# Arguments:
#   col  Name of the column to plot, given as one string.
#   df   Data frame that contains `col`.
#
# Returns a ggplot object: a density curve when `df[[col]]` is numeric,
# otherwise a bar chart of counts.
distPlot <- function(col, df){
  # `.data[[col]]` replaces the deprecated aes_string(); labs() pins the axis
  # title to the plain column name.
  p <- ggplot(df) +
    aes(.data[[col]]) +
    labs(x = col)

  if (is.numeric(df[[col]])) {
    p <- p + geom_density()
  } else {
    p <- p + geom_bar()
  }

  # Return the plot explicitly instead of relying on the invisible value of
  # the last assignment inside the if/else.
  p
}

# One distribution plot per gene in tasks_seq, drawn from the "_20" RNA-seq
# table.
distPlots <- lapply(
  X = tasks_seq,
  FUN = distPlot,
  df = rnaseq_baseline_mat_imputed_20
)

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Arrange the plots collected in distPlots into a single grid figure.
plot_grid(plotlist = distPlots)

# Get age at boost
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:cowplot':
## 
##     stamp

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# 2020 table: parse both dates, then express the elapsed time between them in
# years (weeks divided by 52), rounded to two decimals.
meta.2020$date_of_boost <- parse_date_time(meta.2020$date_of_boost, orders = "ymd")
meta.2020$year_of_birth <- parse_date_time(meta.2020$year_of_birth, orders = "ymd")
meta.2020$age_at_boost <- as.numeric(
  round(
    difftime(meta.2020$date_of_boost, meta.2020$year_of_birth, units = "weeks") / 52,
    2
  )
)

# 2021 table: identical derivation.
meta.2021$date_of_boost <- parse_date_time(meta.2021$date_of_boost, orders = "ymd")
meta.2021$year_of_birth <- parse_date_time(meta.2021$year_of_birth, orders = "ymd")
meta.2021$age_at_boost <- as.numeric(
  round(
    difftime(meta.2021$date_of_boost, meta.2021$year_of_birth, units = "weeks") / 52,
    2
  )
)

# Stack the three covariates of both tables into one data frame.
meta <- rbind(
  meta.2020[c("age_at_boost", "infancy_vac", "biological_sex")],
  meta.2021[c("age_at_boost", "infancy_vac", "biological_sex")]
)

# Convert the two categorical covariates to numeric values (level codes when
# the columns are factors, as produced by stringsAsFactors = TRUE on read).
meta[["infancy_vac"]] <- as.numeric(meta[["infancy_vac"]])
meta[["biological_sex"]] <- as.numeric(meta[["biological_sex"]])
colnames(meta)

## [1] "age_at_boost"   "infancy_vac"    "biological_sex"

Add the exposure table

library(DBI)
library(RPostgreSQL)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack

library(dplyr)
library(readr)

# Connection settings for the PostgreSQL database. Each value can be
# overridden through an environment variable so credentials do not have to be
# edited in the script; the previous literals remain the defaults.
dsn_database <- Sys.getenv("CMIPB_DB_NAME", unset = "cmipb_v4_0")
dsn_hostname <- Sys.getenv("CMIPB_DB_HOST", unset = "cmi-pb.lji.org")
dsn_port <- Sys.getenv("CMIPB_DB_PORT", unset = "5432")
dsn_uid <- Sys.getenv("CMIPB_DB_USER", unset = "cmipb")
dsn_pwd <- Sys.getenv("CMIPB_DB_PASSWORD", unset = "b5mq62vW7JE2YUwq")


# Open the connection and keep it in `connec`. A failure now stops the script
# with the driver's own message instead of printing a generic note and leaving
# `connec` undefined for the queries that follow.
tryCatch({
    drv <- dbDriver("PostgreSQL")
    print("Connecting to Database…")
    connec <- dbConnect(drv,
                 dbname = dsn_database,
                 host = dsn_hostname,
                 port = dsn_port,
                 user = dsn_uid,
                 password = dsn_pwd)
    print("Database Connected!")
    },
    error=function(cond) {
            stop("Unable to connect to Database: ", conditionMessage(cond),
                 call. = FALSE)
    })

## [1] "Connecting to Database…"
## [1] "Database Connected!"

# dbListTables(connec)

library(tibble)
library(tidyverse)

## ── Attaching core tidyverse packages ─────────────────── tidyverse 2.0.0.9000 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()   masks mogsa::combine()
## ✖ tidyr::expand()    masks Matrix::expand()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ tidyr::pack()      masks Matrix::pack()
## ✖ lubridate::stamp() masks cowplot::stamp()
## ✖ tidyr::unpack()    masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# library(lubridate)
# Fetch the complete immune_exposure table through the open connection.
exposDf <- dbGetQuery(conn = connec, statement = "SELECT * FROM immune_exposure")
# Per record: "Yes" when event_start equals event_end, otherwise "No" (printed).
ifelse(exposDf$event_start == exposDf$event_end, yes = "Yes", no = "No")

##   [1] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [13] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [25] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [37] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [49] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [61] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [73] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [85] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [97] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [109] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [121] "Yes"

# Tabulate how many exposure records exist for each event type.
count(exposDf, event_type)

##                 event_type  n
## 1               bronchitis  1
## 2                    covid 36
## 3                     dtap  1
## 4                      hib  2
## 5                      hpv 11
## 6                influenza 32
## 7    japanese encephalitis  1
## 8            meningococcal  8
## 9            mononucleosis  1
## 10            pneumococcal  1
## 11               pneumonia  3
## 12                   polio 10
## 13            strep throat  4
## 14 tick-borne encephalitis  1
## 15                 typhoid  1
## 16               varicella  7
## 17          whooping cough  1

# Replace exposure material type and event start by the character form of
# their factor level codes.
exposDf$exposure_material_type <- as.character(
  as.numeric(as.factor(exposDf$exposure_material_type))
)
exposDf$event_start <- as.character(
  as.numeric(as.factor(exposDf$event_start))
)
#
# Keep only the covid and influenza records.
exposDf <- exposDf %>%
  dplyr::filter(event_type %in% c('covid', 'influenza'))

# Spread to one row per subject_id with one column per (value, event type)
# pair. Repeated records are collapsed with toString(); combinations with no
# record are filled with 'Not known'.
exposDf1 <- pivot_wider(
  exposDf[, c('subject_id', 'event_type', 'exposure_material_type', 'event_start')],
  names_from = event_type,
  values_from = c(event_start, exposure_material_type),
  values_fn = toString,
  values_fill = 'Not known'
)

# Show the column names produced by the pivot.
names(exposDf1)

## [1] "subject_id"                       "event_start_influenza"           
## [3] "event_start_covid"                "exposure_material_type_influenza"
## [5] "exposure_material_type_covid"

# Inspect the underlying storage type of the pivoted table.
typeof(exposDf1)

## [1] "list"

# Build exposDf2: subjects with a single value in every pivoted column,
# with those columns recoded as numeric factor level codes.
#
# The pivot collapses repeated records with toString(), which joins values
# with ", ", so a comma marks a subject with several records for that column.
# The exclusion mask is accumulated over ALL value columns; the previous loop
# restarted from exposDf1 on each iteration, so only the last column's filter
# survived.
value_cols <- colnames(exposDf1)[-1]

has_multiple <- rep(FALSE, nrow(exposDf1))
for (col in value_cols) {
  has_multiple <- has_multiple | grepl(",", exposDf1[[col]], fixed = TRUE)
}
exposDf2 <- exposDf1[!has_multiple, , drop = FALSE]

# Recode only the value columns. The first column (subject_id) is left
# untouched: turning it into factor codes would replace the real subject
# identifiers by ranks and break any later join on subject_id.
exposDf2[value_cols] <- lapply(
  exposDf2[value_cols],
  function(x) as.numeric(as.factor(x))
)
exposDf2

## # A tibble: 51 × 5
##    subject_id event_start_influenza event_start_covid exposure_material_type_i…¹
##         <dbl>                 <dbl>             <dbl>                      <dbl>
##  1          3                     1                 1                          1
##  2         38                     1                 1                          1
##  3         41                     3                 1                          1
##  4         46                     2                 1                          1
##  5         44                     2                 1                          1
##  6          2                     1                 3                          1
##  7          4                     2                 3                          1
##  8          5                     1                 3                          1
##  9          6                     1                 3                          1
## 10          7                     3                 3                          1
## # ℹ 41 more rows
## # ℹ abbreviated name: ¹exposure_material_type_influenza
## # ℹ 1 more variable: exposure_material_type_covid <dbl>

# nullToNA <- function(x) {
#     x[sapply(x, is.null)] <- 'Not known'
#     return(x)
# }
# z=exposDf3[2][1]
# exposDf3 <- lapply(exposDf2, nullToNA)
# exposDf3.df <- t(do.call(rbind,lapply(exposDf3, rbind)))
# 
# # z= dim(exposDf3.df)[1]
# # print(dim(exposDf3.df))
# # print(exposDf3.df[[2]][2])
# # exposDf4 <- as.data.frame(matrix(unlist(exposDf3.df),nrow=dim(exposDf3.df)[1],byrow=TRUE)
# exposDf4 <- as.data.frame(matrix(unlist(exposDf3.df),nrow=dim(exposDf3.df)[1],byrow=FALSE))
# # z= typeof(exposDf4)
# colnames(exposDf4) <- names(exposDf3)
# #
# # z=exposDf3.df$subject_id
# #

All abtiter data have been log2-transformed. Why? Is it to make them approximately normally distributed, since our GLM uses a Gaussian family as its distribution?

# Get the y data: tab-separated task matrix whose first column provides the
# row names.
dataY <- read.table(
  file = '/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/yData_task_matrix.common_names.mfi_raw.tsv',
  header = TRUE,
  sep = '\t',
  row.names = 1,
  stringsAsFactors = TRUE
)

# Keep the three outcome columns and give the gene column a readable name.
dataY <- dataY[c("IgG.PT.day14", "ENSG00000277632.day3", "Monocytes.day1")]
names(dataY) <- c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")
# dataY$IgG.PT.day14 <- log2(dataY[,'IgG.PT.day14']+1)
# dataY$CCL3.day3 <- log2(dataY[,'CCL3.day3']+1)

# Expose the row names as an explicit column so the table can be merged by it.
dataY[["subject_id"]] <- row.names(dataY)
names(dataY)

## [1] "IgG.PT.day14"   "CCL3.day3"      "Monocytes.day1" "subject_id"

# Plot the distribution of a single column of a data frame.
#
# Arguments:
#   col  Name of the column to plot, given as one string.
#   df   Data frame that contains `col`.
#
# Returns a ggplot object: a density curve when `df[[col]]` is numeric,
# otherwise a bar chart of counts.
distPlot <- function(col, df){
  # `.data[[col]]` replaces the deprecated aes_string(); labs() pins the axis
  # title to the plain column name.
  p <- ggplot(df) +
    aes(.data[[col]]) +
    labs(x = col)

  if (is.numeric(df[[col]])) {
    p <- p + geom_density()
  } else {
    p <- p + geom_bar()
  }

  # Return the plot explicitly instead of relying on the invisible value of
  # the last assignment inside the if/else.
  p
}

# Inspect the underlying storage type of the outcome table.
typeof(dataY)

## [1] "list"

# 
# Distribution plots of the three outcome columns, shown together in one grid.
distPlots <- lapply(
  X = c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1"),
  FUN = distPlot,
  df = dataY
)
plot_grid(plotlist = distPlots)

## Warning: Removed 4 rows containing non-finite values (`stat_density()`).

## Warning: Removed 22 rows containing non-finite values (`stat_density()`).

## Warning: Removed 41 rows containing non-finite values (`stat_density()`).

# df = read.table('/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/harmonized/task_matrix.common_names.mfi_normalised.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
# 
# distPlot <- function(col, df){
#   p <- 
#     ggplot(df) +
#     aes_string(col)
# 
#   if(is.numeric(df[[col]])) {
#     p <- p + geom_density()
# 
#   } else {
#     p <- p + geom_bar()
#   }
# }
# 
# typeof(df)
# colnames(df)
# # 
# distPlots <- lapply(c("IgG.PT_day14", "CCL3_day3", "Monocytes_day1"), distPlot, df=df)
# plot_grid(plotlist = distPlots)

# Rename the ENSG00000277632 column to "CCL3" in both RNA-seq tables.
names(rnaseq_baseline_mat_imputed_20)[names(rnaseq_baseline_mat_imputed_20) == "ENSG00000277632"] <- "CCL3"
names(rnaseq_baseline_mat_imputed_21)[names(rnaseq_baseline_mat_imputed_21) == "ENSG00000277632"] <- "CCL3"

# Stack the "_20" and "_21" tables of each assay, keeping only the baseline
# features used as predictors.
rnaDf <- rbind(rnaseq_baseline_mat_imputed_20["CCL3"], rnaseq_baseline_mat_imputed_21["CCL3"])

abtiterDf <- rbind(
  abtiters_baseline_mat_imputed_20[c("IgG.FHA", "IgG.PRN", "IgG.PT")],
  abtiters_baseline_mat_imputed_21[c("IgG.FHA", "IgG.PRN", "IgG.PT")]
)

cytofDf <- rbind(cytof_baseline_mat_imputed_20["Monocytes"], cytof_baseline_mat_imputed_21["Monocytes"])

# Outer joins on the row names (all = TRUE, spelled out because `T` can be
# reassigned); rows absent from one side are kept and filled with NA. The
# row-name key column produced by merge() is renamed to subject_id.
dataDf1 <- merge(rnaDf, abtiterDf, by = "row.names", all = TRUE)
colnames(dataDf1)[1] <- "subject_id"

dataDf2 <- merge(cytofDf, meta, by = "row.names", all = TRUE)
colnames(dataDf2)[1] <- "subject_id"

# dataX1 <- merge(dataDf1, dataDf2, by='subject_id', all=T)
# The key column of this merge is already named subject_id, so no rename is
# needed afterwards.
dataX <- merge(dataDf1, dataDf2, by = "subject_id", all = TRUE)

# dataX <- merge(dataX1, exposDf2, by='subject_id', all=T)
#
# Attach the outcomes and index the combined table by subject_id.
dataDf <- merge(dataX, dataY, by = "subject_id", all = TRUE)
rownames(dataDf) <- dataDf$subject_id

dataDf

##    subject_id      CCL3    IgG.FHA    IgG.PRN     IgG.PT  Monocytes
## 1           1  5.333531 5.13138189 1.84893838 2.24397113  7.5260062
## 10         10  4.101398 0.14364571 0.09320618 0.04154169 13.4642547
## 11         11  4.542939 2.47376114 3.18404386 1.77698271 10.7905171
## 12         12        NA         NA         NA         NA         NA
## 13         13  3.399718 1.07740175 1.12798355 1.36806068  5.0304685
## 14         14        NA         NA         NA         NA         NA
## 15         15  4.835874 1.14771363 1.03058953 0.38100554 15.6373679
## 16         16        NA         NA         NA         NA         NA
## 17         17  4.512669 1.78634624 0.48701666 3.45553366 14.4574172
## 18         18  4.083384 3.51822961 1.56843058 1.68734290  0.9568001
## 19         19  4.459300 1.04292223 1.84192155 2.16297720 13.5202756
## 2           2        NA         NA         NA         NA         NA
## 20         20  3.963474 1.53996887 0.59107133 2.55022450 21.3991621
## 21         21  3.823138 3.93012107 1.68057253 2.77448207 15.2266453
## 22         22  4.462314 2.14113953 1.30342083 2.16393744 11.9355890
## 23         23  4.421223 1.68946851 0.43547450 0.40194862 11.7041207
## 24         24  3.106516 0.60815892 1.06497238 0.26992483 11.5201600
## 25         25  4.042732 0.34555009 0.80002675 0.74967184 13.4709985
## 26         26  4.735685 0.98270780 1.12156687 0.72685734  5.5320667
## 27         27  3.782618 1.68013778 0.84721352 0.77080873 13.8965682
## 28         28        NA         NA         NA         NA         NA
## 29         29  4.447249 2.58070095 2.87788355 0.99688351 16.0947648
## 3           3  4.599972 1.06795335 3.11313957 1.06789053 13.0245725
## 30         30        NA         NA         NA         NA         NA
## 31         31  4.693487 0.40767306 2.02769318 0.16545091  4.8790233
## 32         32  5.779549 0.80344564 1.30960268 1.40368966 13.5492381
## 33         33  5.311794 0.66955298 0.24197417 0.63824158  7.6518328
## 34         34        NA         NA         NA         NA         NA
## 35         35  5.434762 1.75928512 1.23573946 1.19302502 13.5420630
## 36         36  3.016496 2.00337580 2.33488694 1.04080942 11.2437986
## 37         37        NA         NA         NA         NA         NA
## 38         38  3.017922 0.81897225 0.56487492 0.77149215 13.7721006
## 39         39        NA         NA         NA         NA         NA
## 4           4  4.094574 1.03441003 2.73777402 1.60723380  5.4908380
## 40         40        NA         NA         NA         NA         NA
## 41         41        NA         NA         NA         NA         NA
## 42         42  5.230280 0.86883965 2.56022738 1.11654738 10.6095832
## 43         43  3.871351 1.01708739 1.30963816 1.00310977 13.6337391
## 44         44  4.518409 0.52901785 0.72866760 0.92865661 23.1032762
## 45         45        NA         NA         NA         NA         NA
## 46         46        NA         NA         NA         NA         NA
## 47         47  6.900831 1.56572050 0.54455774 2.65845646 10.1292397
## 48         48  6.731536 0.48194166 1.03140808 0.04154169 14.9597507
## 49         49        NA         NA         NA         NA         NA
## 5           5  4.916572 0.11694600 2.64806823 2.26243525 13.1258248
## 50         50  8.091139 0.61246324 0.32074600 0.50769248  9.3613078
## 51         51        NA         NA         NA         NA         NA
## 52         52  7.288949 0.11694600 1.56152075 1.97201547 28.5490352
## 53         53  9.867189 1.90423998 0.47059211 4.17711376 12.8294284
## 54         54        NA         NA         NA         NA         NA
## 55         55        NA         NA         NA         NA         NA
## 56         56        NA         NA         NA         NA         NA
## 57         57        NA         NA         NA         NA         NA
## 58         58        NA         NA         NA         NA         NA
## 59         59        NA         NA         NA         NA         NA
## 6           6  4.575554 0.46392532 0.12456085 0.27896948 22.7930283
## 60         60        NA         NA         NA         NA         NA
## 61         61 12.042340 3.54131321 0.15131429 1.00000000 12.8005992
## 62         62  6.937333 2.82093049 0.75122163 1.07602222 13.5692650
## 63         63  6.691953 1.95776540 0.11013376 1.79570532 15.3000000
## 64         64  6.760926 2.68882561 0.89982323 2.04705167 12.9000000
## 65         65  7.605079 3.31317657 0.64048964 0.80140188 15.8000000
## 66         66  5.042425 3.74662797 1.43985527 1.32512343 15.8000000
## 67         67  5.889230 2.00974701 0.35723800 0.70618673 34.6000000
## 68         68  5.853946 1.11683339 1.94255138 1.05459788 13.5000000
## 69         69  5.528321 1.65215623 1.21850657 1.07297696 13.1000000
## 7           7        NA         NA         NA         NA         NA
## 70         70  5.815166 3.35167692 1.65402214 0.47027882 32.3000000
## 71         71  5.987707 4.65325882 0.87273399 2.90659418 12.2000000
## 72         72  5.880881 0.28729447 2.12336100 2.08840338 19.9000000
## 73         73  4.529196 0.80435967 1.43128987 1.88327381 15.7000000
## 74         74  3.871745 0.62423924 1.60899360 0.52016941 36.3000000
## 75         75  5.678804 3.05443604 1.81011848 0.17736981 14.4559889
## 76         76  4.663800 4.53841221 0.11013376 3.53643892 18.0000000
## 77         77  8.217396 0.06711778 1.06772479 0.69081102 28.8000000
## 78         78  7.151839 0.10084120 0.25375330 0.99519368 29.0000000
## 79         79  6.969300 0.20652117 0.14721857 2.32256773 41.5000000
## 8           8        NA         NA         NA         NA         NA
## 80         80  7.178107 0.14117982 1.58147601 0.46841860 21.9000000
## 81         81 10.726589 0.07947224 1.13942560 1.81601157 34.9000000
## 82         82  9.413611 0.96199368 0.94934816 0.76723104 22.5000000
## 83         83  8.588209 1.00000000 2.03087458 1.26356779 32.4000000
## 84         84  7.593122 2.31136374 0.44903137 0.28053529 18.9000000
## 85         85  4.937862 3.32067659 0.92948200 0.33276350 19.7000000
## 86         86  5.362224 0.67511620 0.26301839 0.30403242 23.5000000
## 87         87  5.295650 3.21116798 0.96864376 1.07997538 22.9000000
## 88         88  5.567850 1.48284828 1.33113192 1.04544297 17.6000000
## 89         89  5.130601 0.04523958 0.13834498 0.06807137 15.7000000
## 9           9  4.238481 0.46042437 0.20839284 0.28783862  5.7229578
## 90         90  4.837136 0.74851784 1.63121255 0.33985853 20.6000000
## 91         91  4.140370 1.12281459 0.78254880 0.92749667 25.4000000
## 92         92  4.623047 0.04523958 1.03269446 0.23829881 40.6000000
## 93         93  4.616769 0.12543138 1.60575881 1.57747907 16.3000000
## 94         94  4.785446 0.08254440 1.26047137 2.65824315 26.2000000
## 95         95  4.878235 0.37016694 1.00000000 2.29792327 17.3000000
## 96         96  4.356355 0.19946349 0.92277461 0.76421696 34.2000000
##    age_at_boost infancy_vac biological_sex IgG.PT.day14 CCL3.day3
## 1         30.80           2              1   199.517666       369
## 10        34.68           2              1     3.821589       179
## 11        30.76           2              1   414.513947       144
## 12        34.68           2              2    96.874274        NA
## 13        19.63           1              2    58.061932       911
## 14        23.70           2              2   269.001265        NA
## 15        27.71           2              2   100.489455       277
## 16        29.66           2              1    86.675397        NA
## 17        36.82           2              1   168.900882       295
## 18        19.73           1              1   182.906088        99
## 19        22.81           2              2   170.183735       133
## 2         51.25           2              1           NA        NA
## 20        35.78           2              1   133.269594       480
## 21        33.77           2              2    35.368851       238
## 22        31.77           2              1    89.989653        82
## 23        25.82           2              1   132.219593       133
## 24        24.79           2              1    23.631765       150
## 25        28.80           2              1    23.099432       188
## 26        33.85           2              1    45.748957       148
## 27        19.80           1              1    43.959688       192
## 28        34.85           2              2   116.478279        NA
## 29        19.80           1              2    75.090769       284
## 3         33.89           2              1   129.197956       236
## 30        28.84           2              1    62.699730        NA
## 31        27.83           2              1    33.003075       476
## 32        19.88           1              2   272.717237       653
## 33        26.87           2              2    13.370949       749
## 34        33.93           2              1   172.314446        NA
## 35        25.86           2              2    77.340488       218
## 36        19.88           1              1    41.400275       191
## 37        18.91           1              1           NA        NA
## 38        19.88           1              1   121.163359        62
## 39        31.92           2              1    71.881127        NA
## 4         28.76           2              2   144.885339       133
## 40        22.89           2              1   145.035713        NA
## 41        31.96           2              2   198.590973        NA
## 42        19.92           1              1   162.631291       504
## 43        18.91           1              1    26.508813       318
## 44        18.91           1              1   293.880891       147
## 45        19.98           1              1    38.499070        NA
## 46        18.91           1              1   201.239119        NA
## 47        20.98           1              1   280.892693      9238
## 48        19.11           1              1     0.536000      3997
## 49        20.11           1              1   139.999552        NA
## 5         25.75           2              2    97.743258       187
## 50        19.98           1              1    52.882062      8629
## 51        19.98           1              2    73.573545        NA
## 52        19.07           1              2   109.997984     10181
## 53        19.07           1              1   219.279332      6593
## 54        20.11           1              1   172.363131        NA
## 55        20.11           1              1   120.652266        NA
## 56        20.15           1              1   202.521120        NA
## 57        21.15           1              1    83.003313        NA
## 58        20.15           1              1   223.309447        NA
## 59        20.15           1              1   352.958671        NA
## 6         28.87           2              1   167.496355       216
## 60        20.15           1              2    21.183724        NA
## 61        32.38           2              1   304.000000       420
## 62        25.99           2              1  1548.451277      1532
## 63        23.98           2              1  1916.701277       615
## 64        25.99           2              2   558.451277       644
## 65        29.02           2              2  1837.750000      2353
## 66        43.07           2              1  2059.500000       989
## 67        47.24           2              1  2128.450062       453
## 68        47.24           2              2  1565.447441       324
## 69        29.17           2              1  1150.200062       792
## 7         35.97           2              1    92.687631        NA
## 70        21.15           1              2   201.201277       360
## 71        21.15           1              1   916.951277       942
## 72        28.25           2              1  1736.250000       302
## 73        24.23           2              1  1349.697441       420
## 74        24.23           2              1   385.697441       207
## 75        21.22           1              1   136.250000       242
## 76        21.22           1              1  1247.000000       205
## 77        31.32           2              2   506.500000      1928
## 78        26.30           2              1   831.500000       312
## 79        32.32           2              2  1019.250000      2348
## 8         34.27           2              1           NA        NA
## 80        27.30           2              1   248.500000      3592
## 81        26.30           2              2   516.750000      2426
## 82        21.28           1              1           NA      8124
## 83        20.34           1              1   436.450062       130
## 84        22.34           1              1   474.951277       595
## 85        19.39           1              1   168.750000       356
## 86        21.40           1              1   615.200062       571
## 87        19.39           1              2           NA      1068
## 88        19.39           1              2           NA      4288
## 89        22.49           1              1   133.947441       664
## 9         20.63           1              2    17.409695       334
## 90        20.49           1              1   221.950062       439
## 91        21.49           1              2   487.447441       175
## 92        19.54           1              1   377.750000       275
## 93        23.56           1              1   645.250000       291
## 94        20.55           1              2   993.750000       140
## 95        21.55           1              1  1078.697441       210
## 96        19.54           1              2   386.000000       177
##    Monocytes.day1
## 1              NA
## 10             NA
## 11       7.257095
## 12             NA
## 13             NA
## 14             NA
## 15      10.585489
## 16             NA
## 17      16.401488
## 18             NA
## 19             NA
## 2              NA
## 20      26.605583
## 21      34.812168
## 22             NA
## 23             NA
## 24             NA
## 25             NA
## 26      16.108508
## 27             NA
## 28             NA
## 29      25.083209
## 3              NA
## 30             NA
## 31       8.545243
## 32             NA
## 33      17.703064
## 34             NA
## 35             NA
## 36      17.446750
## 37             NA
## 38             NA
## 39             NA
## 4        7.211965
## 40             NA
## 41             NA
## 42             NA
## 43             NA
## 44      35.241054
## 45      13.545087
## 46      30.018793
## 47       8.663056
## 48      18.252658
## 49      10.347126
## 5              NA
## 50             NA
## 51             NA
## 52      23.512649
## 53             NA
## 54             NA
## 55      15.334381
## 56             NA
## 57             NA
## 58             NA
## 59             NA
## 6       41.380502
## 60             NA
## 61             NA
## 62             NA
## 63      18.700000
## 64      13.800000
## 65      20.400000
## 66      13.900000
## 67      31.100000
## 68      15.500000
## 69      18.900000
## 7              NA
## 70      46.100000
## 71      18.200000
## 72      27.500000
## 73      23.400000
## 74      50.000000
## 75             NA
## 76      22.300000
## 77      31.000000
## 78      35.700000
## 79      52.600000
## 8              NA
## 80      26.400000
## 81      35.600000
## 82      20.500000
## 83      27.700000
## 84      25.100000
## 85      18.700000
## 86      25.200000
## 87      22.500000
## 88      20.200000
## 89      16.600000
## 9              NA
## 90      21.600000
## 91      40.400000
## 92      33.000000
## 93      15.000000
## 94      39.500000
## 95      23.500000
## 96      35.000000

### Distribution plot

# Plot the distribution of a single column of a data frame.
#
# Arguments:
#   col  Name of the column to plot, given as one string.
#   df   Data frame that contains `col`.
#
# Returns a ggplot object: a density curve when `df[[col]]` is numeric,
# otherwise a bar chart of counts.
distPlot <- function(col, df){
  # `.data[[col]]` replaces the deprecated aes_string(); labs() pins the axis
  # title to the plain column name.
  p <- ggplot(df) +
    aes(.data[[col]]) +
    labs(x = col)

  if (is.numeric(df[[col]])) {
    p <- p + geom_density()
  } else {
    p <- p + geom_bar()
  }

  # Return the plot explicitly instead of relying on the invisible value of
  # the last assignment inside the if/else.
  p
}


# Every column except the first one (subject_id). Negative indexing avoids
# the backwards sequence that 2:length(dataDf) yields when there are fewer
# than two columns.
names(dataDf)[-1]

##  [1] "CCL3"           "IgG.FHA"        "IgG.PRN"        "IgG.PT"        
##  [5] "Monocytes"      "age_at_boost"   "infancy_vac"    "biological_sex"
##  [9] "IgG.PT.day14"   "CCL3.day3"      "Monocytes.day1"

# Distribution plot for every column except the first one (subject_id),
# arranged in one grid. names(dataDf)[-1] stays correct even when the table
# has fewer than two columns, unlike 2:length(dataDf).
distPlots <- lapply(names(dataDf)[-1], distPlot, df = dataDf)
plot_grid(plotlist = distPlots)

## Warning: Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).

## Warning: Removed 6 rows containing non-finite values (`stat_density()`).

## Warning: Removed 24 rows containing non-finite values (`stat_density()`).

## Warning: Removed 43 rows containing non-finite values (`stat_density()`).

Test model quality

# Turn off warning reporting for the rest of the session.
options(warn = -1)
# xCols = c("CCL3", "IgG.FHA", "IgG.PRN", "IgG.PT", "Monocytes", "age_at_boost", "infancy_vac", "biological_sex", "event_start_influenza", "event_start_covid", "exposure_material_type_influenza", "exposure_material_type_covid")

# Predictor columns and outcome columns used by the models below.
xCols <- c(
  "CCL3", "IgG.FHA", "IgG.PRN", "IgG.PT", "Monocytes",
  "age_at_boost", "infancy_vac", "biological_sex"
)
yCols <- c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")

# Empty results table: one row per outcome, one column per correlation metric.
pred_cor <- data.frame(matrix(nrow = length(yCols), ncol = 3))
dimnames(pred_cor) <- list(
  yCols,
  c('pearson.cor.pred.true', 'spearman.cor.pred.true', 'ranked.spearman.cor.pred.true')
)

# Re-apply the stored row names and report their type.
row.names(dataDf) <- attr(dataDf, "row.names")
print(typeof(row.names(dataDf)))

## [1] "character"

# 
# Leave-one-out evaluation of a LASSO model for each outcome in yCols.
#
# For every outcome, the subjects with a non-missing outcome and complete
# predictors (xCols) are held out one at a time: cv.glmnet (alpha = 1,
# gaussian family) is fit on the remaining subjects and the held-out subject
# is predicted at lambda.min. Pearson/Spearman correlations between predicted
# and observed values, plus the mean squared error, are stored in pred_cor.
for (i in seq_along(yCols)) {
  target <- yCols[i]
  set.seed(1)

  filteredY <- na.omit(dataDf[target])
  filteredX <- na.omit(dataDf[xCols])

  row_int <- intersect(rownames(filteredY), rownames(filteredX))

  # Preallocate instead of growing the vectors with c() on every iteration.
  all_preds <- numeric(length(row_int))
  all_true <- numeric(length(row_int))

  for (j in seq_along(row_int)) {
    train <- row_int[-j]
    predidx <- row_int[j]

    xData <- filteredX[train, xCols]
    yData <- filteredY[train, target]

    # create lasso model; nfolds equals the number of training subjects, i.e.
    # the inner cross-validation is itself leave-one-out
    cvfit_out <- cv.glmnet(x = as.matrix(xData), y = yData, family = 'gaussian',
                           alpha = 1, nfolds = length(train))

    preds <- predict(cvfit_out,
                     newx = as.matrix(filteredX[predidx, xCols, drop = FALSE]),
                     s = 'lambda.min')

    all_preds[j] <- as.numeric(preds)
    all_true[j] <- filteredY[predidx, target]
  }

  # Residuals (observed minus predicted) of the held-out predictions.
  all_reduced <- all_true - all_preds

  pred_rank <- rank(all_preds, na.last = "keep", ties.method = "min")
  true_rank <- rank(all_true, na.last = "keep", ties.method = "min")

  pred_cor[target, 'pearson.cor.pred.true'] <- cor(all_preds, all_true)
  pred_cor[target, 'spearman.cor.pred.true'] <- cor(all_preds, all_true, method = "spearman")
  pred_cor[target, 'ranked.spearman.cor.pred.true'] <- cor(pred_rank, true_rank, method = "spearman")
  pred_cor[target, 'MSE'] <- mean(all_reduced^2)
}

pred_cor

##                pearson.cor.pred.true spearman.cor.pred.true
## IgG.PT.day14               0.1635785              0.3913774
## CCL3.day3                  0.4678559              0.5198132
## Monocytes.day1             0.7531411              0.7884895
##                ranked.spearman.cor.pred.true          MSE
## IgG.PT.day14                       0.3913774 3.350612e+05
## CCL3.day3                          0.5198132 3.878264e+06
## Monocytes.day1                     0.7884895 5.045415e+01

# pred_cor is a data.frame; typeof() reports its underlying storage type,
# which is "list" (one list element per column).
typeof(pred_cor)

## [1] "list"

For each model, we can assess which features have non-zero coefficients.

For follow-on analysis, consider choosing only models that show good correlation scores.

# Refit one lasso model per response on all complete samples and record the
# predictors that receive a non-zero coefficient at lambda.min.
#
# all_models_coef[[i]]  - non-zero coefficient values for yCols[i]
# all_models_names[[i]] - the matching coefficient names (incl. intercept)
# all_models            - allocated here, filled with summary tables later
size <- length(yCols)
all_models_coef <- vector(mode = "list", length = size)
all_models_names <- vector(mode = "list", length = size)
all_models <- vector(mode = "list", length = size)

for (i in seq_along(yCols)) {
  # Re-seed per response so cv.glmnet's random fold assignment is reproducible.
  set.seed(1)

  # Keep only samples with an observed response and complete predictors.
  filteredY <- na.omit(dataDf[yCols[i]])
  filteredX <- na.omit(dataDf[xCols])
  row_int <- intersect(rownames(filteredY), rownames(filteredX))

  # Lasso with leave-one-out cross-validation (one fold per sample).
  # grouped = FALSE is what cv.glmnet enforces anyway when a fold holds fewer
  # than 3 observations; passing it explicitly avoids the warning without
  # having to wrap the call in suppressWarnings().
  cvfit_out <- cv.glmnet(
    x = as.matrix(filteredX[row_int, ]),
    y = as.matrix(filteredY[row_int, ]),
    family = "gaussian",
    alpha = 1,
    nfolds = length(row_int),
    grouped = FALSE
  )
  plot(cvfit_out)

  # Extract the coefficients once and densify them: logical indexing on the
  # sparse matrix returned by coef() triggers "maybe inefficient" notes.
  coef_mat <- as.matrix(coef(cvfit_out, s = "lambda.min"))
  is_nonzero <- coef_mat[, 1] != 0

  all_models_coef[[i]] <- unname(coef_mat[is_nonzero, 1])
  all_models_names[[i]] <- rownames(coef_mat)[is_nonzero]
}

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

# Label the per-response coefficient lists with the response names.
names(all_models_coef) <- yCols
names(all_models_names) <- yCols

# Build one summary table per response: variable name and coefficient rounded
# to 3 decimals, ordered by decreasing absolute coefficient. The columns are
# created with their final types directly; binding names and numbers with
# cbind() would coerce the coefficients to character first.
all_models <- Map(
  function(variable, coefficient) {
    model_table <- data.frame(
      Variable = as.character(variable),
      Coefficient = round(as.numeric(coefficient), 3),
      stringsAsFactors = FALSE
    )
    model_table %>% arrange(desc(abs(Coefficient)))
  },
  all_models_names,
  all_models_coef
)
names(all_models) <- yCols

all_models

## $IgG.PT.day14
##         Variable Coefficient
## 1    (Intercept)    -687.431
## 2         IgG.PT      72.815
## 3        IgG.FHA      61.339
## 4 biological_sex     -55.445
## 5        IgG.PRN     -45.001
## 6   age_at_boost      26.941
## 7           CCL3      25.473
## 8      Monocytes      15.596
## 
## $CCL3.day3
##      Variable Coefficient
## 1 infancy_vac   -1163.903
## 2        CCL3     674.274
## 3     IgG.FHA    -346.207
## 4      IgG.PT     325.465
## 5 (Intercept)     -76.142
## 6     IgG.PRN     -46.188
## 7   Monocytes     -30.664
## 
## $Monocytes.day1
##         Variable Coefficient
## 1    (Intercept)       9.699
## 2    infancy_vac       3.353
## 3 biological_sex       1.957
## 4           CCL3      -1.383
## 5        IgG.PRN      -1.294
## 6      Monocytes       1.009
## 7         IgG.PT       0.197
## 8   age_at_boost      -0.168

# Switch to the models directory; presumably this is where the commented-out
# sink() dumps below would be written.
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/rasteh/models")

# Bundle the correlation summary (first element, named "pred_cor") with the
# per-response coefficient tables into a single list.
a <- c(list(pred_cor = pred_cor), all_models)

# Uncomment one sink() call (and the closing sink()) to write the printed
# results to a text file instead of the console.
# sink("expanded_allModels_predCor_baseExposure_removedDiplicates_normalized.txt")
# sink("expanded_allModels_predCor_notNormalized.txt")
print(a)

## $pred_cor
##                pearson.cor.pred.true spearman.cor.pred.true
## IgG.PT.day14               0.1635785              0.3913774
## CCL3.day3                  0.4678559              0.5198132
## Monocytes.day1             0.7531411              0.7884895
##                ranked.spearman.cor.pred.true          MSE
## IgG.PT.day14                       0.3913774 3.350612e+05
## CCL3.day3                          0.5198132 3.878264e+06
## Monocytes.day1                     0.7884895 5.045415e+01
## 
## $IgG.PT.day14
##         Variable Coefficient
## 1    (Intercept)    -687.431
## 2         IgG.PT      72.815
## 3        IgG.FHA      61.339
## 4 biological_sex     -55.445
## 5        IgG.PRN     -45.001
## 6   age_at_boost      26.941
## 7           CCL3      25.473
## 8      Monocytes      15.596
## 
## $CCL3.day3
##      Variable Coefficient
## 1 infancy_vac   -1163.903
## 2        CCL3     674.274
## 3     IgG.FHA    -346.207
## 4      IgG.PT     325.465
## 5 (Intercept)     -76.142
## 6     IgG.PRN     -46.188
## 7   Monocytes     -30.664
## 
## $Monocytes.day1
##         Variable Coefficient
## 1    (Intercept)       9.699
## 2    infancy_vac       3.353
## 3 biological_sex       1.957
## 4           CCL3      -1.383
## 5        IgG.PRN      -1.294
## 6      Monocytes       1.009
## 7         IgG.PT       0.197
## 8   age_at_boost      -0.168

# sink()

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

baseModel

2023-05-08

add exposure table

All abtiter data have been log2-transformed. Why? To make them closer to normally distributed, since our GLM uses a Gaussian response distribution?

Test model quality

For each model, we can assess which features have non-zero coefficients.