baseModel

library(omicade4)

## Loading required package: ade4

library(mogsa)
library(RSpectra)
# library(lubridate)
library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-7

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:mogsa':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(cowplot)
library(ggplot2)

setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/harmonized")

# Read in metadata
meta.2020<-read.table('clinical_metadata.2020.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
meta.2021<-read.table('clinical_metadata.2021.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)

files from imputed data is already normalised so we don’t need to normalise them

# imputed_dir is path to local drive where data is saved
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/imputed")

# Import imputed datasets
rnaseq_baseline_mat_imputed_20 <- read.csv('rnaseq_baseline_mat_imputed_20_051022.csv',row.names=1)
cytof_baseline_mat_imputed_20 <- read.csv('cytof_baseline_mat_imputed_20_051022.csv',row.names=1)
olink_baseline_mat_imputed_20 <- read.csv('olink_baseline_mat_imputed_20_051022.csv',row.names=1)
abtiters_baseline_mat_imputed_20 <- read.csv('abtiters_baseline_mat_imputed_20_051022.csv',row.names=1)

rnaseq_baseline_mat_imputed_21 <- read.csv('rnaseq_baseline_mat_imputed_21_051022.csv',row.names=1)
cytof_baseline_mat_imputed_21 <- read.csv('cytof_baseline_mat_imputed_21_051022.csv',row.names=1)
olink_baseline_mat_imputed_21 <- read.csv('olink_baseline_mat_imputed_21_051022.csv',row.names=1)
abtiters_baseline_mat_imputed_21 <- read.csv('abtiters_baseline_mat_imputed_21_051022.csv',row.names=1)

colnames(abtiters_baseline_mat_imputed_21)

##  [1] "IgG.FHA"     "IgG.PRN"     "IgG.PT"      "IgG1.FHA"    "IgG1.FIM2.3"
##  [6] "IgG1.PRN"    "IgG1.PT"     "IgG2.FHA"    "IgG2.FIM2.3" "IgG2.PRN"   
## [11] "IgG2.PT"     "IgG3.FHA"    "IgG3.FIM2.3" "IgG3.PRN"    "IgG3.PT"    
## [16] "IgG4.FHA"    "IgG4.FIM2.3" "IgG4.PRN"    "IgG4.PT"

tasks_seq<-c('ENSG00000277632','ENSG00000136244','ENSG00000100906','ENSG00000229807')
names(rnaseq_baseline_mat_imputed_20[tasks_seq])

## [1] "ENSG00000277632" "ENSG00000136244" "ENSG00000100906" "ENSG00000229807"

distPlot <- function(col, df){
  p <- 
    ggplot(df) +
    aes_string(col)

  if(is.numeric(df[[col]])) {
    p <- p + geom_density()

  } else {
    p <- p + geom_bar()
  }
}

distPlots <- lapply(tasks_seq, distPlot, df=rnaseq_baseline_mat_imputed_20)

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

plot_grid(plotlist = distPlots)

# Get age at boost
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:cowplot':
## 
##     stamp

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

meta.2020$date_of_boost<-parse_date_time(meta.2020$date_of_boost,"ymd")
meta.2020$year_of_birth<-parse_date_time(meta.2020$year_of_birth,"ymd")
meta.2020$age_at_boost<- as.numeric(round(difftime(meta.2020$date_of_boost,
                                                    meta.2020$year_of_birth,units="weeks")/52,2))
meta.2021$date_of_boost<-parse_date_time(meta.2021$date_of_boost,"ymd")
meta.2021$year_of_birth<-parse_date_time(meta.2021$year_of_birth,"ymd")
meta.2021$age_at_boost<- as.numeric(round(difftime(meta.2021$date_of_boost,
                                                    meta.2021$year_of_birth,units="weeks")/52,2))

meta <- rbind(meta.2020[c("age_at_boost", "infancy_vac", "biological_sex")], meta.2021[c("age_at_boost", "infancy_vac", "biological_sex")])

meta$infancy_vac <- as.numeric(meta$infancy_vac)
meta$biological_sex <- as.numeric(meta$biological_sex)
colnames(meta)

## [1] "age_at_boost"   "infancy_vac"    "biological_sex"

add exposure table

library(DBI)
library(RPostgreSQL)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack

library(dplyr)
library(readr)

dsn_database = "cmipb_v4_0"   
dsn_hostname = "cmi-pb.lji.org"
dsn_port = "5432"                
dsn_uid = "cmipb"         
dsn_pwd = "b5mq62vW7JE2YUwq"


tryCatch({
    drv <- dbDriver("PostgreSQL")
    print("Connecting to Database…")
    connec <- dbConnect(drv,
                 dbname = dsn_database,
                 host = dsn_hostname,
                 port = dsn_port,
                 user = dsn_uid,
                 password = dsn_pwd)
    print("Database Connected!")
    },
    error=function(cond) {
            print("Unable to connect to Database.")
    })

## [1] "Connecting to Database…"
## [1] "Database Connected!"

# dbListTables(connec)

library(tibble)
library(tidyverse)

## ── Attaching core tidyverse packages ─────────────────── tidyverse 2.0.0.9000 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()   masks mogsa::combine()
## ✖ tidyr::expand()    masks Matrix::expand()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ tidyr::pack()      masks Matrix::pack()
## ✖ lubridate::stamp() masks cowplot::stamp()
## ✖ tidyr::unpack()    masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# library(lubridate)
exposDf <- dbGetQuery(connec, "SELECT * FROM immune_exposure")
ifelse(exposDf$event_start==exposDf$event_end,"Yes","No")

##   [1] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [13] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [25] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [37] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [49] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [61] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [73] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [85] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
##  [97] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [109] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [121] "Yes"

exposDf %>% count(event_type)

##                 event_type  n
## 1               bronchitis  1
## 2                    covid 36
## 3                     dtap  1
## 4                      hib  2
## 5                      hpv 11
## 6                influenza 32
## 7    japanese encephalitis  1
## 8            meningococcal  8
## 9            mononucleosis  1
## 10            pneumococcal  1
## 11               pneumonia  3
## 12                   polio 10
## 13            strep throat  4
## 14 tick-borne encephalitis  1
## 15                 typhoid  1
## 16               varicella  7
## 17          whooping cough  1

exposDf$exposure_material_type <- as.character(as.numeric(as.factor(exposDf$exposure_material_type)))
exposDf$event_start <- as.character(as.numeric(as.factor(exposDf$event_start)))
# 
exposDf <- dplyr::filter(exposDf, event_type %in% c('covid', 'influenza'))

exposDf1 <- exposDf[, c('subject_id', 'event_type','exposure_material_type', 'event_start')] %>%
              pivot_wider( names_from = event_type,
              values_from = c(event_start, exposure_material_type),
              values_fn= toString,
              values_fill='Not known'
              )

colnames(exposDf1)

## [1] "subject_id"                       "event_start_influenza"           
## [3] "event_start_covid"                "exposure_material_type_influenza"
## [5] "exposure_material_type_covid"

typeof(exposDf1)

## [1] "list"

exposDf2 <- as.data.frame(exposDf1)
for (col in colnames(exposDf1)[2:length(colnames(exposDf1))]){
  
  exposDf2 <- exposDf1[!grepl(',', exposDf1[[col]]), ]
}

exposDf2 <- exposDf2%>% mutate(across(colnames(exposDf2), as.factor))
exposDf2 <- exposDf2%>% mutate(across(colnames(exposDf2), as.numeric))
exposDf2

## # A tibble: 51 × 5
##    subject_id event_start_influenza event_start_covid exposure_material_type_i…¹
##         <dbl>                 <dbl>             <dbl>                      <dbl>
##  1          3                     1                 1                          1
##  2         38                     1                 1                          1
##  3         41                     3                 1                          1
##  4         46                     2                 1                          1
##  5         44                     2                 1                          1
##  6          2                     1                 3                          1
##  7          4                     2                 3                          1
##  8          5                     1                 3                          1
##  9          6                     1                 3                          1
## 10          7                     3                 3                          1
## # ℹ 41 more rows
## # ℹ abbreviated name: ¹exposure_material_type_influenza
## # ℹ 1 more variable: exposure_material_type_covid <dbl>

# nullToNA <- function(x) {
#     x[sapply(x, is.null)] <- 'Not known'
#     return(x)
# }
# z=exposDf3[2][1]
# exposDf3 <- lapply(exposDf2, nullToNA)
# exposDf3.df <- t(do.call(rbind,lapply(exposDf3, rbind)))
# 
# # z= dim(exposDf3.df)[1]
# # print(dim(exposDf3.df))
# # print(exposDf3.df[[2]][2])
# # exposDf4 <- as.data.frame(matrix(unlist(exposDf3.df),nrow=dim(exposDf3.df)[1],byrow=TRUE)
# exposDf4 <- as.data.frame(matrix(unlist(exposDf3.df),nrow=dim(exposDf3.df)[1],byrow=FALSE))
# # z= typeof(exposDf4)
# colnames(exposDf4) <- names(exposDf3)
# #
# # z=exposDf3.df$subject_id
# #

All data from abtiter been log2 transformed. Why? to make them normally distributed since our glm uses gaussian as the distribution kernel?

# Get the y data
dataY = read.table('/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/yData_task_matrix.common_names.mfi_raw.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)

dataY <- dataY[c("IgG.PT.day14", "ENSG00000277632.day3", "Monocytes.day1")]
colnames(dataY) <- c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")
dataY$IgG.PT.day14 <- log2(dataY[,'IgG.PT.day14']+1)
dataY$CCL3.day3 <- log2(dataY[,'CCL3.day3']+1)

dataY$subject_id <- rownames(dataY)
colnames(dataY)

## [1] "IgG.PT.day14"   "CCL3.day3"      "Monocytes.day1" "subject_id"

distPlot <- function(col, df){
  p <- 
    ggplot(df) +
    aes_string(col)

  if(is.numeric(df[[col]])) {
    p <- p + geom_density()

  } else {
    p <- p + geom_bar()
  }
}

typeof(dataY)

## [1] "list"

# 
distPlots <- lapply(c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1"), distPlot, df=dataY)
plot_grid(plotlist = distPlots)

## Warning: Removed 4 rows containing non-finite values (`stat_density()`).

## Warning: Removed 22 rows containing non-finite values (`stat_density()`).

## Warning: Removed 41 rows containing non-finite values (`stat_density()`).

# df = read.table('/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/harmonized/task_matrix.common_names.mfi_normalised.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
# 
# distPlot <- function(col, df){
#   p <- 
#     ggplot(df) +
#     aes_string(col)
# 
#   if(is.numeric(df[[col]])) {
#     p <- p + geom_density()
# 
#   } else {
#     p <- p + geom_bar()
#   }
# }
# 
# typeof(df)
# colnames(df)
# # 
# distPlots <- lapply(c("IgG.PT_day14", "CCL3_day3", "Monocytes_day1"), distPlot, df=df)
# plot_grid(plotlist = distPlots)

colnames(rnaseq_baseline_mat_imputed_20)[which(names(rnaseq_baseline_mat_imputed_20) == "ENSG00000277632")] <- "CCL3"
colnames(rnaseq_baseline_mat_imputed_21)[which(names(rnaseq_baseline_mat_imputed_21) == "ENSG00000277632")] <- "CCL3"

rnaDf <- rbind(rnaseq_baseline_mat_imputed_20["CCL3"], rnaseq_baseline_mat_imputed_21["CCL3"])

abtiterDf <- rbind(abtiters_baseline_mat_imputed_20[c("IgG.FHA", "IgG.PRN", "IgG.PT")], abtiters_baseline_mat_imputed_21[c("IgG.FHA", "IgG.PRN", "IgG.PT")])

cytofDf <- rbind(cytof_baseline_mat_imputed_20["Monocytes"], cytof_baseline_mat_imputed_21["Monocytes"])

dataDf1 <- merge(rnaDf, abtiterDf, by='row.names', all=T)
colnames(dataDf1)[1] <- "subject_id" 

dataDf2 <- merge(cytofDf, meta, by='row.names', all=T)
colnames(dataDf2)[1] <- "subject_id"

# dataX1 <- merge(dataDf1, dataDf2, by='subject_id', all=T)
dataX <- merge(dataDf1, dataDf2, by='subject_id', all=T)
colnames(dataX)[1] <- "subject_id"

# dataX <- merge(dataX1, exposDf2, by='subject_id', all=T)
# 
dataDf <- merge(dataX, dataY, by='subject_id', all=T)
rownames(dataDf) <- dataDf$subject_id

dataDf

##    subject_id      CCL3    IgG.FHA    IgG.PRN     IgG.PT  Monocytes
## 1           1  5.333531 5.13138189 1.84893838 2.24397113  7.5260062
## 10         10  4.101398 0.14364571 0.09320618 0.04154169 13.4642547
## 11         11  4.542939 2.47376114 3.18404386 1.77698271 10.7905171
## 12         12        NA         NA         NA         NA         NA
## 13         13  3.399718 1.07740175 1.12798355 1.36806068  5.0304685
## 14         14        NA         NA         NA         NA         NA
## 15         15  4.835874 1.14771363 1.03058953 0.38100554 15.6373679
## 16         16        NA         NA         NA         NA         NA
## 17         17  4.512669 1.78634624 0.48701666 3.45553366 14.4574172
## 18         18  4.083384 3.51822961 1.56843058 1.68734290  0.9568001
## 19         19  4.459300 1.04292223 1.84192155 2.16297720 13.5202756
## 2           2        NA         NA         NA         NA         NA
## 20         20  3.963474 1.53996887 0.59107133 2.55022450 21.3991621
## 21         21  3.823138 3.93012107 1.68057253 2.77448207 15.2266453
## 22         22  4.462314 2.14113953 1.30342083 2.16393744 11.9355890
## 23         23  4.421223 1.68946851 0.43547450 0.40194862 11.7041207
## 24         24  3.106516 0.60815892 1.06497238 0.26992483 11.5201600
## 25         25  4.042732 0.34555009 0.80002675 0.74967184 13.4709985
## 26         26  4.735685 0.98270780 1.12156687 0.72685734  5.5320667
## 27         27  3.782618 1.68013778 0.84721352 0.77080873 13.8965682
## 28         28        NA         NA         NA         NA         NA
## 29         29  4.447249 2.58070095 2.87788355 0.99688351 16.0947648
## 3           3  4.599972 1.06795335 3.11313957 1.06789053 13.0245725
## 30         30        NA         NA         NA         NA         NA
## 31         31  4.693487 0.40767306 2.02769318 0.16545091  4.8790233
## 32         32  5.779549 0.80344564 1.30960268 1.40368966 13.5492381
## 33         33  5.311794 0.66955298 0.24197417 0.63824158  7.6518328
## 34         34        NA         NA         NA         NA         NA
## 35         35  5.434762 1.75928512 1.23573946 1.19302502 13.5420630
## 36         36  3.016496 2.00337580 2.33488694 1.04080942 11.2437986
## 37         37        NA         NA         NA         NA         NA
## 38         38  3.017922 0.81897225 0.56487492 0.77149215 13.7721006
## 39         39        NA         NA         NA         NA         NA
## 4           4  4.094574 1.03441003 2.73777402 1.60723380  5.4908380
## 40         40        NA         NA         NA         NA         NA
## 41         41        NA         NA         NA         NA         NA
## 42         42  5.230280 0.86883965 2.56022738 1.11654738 10.6095832
## 43         43  3.871351 1.01708739 1.30963816 1.00310977 13.6337391
## 44         44  4.518409 0.52901785 0.72866760 0.92865661 23.1032762
## 45         45        NA         NA         NA         NA         NA
## 46         46        NA         NA         NA         NA         NA
## 47         47  6.900831 1.56572050 0.54455774 2.65845646 10.1292397
## 48         48  6.731536 0.48194166 1.03140808 0.04154169 14.9597507
## 49         49        NA         NA         NA         NA         NA
## 5           5  4.916572 0.11694600 2.64806823 2.26243525 13.1258248
## 50         50  8.091139 0.61246324 0.32074600 0.50769248  9.3613078
## 51         51        NA         NA         NA         NA         NA
## 52         52  7.288949 0.11694600 1.56152075 1.97201547 28.5490352
## 53         53  9.867189 1.90423998 0.47059211 4.17711376 12.8294284
## 54         54        NA         NA         NA         NA         NA
## 55         55        NA         NA         NA         NA         NA
## 56         56        NA         NA         NA         NA         NA
## 57         57        NA         NA         NA         NA         NA
## 58         58        NA         NA         NA         NA         NA
## 59         59        NA         NA         NA         NA         NA
## 6           6  4.575554 0.46392532 0.12456085 0.27896948 22.7930283
## 60         60        NA         NA         NA         NA         NA
## 61         61 12.042340 3.54131321 0.15131429 1.00000000 12.8005992
## 62         62  6.937333 2.82093049 0.75122163 1.07602222 13.5692650
## 63         63  6.691953 1.95776540 0.11013376 1.79570532 15.3000000
## 64         64  6.760926 2.68882561 0.89982323 2.04705167 12.9000000
## 65         65  7.605079 3.31317657 0.64048964 0.80140188 15.8000000
## 66         66  5.042425 3.74662797 1.43985527 1.32512343 15.8000000
## 67         67  5.889230 2.00974701 0.35723800 0.70618673 34.6000000
## 68         68  5.853946 1.11683339 1.94255138 1.05459788 13.5000000
## 69         69  5.528321 1.65215623 1.21850657 1.07297696 13.1000000
## 7           7        NA         NA         NA         NA         NA
## 70         70  5.815166 3.35167692 1.65402214 0.47027882 32.3000000
## 71         71  5.987707 4.65325882 0.87273399 2.90659418 12.2000000
## 72         72  5.880881 0.28729447 2.12336100 2.08840338 19.9000000
## 73         73  4.529196 0.80435967 1.43128987 1.88327381 15.7000000
## 74         74  3.871745 0.62423924 1.60899360 0.52016941 36.3000000
## 75         75  5.678804 3.05443604 1.81011848 0.17736981 14.4559889
## 76         76  4.663800 4.53841221 0.11013376 3.53643892 18.0000000
## 77         77  8.217396 0.06711778 1.06772479 0.69081102 28.8000000
## 78         78  7.151839 0.10084120 0.25375330 0.99519368 29.0000000
## 79         79  6.969300 0.20652117 0.14721857 2.32256773 41.5000000
## 8           8        NA         NA         NA         NA         NA
## 80         80  7.178107 0.14117982 1.58147601 0.46841860 21.9000000
## 81         81 10.726589 0.07947224 1.13942560 1.81601157 34.9000000
## 82         82  9.413611 0.96199368 0.94934816 0.76723104 22.5000000
## 83         83  8.588209 1.00000000 2.03087458 1.26356779 32.4000000
## 84         84  7.593122 2.31136374 0.44903137 0.28053529 18.9000000
## 85         85  4.937862 3.32067659 0.92948200 0.33276350 19.7000000
## 86         86  5.362224 0.67511620 0.26301839 0.30403242 23.5000000
## 87         87  5.295650 3.21116798 0.96864376 1.07997538 22.9000000
## 88         88  5.567850 1.48284828 1.33113192 1.04544297 17.6000000
## 89         89  5.130601 0.04523958 0.13834498 0.06807137 15.7000000
## 9           9  4.238481 0.46042437 0.20839284 0.28783862  5.7229578
## 90         90  4.837136 0.74851784 1.63121255 0.33985853 20.6000000
## 91         91  4.140370 1.12281459 0.78254880 0.92749667 25.4000000
## 92         92  4.623047 0.04523958 1.03269446 0.23829881 40.6000000
## 93         93  4.616769 0.12543138 1.60575881 1.57747907 16.3000000
## 94         94  4.785446 0.08254440 1.26047137 2.65824315 26.2000000
## 95         95  4.878235 0.37016694 1.00000000 2.29792327 17.3000000
## 96         96  4.356355 0.19946349 0.92277461 0.76421696 34.2000000
##    age_at_boost infancy_vac biological_sex IgG.PT.day14 CCL3.day3
## 1         30.80           2              1    7.6475855  8.531381
## 10        34.68           2              1    2.2695086  7.491853
## 11        30.76           2              1    8.6987531  7.179909
## 12        34.68           2              2    6.6128578        NA
## 13        19.63           1              2    5.8841566  9.832890
## 14        23.70           2              2    8.0768224        NA
## 15        27.71           2              2    6.6651860  8.118941
## 16        29.66           2              1    6.4541001        NA
## 17        36.82           2              1    7.4085495  8.209453
## 18        19.73           1              1    7.5228254  6.643856
## 19        22.81           2              2    7.4194018  7.066089
## 2         51.25           2              1           NA        NA
## 20        35.78           2              1    7.0689888  8.909893
## 21        33.77           2              2    5.1846314  7.900867
## 22        31.77           2              1    6.5076306  6.375039
## 23        25.82           2              1    7.0576625  7.066089
## 24        24.79           2              1    4.6224481  7.238405
## 25        28.80           2              1    4.5909272  7.562242
## 26        33.85           2              1    5.5468623  7.219169
## 27        19.80           1              1    5.4905601  7.592457
## 28        34.85           2              2    6.8762502        NA
## 29        19.80           1              2    6.2496495  8.154818
## 3         33.89           2              1    7.0245630  7.888743
## 30        28.84           2              1    5.9932153        NA
## 31        27.83           2              1    5.0875933  8.897845
## 32        19.88           1              2    8.0965425  9.353147
## 33        26.87           2              2    3.8450834  9.550747
## 34        33.93           2              1    7.4372481        NA
## 35        25.86           2              2    6.2916862  7.774787
## 36        19.88           1              1    5.4060017  7.584963
## 37        18.91           1              1           NA        NA
## 38        19.88           1              1    6.9326678  5.977280
## 39        31.92           2              1    6.1874734        NA
## 4         28.76           2              2    7.1886911  7.066089
## 40        22.89           2              1    7.1901774        NA
## 41        31.96           2              2    7.6409027        NA
## 42        19.92           1              1    7.3543048  8.980140
## 43        18.91           1              1    4.7818220  8.317413
## 44        18.91           1              1    8.2039885  7.209453
## 45        19.98           1              1    5.3037468        NA
## 46        18.91           1              1    7.6599183        NA
## 47        20.98           1              1    8.1390023 13.173521
## 48        19.11           1              1    0.6191782 11.965063
## 49        20.11           1              1    7.1395468        NA
## 5         25.75           2              2    6.6256103  7.554589
## 50        19.98           1              1    5.7517331 13.075145
## 51        19.98           1              2    6.2205920        NA
## 52        19.07           1              2    6.7943897 13.313733
## 53        19.07           1              1    7.7831903 12.686938
## 54        20.11           1              1    7.4376533        NA
## 55        20.11           1              1    6.9266194        NA
## 56        20.15           1              1    7.6690347        NA
## 57        21.15           1              1    6.3923743        NA
## 58        20.15           1              1    7.8093466        NA
## 59        20.15           1              1    8.4674371        NA
## 6         28.87           2              1    7.3965736  7.761551
## 60        20.15           1              2    4.4714296        NA
## 61        32.38           2              1    8.2526654  8.717676
## 62        25.99           2              1   10.5975417 10.582142
## 63        23.98           2              1   10.9051623  9.266787
## 64        25.99           2              2    9.1278687  9.333155
## 65        29.02           2              2   10.8445096 11.200899
## 66        43.07           2              1   11.0087787  9.951285
## 67        47.24           2              1   11.0562652  8.826548
## 68        47.24           2              2   10.6132806  8.344296
## 69        29.17           2              1   10.1689229  9.631177
## 7         35.97           2              1    6.5497867        NA
## 70        21.15           1              2    7.6596483  8.495855
## 71        21.15           1              1    9.8422738  9.881114
## 72        28.25           2              1   10.7625897  8.243174
## 73        24.23           2              1   10.3994888  8.717676
## 74        24.23           2              1    8.5950614  7.700440
## 75        21.22           1              1    7.1006623  7.924813
## 76        21.22           1              1   10.2854022  7.686501
## 77        31.32           2              2    8.9872640 10.913637
## 78        26.30           2              1    9.7013065  8.290019
## 79        32.32           2              2    9.9947070 11.197831
## 8         34.27           2              1           NA        NA
## 80        27.30           2              1    7.9628960 11.810973
## 81        26.30           2              2    9.0161118 11.244958
## 82        21.28           1              1           NA 12.988152
## 83        20.34           1              1    8.7729745  7.033423
## 84        22.34           1              1    8.8946701  9.219169
## 85        19.39           1              1    7.4072678  8.479780
## 86        21.40           1              1    9.2672550  9.159871
## 87        19.39           1              2           NA 10.062046
## 88        19.39           1              2           NA 12.066426
## 89        22.49           1              1    7.0762538  9.377211
## 9         20.63           1              2    4.2023938  8.388017
## 90        20.49           1              1    7.8005768  8.781360
## 91        21.49           1              2    8.9320595  7.459432
## 92        19.54           1              1    8.5651021  8.108524
## 93        23.56           1              1    9.3359486  8.189825
## 94        20.55           1              2    9.9581902  7.139551
## 95        21.55           1              1   10.0764114  7.721099
## 96        19.54           1              2    8.5961898  7.475733
##    Monocytes.day1
## 1              NA
## 10             NA
## 11       7.257095
## 12             NA
## 13             NA
## 14             NA
## 15      10.585489
## 16             NA
## 17      16.401488
## 18             NA
## 19             NA
## 2              NA
## 20      26.605583
## 21      34.812168
## 22             NA
## 23             NA
## 24             NA
## 25             NA
## 26      16.108508
## 27             NA
## 28             NA
## 29      25.083209
## 3              NA
## 30             NA
## 31       8.545243
## 32             NA
## 33      17.703064
## 34             NA
## 35             NA
## 36      17.446750
## 37             NA
## 38             NA
## 39             NA
## 4        7.211965
## 40             NA
## 41             NA
## 42             NA
## 43             NA
## 44      35.241054
## 45      13.545087
## 46      30.018793
## 47       8.663056
## 48      18.252658
## 49      10.347126
## 5              NA
## 50             NA
## 51             NA
## 52      23.512649
## 53             NA
## 54             NA
## 55      15.334381
## 56             NA
## 57             NA
## 58             NA
## 59             NA
## 6       41.380502
## 60             NA
## 61             NA
## 62             NA
## 63      18.700000
## 64      13.800000
## 65      20.400000
## 66      13.900000
## 67      31.100000
## 68      15.500000
## 69      18.900000
## 7              NA
## 70      46.100000
## 71      18.200000
## 72      27.500000
## 73      23.400000
## 74      50.000000
## 75             NA
## 76      22.300000
## 77      31.000000
## 78      35.700000
## 79      52.600000
## 8              NA
## 80      26.400000
## 81      35.600000
## 82      20.500000
## 83      27.700000
## 84      25.100000
## 85      18.700000
## 86      25.200000
## 87      22.500000
## 88      20.200000
## 89      16.600000
## 9              NA
## 90      21.600000
## 91      40.400000
## 92      33.000000
## 93      15.000000
## 94      39.500000
## 95      23.500000
## 96      35.000000

###Distination plot

distPlot <- function(col, df){
  p <- 
    ggplot(df) +
    aes_string(col)

  if(is.numeric(df[[col]])) {
    p <- p + geom_density()

  } else {
    p <- p + geom_bar()
  }
}


names(dataDf)[2:length(dataDf)]

##  [1] "CCL3"           "IgG.FHA"        "IgG.PRN"        "IgG.PT"        
##  [5] "Monocytes"      "age_at_boost"   "infancy_vac"    "biological_sex"
##  [9] "IgG.PT.day14"   "CCL3.day3"      "Monocytes.day1"

distPlots <- lapply(names(dataDf)[2:length(dataDf)], distPlot, df=dataDf)
plot_grid(plotlist = distPlots)

## Warning: Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).
## Removed 24 rows containing non-finite values (`stat_density()`).

## Warning: Removed 6 rows containing non-finite values (`stat_density()`).

## Warning: Removed 24 rows containing non-finite values (`stat_density()`).

## Warning: Removed 43 rows containing non-finite values (`stat_density()`).

Test model quality

options(warn=-1)
# xCols = c("CCL3", "IgG.FHA", "IgG.PRN", "IgG.PT", "Monocytes", "age_at_boost", "infancy_vac", "biological_sex", "event_start_influenza", "event_start_covid", "exposure_material_type_influenza", "exposure_material_type_covid")

xCols = c("CCL3", "IgG.FHA", "IgG.PRN", "IgG.PT", "Monocytes", "age_at_boost", "infancy_vac", "biological_sex")
yCols = c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")

pred_cor <- data.frame(matrix(nrow=length(yCols), ncol=3))
rownames(pred_cor) <- yCols
colnames(pred_cor) <- c('pearson.cor.pred.true', 'spearman.cor.pred.true', 'ranked.spearman.cor.pred.true')

rownames(dataDf) <- attr(dataDf, "row.names")
print(typeof(row.names(dataDf)))

## [1] "character"

# 
for (i in 1:length(yCols)){
  all_preds <- c()
  all_true <- c()
  all_reduced <- c()
  set.seed(1)
  
  filteredY <- na.omit(dataDf[yCols[i]])
  filteredX <- na.omit(dataDf[xCols])
  
  row_int <- intersect(rownames(filteredY), rownames(filteredX))
  
  for (j in 1:length(row_int)){
    train <- row_int[-c(j)]
    xData <- filteredX[train, xCols]
    yData <- filteredY[train,]
    
    a1= nrow(xData[train,])
    a2= nrow(xData[train,]-1)
    
    allidx = row_int
    predidx = setdiff(allidx, train)
    
    # create lasso model
    cvfit_out <- cv.glmnet(x=as.matrix(xData), yData, family='gaussian',
                         alpha=1, nfolds=nrow(xData[train,]))

    preds <- predict(cvfit_out, newx = as.matrix(data.frame(filteredX[predidx,])), s='lambda.min')

    all_preds <- c(all_preds, preds)
    all_true<- c(all_true, filteredY[predidx, yCols[i]])
    all_reduced <- c(all_reduced, (filteredY[predidx, yCols[i]]-preds) )
  }
  
  b = data_frame(all_preds, rank(all_preds,na.last="keep",ties.method="min"), all_true, rank(all_true,na.last="keep",ties.method="min"))

  pred_cor[yCols[i],'pearson.cor.pred.true'] <- cor(all_preds,all_true)
  pred_cor[yCols[i],'spearman.cor.pred.true'] <- cor(all_preds,all_true, method="spearman")
  pred_cor[yCols[i],'ranked.spearman.cor.pred.true'] <- cor(rank(all_preds,na.last="keep",ties.method="min"),rank(all_true,na.last="keep",ties.method="min"), method="spearman")
  pred_cor[yCols[i],'MSE'] <- mean((all_reduced)^2)
}

pred_cor

##                pearson.cor.pred.true spearman.cor.pred.true
## IgG.PT.day14               0.4548602              0.5337596
## CCL3.day3                  0.5107044              0.6519170
## Monocytes.day1             0.7531411              0.7884895
##                ranked.spearman.cor.pred.true       MSE
## IgG.PT.day14                       0.5337596  3.631745
## CCL3.day3                          0.6519170  2.263844
## Monocytes.day1                     0.7884895 50.454154

typeof(pred_cor)

## [1] "list"

For each model, can assess which features contribute to non-zero coefficients

Consider only choosing models for follow-on analysis that show good correlation scores

size <- length(yCols)
all_models_coef<-vector(mode='list',length=size)
all_models_names<-vector(mode='list',length=size)
all_models<-vector(mode='list',length=size)

for (i in 1:length(yCols)){
  set.seed(1)
  filteredY <- na.omit(dataDf[yCols[i]])
  filteredX <- na.omit(dataDf[xCols])
  
  row_int <- intersect(rownames(filteredY), rownames(filteredX))
  
  # create lasso model
  suppressWarnings(cvfit_out <- cv.glmnet(x=as.matrix(filteredX[row_int,]), as.matrix(filteredY[row_int,]), family='gaussian', alpha=1, nfolds=nrow(filteredX[row_int,])))
  plot(cvfit_out)
  all_models_coef[i]=list(coef(cvfit_out, s = 'lambda.min')[coef(cvfit_out, s = 'lambda.min')[,1]!= 0])
  all_models_names[i]=list(rownames(coef(cvfit_out, s = 'lambda.min'))[coef(cvfit_out, s = 'lambda.min')[,1]!= 0])
}

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

names(all_models_coef) <- yCols
names(all_models_names) <- yCols

for (i in 1:size){
  all_models[[i]] = data.frame(cbind(all_models_names[[i]],all_models_coef[[i]]))
  colnames(all_models[[i]])<-c("Variable","Coefficient")
  all_models[[i]]$Coefficient<-as.numeric(all_models[[i]]$Coefficient)
  all_models[[i]]$Coefficient=round(all_models[[i]]$Coefficient,3)
  all_models[[i]]<-all_models[[i]] %>% arrange(desc(abs(Coefficient)))
}
names(all_models)<-yCols

all_models

## $IgG.PT.day14
##         Variable Coefficient
## 1    (Intercept)       3.448
## 2         IgG.PT       0.558
## 3        IgG.FHA       0.284
## 4 biological_sex      -0.165
## 5           CCL3       0.145
## 6      Monocytes       0.097
## 7   age_at_boost       0.034
## 
## $CCL3.day3
##      Variable Coefficient
## 1 (Intercept)       6.251
## 2        CCL3       0.528
## 3 infancy_vac      -0.194
## 
## $Monocytes.day1
##         Variable Coefficient
## 1    (Intercept)       9.699
## 2    infancy_vac       3.353
## 3 biological_sex       1.957
## 4           CCL3      -1.383
## 5        IgG.PRN      -1.294
## 6      Monocytes       1.009
## 7         IgG.PT       0.197
## 8   age_at_boost      -0.168

setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/rasteh/models")
a= append(list(pred_cor),all_models)
names(a)[1] <- 'pred_cor'
# sink("expanded_allModels_predCor_baseExposure_removedDiplicates_normalized.txt")
sink("expanded_allModels_predCor_normalized.txt")
print(a)

## $pred_cor
##                pearson.cor.pred.true spearman.cor.pred.true
## IgG.PT.day14               0.4548602              0.5337596
## CCL3.day3                  0.5107044              0.6519170
## Monocytes.day1             0.7531411              0.7884895
##                ranked.spearman.cor.pred.true       MSE
## IgG.PT.day14                       0.5337596  3.631745
## CCL3.day3                          0.6519170  2.263844
## Monocytes.day1                     0.7884895 50.454154
## 
## $IgG.PT.day14
##         Variable Coefficient
## 1    (Intercept)       3.448
## 2         IgG.PT       0.558
## 3        IgG.FHA       0.284
## 4 biological_sex      -0.165
## 5           CCL3       0.145
## 6      Monocytes       0.097
## 7   age_at_boost       0.034
## 
## $CCL3.day3
##      Variable Coefficient
## 1 (Intercept)       6.251
## 2        CCL3       0.528
## 3 infancy_vac      -0.194
## 
## $Monocytes.day1
##         Variable Coefficient
## 1    (Intercept)       9.699
## 2    infancy_vac       3.353
## 3 biological_sex       1.957
## 4           CCL3      -1.383
## 5        IgG.PRN      -1.294
## 6      Monocytes       1.009
## 7         IgG.PT       0.197
## 8   age_at_boost      -0.168

sink()

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

baseModel

2023-05-08

add exposure table

All data from abtiter been log2 transformed. Why? to make them normally distributed since our glm uses gaussian as the distribution kernel?

Test model quality

For each model, can assess which features contribute to non-zero coefficients