baseModel

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(omicade4)

## Loading required package: ade4

library(mogsa)
library(RSpectra)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-7

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:mogsa':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Including Plots

You can also embed plots, for example:

setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/harmonized")

# Read in metadata
meta.2020<-read.table('clinical_metadata.2020.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
meta.2021<-read.table('clinical_metadata.2021.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)

# imputed_dir is path to local drive where data is saved
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/imputed")

# Import imputed datasets
rnaseq_baseline_mat_imputed_20 <- read.csv('rnaseq_baseline_mat_imputed_20_051022.csv',row.names=1)
cytof_baseline_mat_imputed_20 <- read.csv('cytof_baseline_mat_imputed_20_051022.csv',row.names=1)
olink_baseline_mat_imputed_20 <- read.csv('olink_baseline_mat_imputed_20_051022.csv',row.names=1)
abtiters_baseline_mat_imputed_20 <- read.csv('abtiters_baseline_mat_imputed_20_051022.csv',row.names=1)

rnaseq_baseline_mat_imputed_21 <- read.csv('rnaseq_baseline_mat_imputed_21_051022.csv',row.names=1)
cytof_baseline_mat_imputed_21 <- read.csv('cytof_baseline_mat_imputed_21_051022.csv',row.names=1)
olink_baseline_mat_imputed_21 <- read.csv('olink_baseline_mat_imputed_21_051022.csv',row.names=1)
abtiters_baseline_mat_imputed_21 <- read.csv('abtiters_baseline_mat_imputed_21_051022.csv',row.names=1)

# Get age at boost
library(lubridate)
meta.2020$date_of_boost<-parse_date_time(meta.2020$date_of_boost,"ymd")
meta.2020$year_of_birth<-parse_date_time(meta.2020$year_of_birth,"ymd")
meta.2020$age_at_boost<- as.numeric(round(difftime(meta.2020$date_of_boost,
                                                    meta.2020$year_of_birth,units="weeks")/52,2))
meta.2021$date_of_boost<-parse_date_time(meta.2021$date_of_boost,"ymd")
meta.2021$year_of_birth<-parse_date_time(meta.2021$year_of_birth,"ymd")
meta.2021$age_at_boost<- as.numeric(round(difftime(meta.2021$date_of_boost,
                                                    meta.2021$year_of_birth,units="weeks")/52,2))

meta <- rbind(meta.2020[c("age_at_boost", "infancy_vac", "biological_sex")], meta.2021[c("age_at_boost", "infancy_vac", "biological_sex")])

meta$infancy_vac <- as.numeric(meta$infancy_vac)
meta$biological_sex <- as.numeric(meta$biological_sex)
colnames(meta)

## [1] "age_at_boost"   "infancy_vac"    "biological_sex"

All data from abtiter been log2 transformed. Why? to make them normally distributed since our glm uses gaussian as the distribution kernel?

# Get the y data
dataY = read.table('/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/yData_task_matrix.common_names.mfi_raw.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)

dataY <- dataY[c("IgG.PT.day14", "ENSG00000277632.day3", "Monocytes.day1")]
colnames(dataY) <- c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")
dataY$IgG.PT.day14 <- log2(dataY$IgG.PT.day14)
dataY$subj_id <- rownames(dataY)
colnames(dataY)

## [1] "IgG.PT.day14"   "CCL3.day3"      "Monocytes.day1" "subj_id"

colnames(rnaseq_baseline_mat_imputed_20)[which(names(rnaseq_baseline_mat_imputed_20) == "ENSG00000277632")] <- "CCL3"
colnames(rnaseq_baseline_mat_imputed_21)[which(names(rnaseq_baseline_mat_imputed_21) == "ENSG00000277632")] <- "CCL3"
rnaDf <- rbind(rnaseq_baseline_mat_imputed_20["CCL3"], rnaseq_baseline_mat_imputed_21["CCL3"])

abtiterDf <- log2(rbind(abtiters_baseline_mat_imputed_20["IgG.PT"], abtiters_baseline_mat_imputed_21["IgG.PT"]))

cytofDf <- rbind(cytof_baseline_mat_imputed_20["Monocytes"], cytof_baseline_mat_imputed_21["Monocytes"])

dataDf1 <- merge(rnaDf, abtiterDf, by='row.names', all=T)
colnames(dataDf1)[1] <- "subj_id" 

dataDf2 <- merge(cytofDf, meta, by='row.names', all=T)
colnames(dataDf2)[1] <- "subj_id" 

dataX <- merge(dataDf1, dataDf2, by='subj_id', all=T)

dataDf <- merge(dataX, dataY, by='subj_id', all=T)
rownames(dataDf) <- dataDf$subj_id

dataDf

##    subj_id      CCL3       IgG.PT  Monocytes age_at_boost infancy_vac
## 1        1  5.333531  1.166054115  7.5260062        30.80           2
## 10      10  4.101398 -4.589296443 13.4642547        34.68           2
## 11      11  4.542939  0.829429646 10.7905171        30.76           2
## 12      12        NA           NA         NA        34.68           2
## 13      13  3.399718  0.452132222  5.0304685        19.63           1
## 14      14        NA           NA         NA        23.70           2
## 15      15  4.835874 -1.392116120 15.6373679        27.71           2
## 16      16        NA           NA         NA        29.66           2
## 17      17  4.512669  1.788908531 14.4574172        36.82           2
## 18      18  4.083384  0.754753184  0.9568001        19.73           1
## 19      19  4.459300  1.113018455 13.5202756        22.81           2
## 2        2        NA           NA         NA        51.25           2
## 20      20  3.963474  1.350624258 21.3991621        35.78           2
## 21      21  3.823138  1.472218478 15.2266453        33.77           2
## 22      22  4.462314  1.113658793 11.9355890        31.77           2
## 23      23  4.421223 -1.314917015 11.7041207        25.82           2
## 24      24  3.106516 -1.889370403 11.5201600        24.79           2
## 25      25  4.042732 -0.415668881 13.4709985        28.80           2
## 26      26  4.735685 -0.460255865  5.5320667        33.85           2
## 27      27  3.782618 -0.375555190 13.8965682        19.80           1
## 28      28        NA           NA         NA        34.85           2
## 29      29  4.447249 -0.004503163 16.0947648        19.80           1
## 3        3  4.599972  0.094763759 13.0245725        33.89           2
## 30      30        NA           NA         NA        28.84           2
## 31      31  4.693487 -2.595524868  4.8790233        27.83           2
## 32      32  5.779549  0.489224006 13.5492381        19.88           1
## 33      33  5.311794 -0.647825498  7.6518328        26.87           2
## 34      34        NA           NA         NA        33.93           2
## 35      35  5.434762  0.254624301 13.5420630        25.86           2
## 36      36  3.016496  0.057705925 11.2437986        19.88           1
## 37      37        NA           NA         NA        18.91           1
## 38      38  3.017922 -0.374276619 13.7721006        19.88           1
## 39      39        NA           NA         NA        31.92           2
## 4        4  4.094574  0.684579811  5.4908380        28.76           2
## 40      40        NA           NA         NA        22.89           2
## 41      41        NA           NA         NA        31.96           2
## 42      42  5.230280  0.159044470 10.6095832        19.92           1
## 43      43  3.871351  0.004479489 13.6337391        18.91           1
## 44      44  4.518409 -0.106782869 23.1032762        18.91           1
## 45      45        NA           NA         NA        19.98           1
## 46      46        NA           NA         NA        18.91           1
## 47      47  6.900831  1.410588836 10.1292397        20.98           1
## 48      48  6.731536 -4.589296443 14.9597507        19.11           1
## 49      49        NA           NA         NA        20.11           1
## 5        5  4.916572  1.177876505 13.1258248        25.75           2
## 50      50  8.091139 -0.977973206  9.3613078        19.98           1
## 51      51        NA           NA         NA        19.98           1
## 52      52  7.288949  0.979670866 28.5490352        19.07           1
## 53      53  9.867189  2.062506433 12.8294284        19.07           1
## 54      54        NA           NA         NA        20.11           1
## 55      55        NA           NA         NA        20.11           1
## 56      56        NA           NA         NA        20.15           1
## 57      57        NA           NA         NA        21.15           1
## 58      58        NA           NA         NA        20.15           1
## 59      59        NA           NA         NA        20.15           1
## 6        6  4.575554 -1.841820815 22.7930283        28.87           2
## 60      60        NA           NA         NA        20.15           1
## 61      61 12.042340  0.000000000 12.8005992        32.38           2
## 62      62  6.937333  0.105707874 13.5692650        25.99           2
## 63      63  6.691953  0.844550618 15.3000000        23.98           2
## 64      64  6.760926  1.033547520 12.9000000        25.99           2
## 65      65  7.605079 -0.319402203 15.8000000        29.02           2
## 66      66  5.042425  0.406126752 15.8000000        43.07           2
## 67      67  5.889230 -0.501878380 34.6000000        47.24           2
## 68      68  5.853946  0.076693001 13.5000000        47.24           2
## 69      69  5.528321  0.101619092 13.1000000        29.17           2
## 7        7        NA           NA         NA        35.97           2
## 70      70  5.815166 -1.088411741 32.3000000        21.15           1
## 71      71  5.987707  1.539329657 12.2000000        21.15           1
## 72      72  5.880881  1.062400400 19.9000000        28.25           2
## 73      73  4.529196  0.913242768 15.7000000        24.23           2
## 74      74  3.871745 -0.942946522 36.3000000        24.23           2
## 75      75  5.678804 -2.495167595 14.4559889        21.22           1
## 76      76  4.663800  1.822297344 18.0000000        21.22           1
## 77      77  8.217396 -0.533637004 28.8000000        31.32           2
## 78      78  7.151839 -0.006950768 29.0000000        26.30           2
## 79      79  6.969300  1.215720668 41.5000000        32.32           2
## 8        8        NA           NA         NA        34.27           2
## 80      80  7.178107 -1.094129745 21.9000000        27.30           2
## 81      81 10.726589  0.860773397 34.9000000        26.30           2
## 82      82  9.413611 -0.382267012 22.5000000        21.28           1
## 83      83  8.588209  0.337503062 32.4000000        20.34           1
## 84      84  7.593122 -1.833745832 18.9000000        22.34           1
## 85      85  4.937862 -1.587430915 19.7000000        19.39           1
## 86      86  5.362224 -1.717702905 23.5000000        21.40           1
## 87      87  5.295650  0.110998420 22.9000000        19.39           1
## 88      88  5.567850  0.064114365 17.6000000        19.39           1
## 89      89  5.130601 -3.876808062 15.7000000        22.49           1
## 9        9  4.238481 -1.796667911  5.7229578        20.63           1
## 90      90  4.837136 -1.556993755 20.6000000        20.49           1
## 91      91  4.140370 -0.108585996 25.4000000        21.49           1
## 92      92  4.623047 -2.069156334 40.6000000        19.54           1
## 93      93  4.616769  0.657620865 16.3000000        23.56           1
## 94      94  4.785446  1.410473077 26.2000000        20.55           1
## 95      95  4.878235  1.200330628 17.3000000        21.55           1
## 96      96  4.356355 -0.387945822 34.2000000        19.54           1
##    biological_sex IgG.PT.day14 CCL3.day3 Monocytes.day1
## 1               1    7.6403727       369             NA
## 10              1    1.9341726       179             NA
## 11              1    8.6952768       144       7.257095
## 12              2    6.5980417        NA             NA
## 13              2    5.8595207       911             NA
## 14              2    8.0714691        NA             NA
## 15              2    6.6509003       277      10.585489
## 16              1    6.4375506        NA             NA
## 17              1    7.4000330       295      16.401488
## 18              1    7.5149593        99             NA
## 19              2    7.4109494       133             NA
## 2               1           NA        NA             NA
## 20              1    7.0582039       480      26.605583
## 21              2    5.1444074       238      34.812168
## 22              1    6.4916872        82             NA
## 23              1    7.0467922       133             NA
## 24              1    4.5626555       150             NA
## 25              1    4.5297855       188             NA
## 26              1    5.5156670       148      16.108508
## 27              1    5.4581092       192             NA
## 28              2    6.8639171        NA             NA
## 29              2    6.2305637       284      25.083209
## 3               1    7.0134394       236             NA
## 30              1    5.9703873        NA             NA
## 31              1    5.0445286       476       8.545243
## 32              2    8.0912621       653             NA
## 33              2    3.7410300       749      17.703064
## 34              1    7.4288998        NA             NA
## 35              2    6.2731520       218             NA
## 36              1    5.3715685       191      17.446750
## 37              1           NA        NA             NA
## 38              1    6.9208097        62             NA
## 39              1    6.1675411        NA             NA
## 4               2    7.1787678       133       7.211965
## 40              1    7.1802644        NA             NA
## 41              2    7.6336562        NA             NA
## 42              1    7.3454611       504             NA
## 43              1    4.7284002       318             NA
## 44              1    8.1990877       147      35.241054
## 45              1    5.2667517        NA      13.545087
## 46              1    7.6527670        NA      30.018793
## 47              1    8.1338753      9238       8.663056
## 48              1   -0.8996951      3997      18.252658
## 49              1    7.1292784        NA      10.347126
## 5               2    6.6109253       187             NA
## 50              1    5.7247065      8629             NA
## 51              2    6.2011152        NA             NA
## 52              2    6.7813333     10181      23.512649
## 53              1    7.7766260      6593             NA
## 54              1    7.4293074        NA             NA
## 55              1    6.9147112        NA      15.334381
## 56              1    7.6619286        NA             NA
## 57              1    6.3750970        NA             NA
## 58              1    7.8029005        NA             NA
## 59              1    8.4633555        NA             NA
## 6               1    7.3879859       216      41.380502
## 60              2    4.4048843        NA             NA
## 61              1    8.2479275       420             NA
## 62              1   10.5966103      1532             NA
## 63              1   10.9044098       615      18.700000
## 64              2    9.1252876       644      13.800000
## 65              2   10.8437248      2353      20.400000
## 66              1   11.0080784       989      13.900000
## 67              1   11.0555875       453      31.100000
## 68              2   10.6123594       324      15.500000
## 69              1   10.1676691       792      18.900000
## 7               1    6.5343049        NA             NA
## 70              2    7.6524957       360      46.100000
## 71              1    9.8407013       942      18.200000
## 72              1   10.7617590       302      27.500000
## 73              1   10.3984203       420      23.400000
## 74              1    8.5913258       207      50.000000
## 75              1    7.0901124       242             NA
## 76              1   10.2842457       205      22.300000
## 77              2    8.9844185      1928      31.000000
## 78              1    9.6995725       312      35.700000
## 79              2    9.9932922      2348      52.600000
## 8               1           NA        NA             NA
## 80              1    7.9571020      3592      26.400000
## 81              2    9.0133227      2426      35.600000
## 82              1           NA      8124      20.500000
## 83              1    8.7696728       130      27.700000
## 84              1    8.8916357       595      25.100000
## 85              1    7.3987437       356      18.700000
## 86              1    9.2649118       571      25.200000
## 87              2           NA      1068      22.500000
## 88              2           NA      4288      20.200000
## 89              1    7.0655232       664      16.600000
## 9               2    4.1218190       334             NA
## 90              1    7.7940913       439      21.600000
## 91              2    8.9291029       175      40.400000
## 92              1    8.5612879       275      33.000000
## 93              1    9.3337144       291      15.000000
## 94              2    9.9567391       140      39.500000
## 95              1   10.0750746       210      23.500000
## 96              2    8.5924570       177      35.000000

Test model quality

options(warn=-1)
xCols = c("CCL3", "IgG.PT", "Monocytes", "age_at_boost", "infancy_vac", "biological_sex")
yCols = c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")

pred_cor <- data.frame(matrix(nrow=length(yCols), ncol=1))
rownames(pred_cor) <- yCols
colnames(pred_cor) <- c('cor.pred.true')

rownames(dataDf) <- attr(dataDf, "row.names")
print(typeof(row.names(dataDf)))

## [1] "character"

# 
for (i in 1:length(yCols)){
  all_preds <- c()
  all_true <- c()
  set.seed(1)
  
  filteredY <- na.omit(dataDf[yCols[i]])
  filteredX <- na.omit(dataDf[xCols])
  
  row_int <- intersect(rownames(filteredY), rownames(filteredX))
  
  for (j in 1:length(row_int)){
    train <- row_int[-c(j)]
    xData <- filteredX[train, xCols]
    yData <- filteredY[train,]
    
    allidx = row_int
    predidx = setdiff(allidx, train)
    
    # create lasso model
    suppressWarnings(cvfit_out <- cv.glmnet(x=as.matrix(xData), yData, family='gaussian',
                         alpha=1, nfolds=nrow(xData[train,]-1)))

    preds <- predict(cvfit_out, newx = as.matrix(data.frame(filteredX[predidx,])), s='lambda.min')

    all_preds <- c(all_preds, preds)
    all_true<- c(all_true, filteredY[predidx, yCols[i]])
  }

  pred_cor[yCols[i],'cor.pred.true'] <- cor(all_preds,all_true)
}

pred_cor

##                cor.pred.true
## IgG.PT.day14       0.5857162
## CCL3.day3          0.4638935
## Monocytes.day1     0.7889658

For each model, can assess which features contribute to non-zero coefficients

Consider only choosing models for follow-on analysis that show good correlation scores

size <- length(yCols)
all_models_coef<-vector(mode='list',length=size)
all_models_names<-vector(mode='list',length=size)
all_models<-vector(mode='list',length=size)

for (i in 1:length(yCols)){
  set.seed(1)
  filteredY <- na.omit(dataDf[yCols[i]])
  filteredX <- na.omit(dataDf[xCols])
  
  row_int <- intersect(rownames(filteredY), rownames(filteredX))
  
  # create lasso model
  suppressWarnings(cvfit_out <- cv.glmnet(x=as.matrix(filteredX[row_int,]), as.matrix(filteredY[row_int,]), family='gaussian', alpha=1, nfolds=nrow(filteredX[row_int,]-1)))
  plot(cvfit_out)
  all_models_coef[i]=list(coef(cvfit_out, s = 'lambda.min')[coef(cvfit_out, s = 'lambda.min')[,1]!= 0])
  all_models_names[i]=list(rownames(coef(cvfit_out, s = 'lambda.min'))[coef(cvfit_out, s = 'lambda.min')[,1]!= 0])
}

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient

names(all_models_coef) <- yCols
names(all_models_names) <- yCols

for (i in 1:size){
  all_models[[i]] = data.frame(cbind(all_models_names[[i]],all_models_coef[[i]]))
  colnames(all_models[[i]])<-c("Variable","Coefficient")
  all_models[[i]]$Coefficient<-as.numeric(all_models[[i]]$Coefficient)
  all_models[[i]]$Coefficient=round(all_models[[i]]$Coefficient,3)
  all_models[[i]]<-all_models[[i]] %>% arrange(desc(abs(Coefficient)))
}
names(all_models)<-yCols

all_models

## $IgG.PT.day14
##         Variable Coefficient
## 1    (Intercept)       5.192
## 2         IgG.PT       0.760
## 3 biological_sex      -0.627
## 4           CCL3       0.156
## 5      Monocytes       0.093
## 6   age_at_boost       0.038
## 
## $CCL3.day3
##       Variable Coefficient
## 1  infancy_vac    -718.299
## 2  (Intercept)    -690.236
## 3         CCL3     547.333
## 4 age_at_boost      -1.765
## 
## $Monocytes.day1
##         Variable Coefficient
## 1    (Intercept)       8.542
## 2    infancy_vac       4.551
## 3 biological_sex       1.772
## 4           CCL3      -1.463
## 5      Monocytes       1.057
## 6   age_at_boost      -0.250
## 7         IgG.PT       0.173

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.