This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(omicade4)
## Loading required package: ade4
library(mogsa)
library(RSpectra)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:mogsa':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
You can also embed plots, for example:
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/harmonized")
# Read in metadata
meta.2020<-read.table('clinical_metadata.2020.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
meta.2021<-read.table('clinical_metadata.2021.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
# imputed_dir is path to local drive where data is saved
setwd("/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/main/cmi_pb_datasets/processed/imputed")
# Import imputed datasets
rnaseq_baseline_mat_imputed_20 <- read.csv('rnaseq_baseline_mat_imputed_20_051022.csv',row.names=1)
cytof_baseline_mat_imputed_20 <- read.csv('cytof_baseline_mat_imputed_20_051022.csv',row.names=1)
olink_baseline_mat_imputed_20 <- read.csv('olink_baseline_mat_imputed_20_051022.csv',row.names=1)
abtiters_baseline_mat_imputed_20 <- read.csv('abtiters_baseline_mat_imputed_20_051022.csv',row.names=1)
rnaseq_baseline_mat_imputed_21 <- read.csv('rnaseq_baseline_mat_imputed_21_051022.csv',row.names=1)
cytof_baseline_mat_imputed_21 <- read.csv('cytof_baseline_mat_imputed_21_051022.csv',row.names=1)
olink_baseline_mat_imputed_21 <- read.csv('olink_baseline_mat_imputed_21_051022.csv',row.names=1)
abtiters_baseline_mat_imputed_21 <- read.csv('abtiters_baseline_mat_imputed_21_051022.csv',row.names=1)
# Get age at boost
library(lubridate)
meta.2020$date_of_boost<-parse_date_time(meta.2020$date_of_boost,"ymd")
meta.2020$year_of_birth<-parse_date_time(meta.2020$year_of_birth,"ymd")
meta.2020$age_at_boost<- as.numeric(round(difftime(meta.2020$date_of_boost,
meta.2020$year_of_birth,units="weeks")/52,2))
meta.2021$date_of_boost<-parse_date_time(meta.2021$date_of_boost,"ymd")
meta.2021$year_of_birth<-parse_date_time(meta.2021$year_of_birth,"ymd")
meta.2021$age_at_boost<- as.numeric(round(difftime(meta.2021$date_of_boost,
meta.2021$year_of_birth,units="weeks")/52,2))
meta <- rbind(meta.2020[c("age_at_boost", "infancy_vac", "biological_sex")], meta.2021[c("age_at_boost", "infancy_vac", "biological_sex")])
meta$infancy_vac <- as.numeric(meta$infancy_vac)
meta$biological_sex <- as.numeric(meta$biological_sex)
colnames(meta)
## [1] "age_at_boost" "infancy_vac" "biological_sex"
# Get the y data
dataY = read.table('/Users/rnili/Desktop/repo/gitLab/cmi-pb-multiomics-main/results/yData_task_matrix.common_names.mfi_raw.tsv',sep='\t',header=TRUE,stringsAsFactors=TRUE,row.names=1)
dataY <- dataY[c("IgG.PT.day14", "ENSG00000277632.day3", "Monocytes.day1")]
colnames(dataY) <- c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")
dataY$IgG.PT.day14 <- log2(dataY$IgG.PT.day14)
dataY$subj_id <- rownames(dataY)
colnames(dataY)
## [1] "IgG.PT.day14" "CCL3.day3" "Monocytes.day1" "subj_id"
colnames(rnaseq_baseline_mat_imputed_20)[which(names(rnaseq_baseline_mat_imputed_20) == "ENSG00000277632")] <- "CCL3"
colnames(rnaseq_baseline_mat_imputed_21)[which(names(rnaseq_baseline_mat_imputed_21) == "ENSG00000277632")] <- "CCL3"
rnaDf <- rbind(rnaseq_baseline_mat_imputed_20["CCL3"], rnaseq_baseline_mat_imputed_21["CCL3"])
abtiterDf <- log2(rbind(abtiters_baseline_mat_imputed_20["IgG.PT"], abtiters_baseline_mat_imputed_21["IgG.PT"]))
cytofDf <- rbind(cytof_baseline_mat_imputed_20["Monocytes"], cytof_baseline_mat_imputed_21["Monocytes"])
dataDf1 <- merge(rnaDf, abtiterDf, by='row.names', all=T)
colnames(dataDf1)[1] <- "subj_id"
dataDf2 <- merge(cytofDf, meta, by='row.names', all=T)
colnames(dataDf2)[1] <- "subj_id"
dataX <- merge(dataDf1, dataDf2, by='subj_id', all=T)
dataDf <- merge(dataX, dataY, by='subj_id', all=T)
rownames(dataDf) <- dataDf$subj_id
dataDf
## subj_id CCL3 IgG.PT Monocytes age_at_boost infancy_vac
## 1 1 5.333531 1.166054115 7.5260062 30.80 2
## 10 10 4.101398 -4.589296443 13.4642547 34.68 2
## 11 11 4.542939 0.829429646 10.7905171 30.76 2
## 12 12 NA NA NA 34.68 2
## 13 13 3.399718 0.452132222 5.0304685 19.63 1
## 14 14 NA NA NA 23.70 2
## 15 15 4.835874 -1.392116120 15.6373679 27.71 2
## 16 16 NA NA NA 29.66 2
## 17 17 4.512669 1.788908531 14.4574172 36.82 2
## 18 18 4.083384 0.754753184 0.9568001 19.73 1
## 19 19 4.459300 1.113018455 13.5202756 22.81 2
## 2 2 NA NA NA 51.25 2
## 20 20 3.963474 1.350624258 21.3991621 35.78 2
## 21 21 3.823138 1.472218478 15.2266453 33.77 2
## 22 22 4.462314 1.113658793 11.9355890 31.77 2
## 23 23 4.421223 -1.314917015 11.7041207 25.82 2
## 24 24 3.106516 -1.889370403 11.5201600 24.79 2
## 25 25 4.042732 -0.415668881 13.4709985 28.80 2
## 26 26 4.735685 -0.460255865 5.5320667 33.85 2
## 27 27 3.782618 -0.375555190 13.8965682 19.80 1
## 28 28 NA NA NA 34.85 2
## 29 29 4.447249 -0.004503163 16.0947648 19.80 1
## 3 3 4.599972 0.094763759 13.0245725 33.89 2
## 30 30 NA NA NA 28.84 2
## 31 31 4.693487 -2.595524868 4.8790233 27.83 2
## 32 32 5.779549 0.489224006 13.5492381 19.88 1
## 33 33 5.311794 -0.647825498 7.6518328 26.87 2
## 34 34 NA NA NA 33.93 2
## 35 35 5.434762 0.254624301 13.5420630 25.86 2
## 36 36 3.016496 0.057705925 11.2437986 19.88 1
## 37 37 NA NA NA 18.91 1
## 38 38 3.017922 -0.374276619 13.7721006 19.88 1
## 39 39 NA NA NA 31.92 2
## 4 4 4.094574 0.684579811 5.4908380 28.76 2
## 40 40 NA NA NA 22.89 2
## 41 41 NA NA NA 31.96 2
## 42 42 5.230280 0.159044470 10.6095832 19.92 1
## 43 43 3.871351 0.004479489 13.6337391 18.91 1
## 44 44 4.518409 -0.106782869 23.1032762 18.91 1
## 45 45 NA NA NA 19.98 1
## 46 46 NA NA NA 18.91 1
## 47 47 6.900831 1.410588836 10.1292397 20.98 1
## 48 48 6.731536 -4.589296443 14.9597507 19.11 1
## 49 49 NA NA NA 20.11 1
## 5 5 4.916572 1.177876505 13.1258248 25.75 2
## 50 50 8.091139 -0.977973206 9.3613078 19.98 1
## 51 51 NA NA NA 19.98 1
## 52 52 7.288949 0.979670866 28.5490352 19.07 1
## 53 53 9.867189 2.062506433 12.8294284 19.07 1
## 54 54 NA NA NA 20.11 1
## 55 55 NA NA NA 20.11 1
## 56 56 NA NA NA 20.15 1
## 57 57 NA NA NA 21.15 1
## 58 58 NA NA NA 20.15 1
## 59 59 NA NA NA 20.15 1
## 6 6 4.575554 -1.841820815 22.7930283 28.87 2
## 60 60 NA NA NA 20.15 1
## 61 61 12.042340 0.000000000 12.8005992 32.38 2
## 62 62 6.937333 0.105707874 13.5692650 25.99 2
## 63 63 6.691953 0.844550618 15.3000000 23.98 2
## 64 64 6.760926 1.033547520 12.9000000 25.99 2
## 65 65 7.605079 -0.319402203 15.8000000 29.02 2
## 66 66 5.042425 0.406126752 15.8000000 43.07 2
## 67 67 5.889230 -0.501878380 34.6000000 47.24 2
## 68 68 5.853946 0.076693001 13.5000000 47.24 2
## 69 69 5.528321 0.101619092 13.1000000 29.17 2
## 7 7 NA NA NA 35.97 2
## 70 70 5.815166 -1.088411741 32.3000000 21.15 1
## 71 71 5.987707 1.539329657 12.2000000 21.15 1
## 72 72 5.880881 1.062400400 19.9000000 28.25 2
## 73 73 4.529196 0.913242768 15.7000000 24.23 2
## 74 74 3.871745 -0.942946522 36.3000000 24.23 2
## 75 75 5.678804 -2.495167595 14.4559889 21.22 1
## 76 76 4.663800 1.822297344 18.0000000 21.22 1
## 77 77 8.217396 -0.533637004 28.8000000 31.32 2
## 78 78 7.151839 -0.006950768 29.0000000 26.30 2
## 79 79 6.969300 1.215720668 41.5000000 32.32 2
## 8 8 NA NA NA 34.27 2
## 80 80 7.178107 -1.094129745 21.9000000 27.30 2
## 81 81 10.726589 0.860773397 34.9000000 26.30 2
## 82 82 9.413611 -0.382267012 22.5000000 21.28 1
## 83 83 8.588209 0.337503062 32.4000000 20.34 1
## 84 84 7.593122 -1.833745832 18.9000000 22.34 1
## 85 85 4.937862 -1.587430915 19.7000000 19.39 1
## 86 86 5.362224 -1.717702905 23.5000000 21.40 1
## 87 87 5.295650 0.110998420 22.9000000 19.39 1
## 88 88 5.567850 0.064114365 17.6000000 19.39 1
## 89 89 5.130601 -3.876808062 15.7000000 22.49 1
## 9 9 4.238481 -1.796667911 5.7229578 20.63 1
## 90 90 4.837136 -1.556993755 20.6000000 20.49 1
## 91 91 4.140370 -0.108585996 25.4000000 21.49 1
## 92 92 4.623047 -2.069156334 40.6000000 19.54 1
## 93 93 4.616769 0.657620865 16.3000000 23.56 1
## 94 94 4.785446 1.410473077 26.2000000 20.55 1
## 95 95 4.878235 1.200330628 17.3000000 21.55 1
## 96 96 4.356355 -0.387945822 34.2000000 19.54 1
## biological_sex IgG.PT.day14 CCL3.day3 Monocytes.day1
## 1 1 7.6403727 369 NA
## 10 1 1.9341726 179 NA
## 11 1 8.6952768 144 7.257095
## 12 2 6.5980417 NA NA
## 13 2 5.8595207 911 NA
## 14 2 8.0714691 NA NA
## 15 2 6.6509003 277 10.585489
## 16 1 6.4375506 NA NA
## 17 1 7.4000330 295 16.401488
## 18 1 7.5149593 99 NA
## 19 2 7.4109494 133 NA
## 2 1 NA NA NA
## 20 1 7.0582039 480 26.605583
## 21 2 5.1444074 238 34.812168
## 22 1 6.4916872 82 NA
## 23 1 7.0467922 133 NA
## 24 1 4.5626555 150 NA
## 25 1 4.5297855 188 NA
## 26 1 5.5156670 148 16.108508
## 27 1 5.4581092 192 NA
## 28 2 6.8639171 NA NA
## 29 2 6.2305637 284 25.083209
## 3 1 7.0134394 236 NA
## 30 1 5.9703873 NA NA
## 31 1 5.0445286 476 8.545243
## 32 2 8.0912621 653 NA
## 33 2 3.7410300 749 17.703064
## 34 1 7.4288998 NA NA
## 35 2 6.2731520 218 NA
## 36 1 5.3715685 191 17.446750
## 37 1 NA NA NA
## 38 1 6.9208097 62 NA
## 39 1 6.1675411 NA NA
## 4 2 7.1787678 133 7.211965
## 40 1 7.1802644 NA NA
## 41 2 7.6336562 NA NA
## 42 1 7.3454611 504 NA
## 43 1 4.7284002 318 NA
## 44 1 8.1990877 147 35.241054
## 45 1 5.2667517 NA 13.545087
## 46 1 7.6527670 NA 30.018793
## 47 1 8.1338753 9238 8.663056
## 48 1 -0.8996951 3997 18.252658
## 49 1 7.1292784 NA 10.347126
## 5 2 6.6109253 187 NA
## 50 1 5.7247065 8629 NA
## 51 2 6.2011152 NA NA
## 52 2 6.7813333 10181 23.512649
## 53 1 7.7766260 6593 NA
## 54 1 7.4293074 NA NA
## 55 1 6.9147112 NA 15.334381
## 56 1 7.6619286 NA NA
## 57 1 6.3750970 NA NA
## 58 1 7.8029005 NA NA
## 59 1 8.4633555 NA NA
## 6 1 7.3879859 216 41.380502
## 60 2 4.4048843 NA NA
## 61 1 8.2479275 420 NA
## 62 1 10.5966103 1532 NA
## 63 1 10.9044098 615 18.700000
## 64 2 9.1252876 644 13.800000
## 65 2 10.8437248 2353 20.400000
## 66 1 11.0080784 989 13.900000
## 67 1 11.0555875 453 31.100000
## 68 2 10.6123594 324 15.500000
## 69 1 10.1676691 792 18.900000
## 7 1 6.5343049 NA NA
## 70 2 7.6524957 360 46.100000
## 71 1 9.8407013 942 18.200000
## 72 1 10.7617590 302 27.500000
## 73 1 10.3984203 420 23.400000
## 74 1 8.5913258 207 50.000000
## 75 1 7.0901124 242 NA
## 76 1 10.2842457 205 22.300000
## 77 2 8.9844185 1928 31.000000
## 78 1 9.6995725 312 35.700000
## 79 2 9.9932922 2348 52.600000
## 8 1 NA NA NA
## 80 1 7.9571020 3592 26.400000
## 81 2 9.0133227 2426 35.600000
## 82 1 NA 8124 20.500000
## 83 1 8.7696728 130 27.700000
## 84 1 8.8916357 595 25.100000
## 85 1 7.3987437 356 18.700000
## 86 1 9.2649118 571 25.200000
## 87 2 NA 1068 22.500000
## 88 2 NA 4288 20.200000
## 89 1 7.0655232 664 16.600000
## 9 2 4.1218190 334 NA
## 90 1 7.7940913 439 21.600000
## 91 2 8.9291029 175 40.400000
## 92 1 8.5612879 275 33.000000
## 93 1 9.3337144 291 15.000000
## 94 2 9.9567391 140 39.500000
## 95 1 10.0750746 210 23.500000
## 96 2 8.5924570 177 35.000000
options(warn=-1)
xCols = c("CCL3", "IgG.PT", "Monocytes", "age_at_boost", "infancy_vac", "biological_sex")
yCols = c("IgG.PT.day14", "CCL3.day3", "Monocytes.day1")
pred_cor <- data.frame(matrix(nrow=length(yCols), ncol=1))
rownames(pred_cor) <- yCols
colnames(pred_cor) <- c('cor.pred.true')
rownames(dataDf) <- attr(dataDf, "row.names")
print(typeof(row.names(dataDf)))
## [1] "character"
#
for (i in 1:length(yCols)){
all_preds <- c()
all_true <- c()
set.seed(1)
filteredY <- na.omit(dataDf[yCols[i]])
filteredX <- na.omit(dataDf[xCols])
row_int <- intersect(rownames(filteredY), rownames(filteredX))
for (j in 1:length(row_int)){
train <- row_int[-c(j)]
xData <- filteredX[train, xCols]
yData <- filteredY[train,]
allidx = row_int
predidx = setdiff(allidx, train)
# create lasso model
suppressWarnings(cvfit_out <- cv.glmnet(x=as.matrix(xData), yData, family='gaussian',
alpha=1, nfolds=nrow(xData[train,]-1)))
preds <- predict(cvfit_out, newx = as.matrix(data.frame(filteredX[predidx,])), s='lambda.min')
all_preds <- c(all_preds, preds)
all_true<- c(all_true, filteredY[predidx, yCols[i]])
}
pred_cor[yCols[i],'cor.pred.true'] <- cor(all_preds,all_true)
}
pred_cor
## cor.pred.true
## IgG.PT.day14 0.5857162
## CCL3.day3 0.4638935
## Monocytes.day1 0.7889658
Consider only choosing models for follow-on analysis that show good correlation scores
size <- length(yCols)
all_models_coef<-vector(mode='list',length=size)
all_models_names<-vector(mode='list',length=size)
all_models<-vector(mode='list',length=size)
for (i in 1:length(yCols)){
set.seed(1)
filteredY <- na.omit(dataDf[yCols[i]])
filteredX <- na.omit(dataDf[xCols])
row_int <- intersect(rownames(filteredY), rownames(filteredX))
# create lasso model
suppressWarnings(cvfit_out <- cv.glmnet(x=as.matrix(filteredX[row_int,]), as.matrix(filteredY[row_int,]), family='gaussian', alpha=1, nfolds=nrow(filteredX[row_int,]-1)))
plot(cvfit_out)
all_models_coef[i]=list(coef(cvfit_out, s = 'lambda.min')[coef(cvfit_out, s = 'lambda.min')[,1]!= 0])
all_models_names[i]=list(rownames(coef(cvfit_out, s = 'lambda.min'))[coef(cvfit_out, s = 'lambda.min')[,1]!= 0])
}
## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient
## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient
## <sparse>[ <logic> ]: .M.sub.i.logical() maybe inefficient
names(all_models_coef) <- yCols
names(all_models_names) <- yCols
for (i in 1:size){
all_models[[i]] = data.frame(cbind(all_models_names[[i]],all_models_coef[[i]]))
colnames(all_models[[i]])<-c("Variable","Coefficient")
all_models[[i]]$Coefficient<-as.numeric(all_models[[i]]$Coefficient)
all_models[[i]]$Coefficient=round(all_models[[i]]$Coefficient,3)
all_models[[i]]<-all_models[[i]] %>% arrange(desc(abs(Coefficient)))
}
names(all_models)<-yCols
all_models
## $IgG.PT.day14
## Variable Coefficient
## 1 (Intercept) 5.192
## 2 IgG.PT 0.760
## 3 biological_sex -0.627
## 4 CCL3 0.156
## 5 Monocytes 0.093
## 6 age_at_boost 0.038
##
## $CCL3.day3
## Variable Coefficient
## 1 infancy_vac -718.299
## 2 (Intercept) -690.236
## 3 CCL3 547.333
## 4 age_at_boost -1.765
##
## $Monocytes.day1
## Variable Coefficient
## 1 (Intercept) 8.542
## 2 infancy_vac 4.551
## 3 biological_sex 1.772
## 4 CCL3 -1.463
## 5 Monocytes 1.057
## 6 age_at_boost -0.250
## 7 IgG.PT 0.173
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.