Given the simulated data set of N = 373 samples x 289 normalized genes (see file 2021BioinfoFTE_ML_dat.csv), build a model to predict ‘BM_label’. The goal is to achieve high specificity at 90% sensitivity, i.e., at least 90% of M samples correctly classified as M.
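Concretely, the target is an operating point on the ROC curve. As a minimal sketch with pROC (here `truth` and `score` are hypothetical stand-ins for the labels and model scores we build below):
library(pROC)
#truth: factor with levels B/M; score: predicted P(M) -- both hypothetical placeholders
r = roc(truth, score, levels = c("B","M"))
#specificity and threshold at the 0.90-sensitivity operating point
coords(r, x = 0.90, input = "sensitivity", ret = c("threshold","specificity"))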
BM = read.csv("C:/Users/wwhla/Downloads/job/data scientist/interview/veracyte/2021BioinfoFTE_ML_dat.csv",as.is=T)
BM[1:5,1:5]
## sampleid BM_label Subtype_label Gexp_1 Gexp_2
## 1 BFN_1 B BFN 2.012434 6.475739
## 2 BFN_2 B BFN 4.325741 9.022011
## 3 BFN_3 B BFN 5.215917 5.643612
## 4 BFN_4 B BFN 3.416560 7.975687
## 5 BFN_5 B BFN 3.590157 5.804780
dim(BM)
## [1] 373 292
BMexp = BM[, 4:ncol(BM)] #expression matrix only (drop sampleid and the two label columns)
boxplot(t(BMexp),main="Boxplot for each sample",ylab="Expression")
par(mfrow=c(2,2))
hist(unlist(BMexp),main="Distribution of gene expression")
barplot(table(BM[[2]]),main="# samples for each label")
barplot(table(BM[[3]][which(BM[[2]]=="M")]),main="# samples for each subcluster for label M")
barplot(table(BM[[3]][which(BM[[2]]=="B")]),main="# samples for each subcluster for label B")
#length(which(is.na(BMexp))) #sanity check for missing values
The per-sample expression distributions have similar means and standard deviations, and the pooled distribution of expression values is close to normal, so the data appear to be well normalized.
The numbers of samples for the two labels differ, but they are on the same scale, so class imbalance should not be a serious concern.
For label “M” there are 3 subgroups, one of which dominates; for label “B” there are also 3 subgroups, but of comparable size.
Use PCA to check if there is any outlier.
tmp = prcomp(BMexp,center=TRUE, scale = TRUE)
#color code for the B/M labels: 1 = M (green), 2 = B (red)
clab = matrix(1,nrow=nrow(BM),ncol=1)
clab[which(BM$BM_label=="B")]=2
#color code for the 6 subtypes: 1 = FC (default), 2-6 assigned below
clab2 = matrix(1, nrow=nrow(BM),ncol=1)
clab2[which(BM$Subtype_label == "HCC")] =2
clab2[which(BM$Subtype_label == "PTC")] =3
clab2[which(BM$Subtype_label == "BFN")] =4
clab2[which(BM$Subtype_label == "FA")] =5
clab2[which(BM$Subtype_label == "HCA")] =6
plot(tmp$x[,1],tmp$x[,2],col=c("green","red")[clab],xlab="PC1",ylab="PC2")
legend("topright",legend = c("M","B"),fill =c("green","red"))
plot(tmp$x[,1],tmp$x[,2],col=c("green","blue","cyan","pink","red","purple")[clab2],xlab="PC1",ylab="PC2")
legend("bottomright",legend = c("FC","HCC","PTC","BFN","FA","HCA"),fill = c("green","blue","cyan","pink","red","purple"))
The first question is which model we want to use to train and predict the label. We want to validate as many methods as we can with the available resources. Here we validate logistic regression with elastic net regularization, support vector machine (SVM), random forest (RF), and neural network (nnet).
Since all these models have hyperparameters, we use nested cross-validation (CV) to evaluate each method: an outer 5-fold split for performance estimation and an inner 5-fold CV (via caret) for hyperparameter tuning. To make the validation stable, the nested CV should ideally be repeated 100 or more times with fresh folds; we do not have enough time for that here, but a sketch of how the repetition could be organized follows.
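As a minimal sketch, assuming a hypothetical helper run_nested_cv() that wraps the outer loop below and returns one out-of-fold AUC per model:
set.seed(123)
n_rep = 100 #number of nested-CV repeats
auc_mat = t(replicate(n_rep, run_nested_cv(BM, k = 5))) #run_nested_cv() is hypothetical
colMeans(auc_mat) #mean AUC per model across repeats
apply(auc_mat, 2, sd) #spread of the AUCs = stability of the estimate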
library(glmnet)
library(caret)
set.seed(123)
BM$BM_label = as.factor(BM$BM_label)
#split the data into 5 folds
BM_folds = createFolds(BM$BM_label, k = 5, list = FALSE)
cv_5 = trainControl(method = "cv", number = 5, classProbs = TRUE, verboseIter = FALSE, summaryFunction = twoClassSummary)
#collect the out-of-fold prediction scores from the outer CV; note that
#column 1 of predict(type = "prob") is P(B), the first factor level
BM_predicted = matrix(0, nrow = nrow(BM), ncol = 1)
BM_predicted_svm = matrix(0, nrow = nrow(BM), ncol = 1)
BM_predicted_rf = matrix(0, nrow = nrow(BM), ncol = 1)
BM_predicted_nnet = matrix(0, nrow = nrow(BM), ncol = 1)
tune_grid_neural <- expand.grid(size = c(1:5),decay = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1))
for(i in 1:5){
BM_idx = which(BM_folds == i)
BM_trn = BM[-BM_idx,]
BM_tst = BM[BM_idx,]
BM_trn = BM_trn[, -c(1, 3)] #drop sampleid and Subtype_label; keep BM_label + genes
BM_tst = BM_tst[, -c(1, 3)]
#train elastic net
def_elenet = train(
BM_label ~., data = BM_trn,
method = "glmnet",
metric = "ROC",
tuneLength =5,
verbose=FALSE,
trControl = cv_5
)
#collect the out-of-fold prediction scores from the trained elastic net
BM_predicted[BM_idx] = predict(def_elenet,newdata=BM_tst,type = "prob")[,1]
#train SVM
def_svm = train(
BM_label ~., data = BM_trn,
method = "svmLinear",
metric = "ROC",
tuneLength =5,
verbose=FALSE,
trControl = cv_5
)
#collect the out-of-fold prediction scores from the trained SVM
BM_predicted_svm[BM_idx] = predict(def_svm,newdata=BM_tst,type = "prob")[,1]
#train RF
def_rf = train(
BM_label ~., data = BM_trn,
method = "rf",
metric = "ROC",
tuneLength =5,
verbose=FALSE,
trControl = cv_5
)
#collect the out-of-fold prediction scores from the trained RF
BM_predicted_rf[BM_idx] = predict(def_rf,newdata=BM_tst,type = "prob")[,1]
#train neural network
def_nnet = train(
BM_label ~., data = BM_trn,
method = "nnet",
metric = "ROC",
verbose=FALSE,
trControl = cv_5,
trace = FALSE,
tuneGrid = tune_grid_neural,
linout = FALSE #classification, so keep the logistic output unit
)
#collect the out-of-fold prediction scores from the trained neural network
BM_predicted_nnet[BM_idx] = predict(def_nnet,newdata=BM_tst,type = "prob")[,1]
}
BM_label = BM$BM_label
library(pROC)
#plot ROC curve and calculate AUC for elastic net
roc_object <- roc(as.vector(BM_label) ~ as.vector(BM_predicted),plot=TRUE,print.auc=TRUE,legacy.axes=TRUE,col="green",main="ROC curve")
#plot ROC curve and calculate AUC for SVM
roc_object_svm <- roc(as.vector(BM_label) ~ as.vector(BM_predicted_svm),plot=TRUE,print.auc=TRUE,legacy.axes=TRUE,col="blue",main="ROC curve",add=TRUE,print.auc.y=0.4)
#plot ROC curve and calculate AUC for RF
roc_object_rf <- roc(as.vector(BM_label) ~ as.vector(BM_predicted_rf),plot=TRUE,print.auc=TRUE,legacy.axes=TRUE,col="purple",main="ROC curve",add=TRUE,print.auc.y=0.6)
#plot ROC curve and calculate AUC for Neural network
roc_object_nnet <- roc(as.vector(BM_label) ~ as.vector(BM_predicted_nnet),plot=TRUE,print.auc=TRUE,legacy.axes=TRUE,col="orange",main="ROC curve",add=TRUE,print.auc.y=0.2)
legend("bottomright",legend=c("Elastic net","SVM", "Random Forest","Neural Network"),col=c("green","blue","purple","orange"),lwd=4)
Based on the nested cross-validation performance, elastic net performs best, so we focus on it below; a quick DeLong test (sketched next) can check whether the AUC gaps are statistically meaningful. With more time, ensembling several of these models would certainly be worth trying.
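A minimal sketch of that check, applying pROC’s paired DeLong test to the out-of-fold scores collected above:
#paired because all ROC curves are built from the same out-of-fold samples
roc.test(roc_object, roc_object_svm, method = "delong", paired = TRUE)
roc.test(roc_object, roc_object_rf, method = "delong", paired = TRUE)
roc.test(roc_object, roc_object_nnet, method = "delong", paired = TRUE)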
BM_idx = createDataPartition(BM$BM_label, p = 0.75, list = FALSE) #randomly select 75% of samples to train the model and 25% to validate it
BM_trn = BM[BM_idx,]
BM_tst = BM[-BM_idx,]
BM_trn = BM_trn[, -c(1, 3)] #drop sampleid and Subtype_label; keep BM_label + genes
BM_tst = BM_tst[, -c(1, 3)]
def_elenet = train(
BM_label ~., data = BM_trn,
method = "glmnet",
metric = "ROC",
tuneLength =5,
verbose=FALSE,
trControl = cv_5,
trace = FALSE
)
BM_predict = predict(def_elenet,newdata=BM_tst,type = "prob")[,1] #column 1 is P(B)
library(pROC)
roc_object <- roc(as.vector(BM_tst$BM_label) ~ as.vector(BM_predict),plot=TRUE,print.auc=TRUE,legacy.axes=TRUE,col="green",main="ROC curve")
cutoff_id = which.min(abs(roc_object$sensitivities-0.9)) #threshold closest to 0.90 sensitivity (see note below)
#Sensitivity:
roc_object$sensitivities[cutoff_id]
## [1] 0.8965517
#Specificity:
roc_object$specificities[cutoff_id]
## [1] 0.9206349
#cutoff:
roc_object$thresholds[cutoff_id]
## [1] 0.8136003
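Note that which.min only picks the threshold closest to 0.90 sensitivity, and here it lands slightly below the 90% target. To guarantee sensitivity of at least 0.90, we can instead restrict to qualifying thresholds and take the one with the best specificity; a minimal sketch on the same roc_object:
ok = which(roc_object$sensitivities >= 0.90) #thresholds meeting the sensitivity requirement
best = ok[which.max(roc_object$specificities[ok])] #best specificity among them
c(sensitivity = roc_object$sensitivities[best],
specificity = roc_object$specificities[best],
cutoff = roc_object$thresholds[best])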
library(vioplot)
par(mfrow=c(1,2))
barplot(table(BM_tst$BM_label),main="number of samples for each label",ylab="#sample")
vioplot(BM_predict[which(BM_tst$BM_label == "B")], BM_predict[which(BM_tst$BM_label == "M")],
names = c("B","M"), main = "Violin plot of prediction scores by label", ylab = "prediction score")
par(mfrow=c(1,2))
barplot(table(BM$Subtype_label[-BM_idx]),main="number of samples for each subgroup in test set",ylab = "#sample")
sub_tst = BM$Subtype_label[-BM_idx] #subtype labels of the test samples
vioplot(BM_predict[which(sub_tst == "FC")], BM_predict[which(sub_tst == "HCC")],
BM_predict[which(sub_tst == "PTC")], BM_predict[which(sub_tst == "BFN")],
BM_predict[which(sub_tst == "FA")], BM_predict[which(sub_tst == "HCA")],
names = c("FC","HCC","PTC","BFN","FA","HCA"),
main = "Violin plot of prediction scores for the 6 subgroups", ylab = "prediction score")
We can see that the prediction scores of a small number of B and M samples overlap. Breaking the results down by subgroup, the M subgroup HCC is hard to differentiate from B: its prediction scores are very similar to those of the B samples. This is consistent with the PCA plot made before building the model, where the HCC samples were mixed in with the B samples. In such a case, HCC is likely hard to separate from B with a linear model; in the future we could consider ensembling a non-linear model such as a kernel SVM or a neural network, as sketched below.
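A minimal sketch of such an ensemble, simply averaging the out-of-fold scores of the elastic net and the neural network from the nested CV above (equal weights are an assumption, not a tuned choice):
BM_predicted_ens = (BM_predicted + BM_predicted_nnet) / 2 #both hold P(B), so their average does too
roc(as.vector(BM_label), as.vector(BM_predicted_ens)) #AUC of the averaged scores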