Description
The following code generates the figures and tables in the manuscript titled “Evaluation of in silico algorithms for use with ACMG/AMP clinical variant interpretation guidelines” by Rajarshi Ghosh, Ninad Oak and Sharon E. Plon, using the supplemental data of the manuscript.
Libraries required:
suppressWarnings(suppressMessages(library(tidyverse)))#version 1.1.1
theme_set(theme_bw())
library(formattable)#version 0.2.0.1
library(pvclust)#version 2.0-0
suppressWarnings(suppressMessages(library(dendextend)))#version 1.4.0
suppressWarnings(suppressMessages(library(pROC)))#version 1.9.1
library(OptimalCutpoints)#version 1.1-3
suppressWarnings(suppressMessages(library(data.table)))#version 1.10.4
Figure 1
We generated a matrix of binary predictions (pathogenic or benign) for 14819 ClinVar variants with scores from 18 algorithms. The thresholds of pathogenicity were publicly available or were inferred as noted in Supplemental Table 1. Dataset used : Supplemental_data_1.
Import the Supplemental_data_1.txt file. This file contains 14819 variants in Clinvar annotated with scores and predictions using thresholds specified in Additional Table 1.
# Load the ClinVar variants annotated with algorithm scores and predictions.
# Supplemental_data_1.txt has been updated with the correct LRT predictions;
# the rest of the file is unchanged.
df <- read.delim("~/Your_Path/Supplemental_data_1.txt", stringsAsFactors = FALSE)

# Keep the variant annotation columns plus every "*_pred" column.
preds <- df %>%
  select(id, clinical_significance:hgvs_p, ends_with("_pred"))

# Names of the columns at positions 6 to 23 of `preds`; these are recoded below.
vars <- names(preds)[6:23]

# Substitute "D" (Damaging) and "T" (Tolerant) with 1 and 0 respectively.
# Base R is used instead of the deprecated dplyr::mutate_each_()/funs() pair;
# the substitutions run in the same order (D -> 1, then T -> 0) on the same
# columns, and the results are character vectors exactly as before.
preds_all_var_bin <- preds
preds_all_var_bin[vars] <- lapply(
  preds_all_var_bin[vars],
  function(column) gsub("T", 0, gsub("D", 1, column))
)

# Get the set of variants without any missing data in column 2 or in the
# recoded prediction columns (6 to 23).
good_preds_all_var_bin <- complete.cases(preds_all_var_bin[, c(2, 6:23)])
all_alg_ok_preds <- preds_all_var_bin[good_preds_all_var_bin, ] # this will generate a dataset of 8386 variants
# Obtain the proportion of algorithms that are in complete concordance (dataset with 14819 variants)
# Per variant, count how many of the prediction columns (6 to 23) hold a "1"
# (damaging call). grep() does not match NA, so missing calls are not counted.
damaging <- mutate(
  preds_all_var_bin,
  dam = apply(
    preds_all_var_bin[, 6:23],
    1,
    function(calls) length(grep("1", as.factor(calls)))
  )
)

# Add the matching count of "0" (tolerant) calls, then the share of damaging
# calls among the counted calls; a variant with no damaging call is set to 1.
tol_dam_algorithms <- damaging %>%
  mutate(
    tol = apply(
      preds_all_var_bin[, 6:23],
      1,
      function(calls) length(grep("0", as.factor(calls)))
    )
  ) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))
# prop.concordant == 1 gives the variants with complete concordance (both True and False concordances)
# Figure 1A: Plot the heatmap for concordance among algorithms with all variants ClinVar 1* or greater.
# Replace the names of columns 6 to 23 with the algorithm display names.
colnames(tol_dam_algorithms)[6:23] <- c(
  "LRT", "MutationTaster", "PROVEAN", "SIFT", "FATHMM", "Polyphen2",
  "VEST3", "Mcap", "Mutpred", "Condel", "REVEL", "MutationAssessor",
  "MetaSVM", "MetaLR", "CADD", "DANN", "Eigen", "Genocanyon"
)

# Long format: one row per variant/algorithm pair. ClinVar_num is the character
# length of the assertion label and is only used to order the variant ids.
final_all_var_opt <- tol_dam_algorithms %>%
  gather(algorithm, prediction, c(LRT:Genocanyon)) %>%
  mutate(ClinVar_num = nchar(as.character(clinical_significance)))
final_all_var_opt$id <- reorder(
  final_all_var_opt$id,
  final_all_var_opt$ClinVar_num
)

# Fix the level order of the assertion and of the algorithms.
final_all_var_opt$clinical_significance <- factor(
  final_all_var_opt$clinical_significance,
  levels = c("Pathogenic", "Benign")
)
final_all_var_opt$algorithm <- factor(
  final_all_var_opt$algorithm,
  levels = c(
    "SIFT", "Polyphen2", "Mutpred", "PROVEAN", "MutationTaster", "CADD",
    "Condel", "Mcap", "MetaLR", "MetaSVM", "REVEL", "Eigen", "DANN",
    "FATHMM", "LRT", "MutationAssessor", "VEST3", "Genocanyon"
  )
)

# Base heatmap: one tile per variant (y) and algorithm (x), filled by prediction.
p <- ggplot(
  final_all_var_opt,
  aes(y = id, x = algorithm, fill = as.factor(prediction))
) +
  geom_tile() +
  scale_fill_manual(
    values = c("light green", "orange"),
    labels = c("Tolerant", "Damaging")
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))
p+facet_grid(clinical_significance~.,scale="free")+theme(axis.text.y=element_blank())+theme(axis.ticks.y=element_blank())+ylab("Variants")+theme(axis.title.y=element_text(size=20))+xlab("Algorithms")+theme(axis.title.x=element_text(size=20))+theme(axis.text.x=element_text(angle=90,size=18,hjust=1,vjust=.5))+ guides(fill=guide_legend(title="Algorithm Predictions"))+ theme(legend.text = element_text(colour="black", size = 16))+ theme(legend.title = element_text(colour="black", size=16))+theme(legend.position=c(0.7,0.5))+theme(strip.text.y=element_text(size=20,face="bold"))+theme(legend.position="none")
Figure 1B: Plot the heatmap for concordance among algorithms with all variants ClinVar 2* or greater.
# Filter the ClinVar 1* or above dataset, removing the variants whose
# review status contains "single" ("criteria, provided, single submitter").
tol_dam_algorithms_2star <- tol_dam_algorithms %>%
  filter(!grepl("single", review_status))

# Long format: one row per variant/algorithm pair. ClinVar_num is the character
# length of the assertion label and is only used to order the variant ids.
final_all_var_opt <- tol_dam_algorithms_2star %>%
  gather(algorithm, prediction, c(LRT:Genocanyon)) %>%
  mutate(ClinVar_num = nchar(as.character(clinical_significance)))
final_all_var_opt$id <- reorder(
  final_all_var_opt$id,
  final_all_var_opt$ClinVar_num
)

# Fix the level order of the assertion and of the algorithms.
final_all_var_opt$clinical_significance <- factor(
  final_all_var_opt$clinical_significance,
  levels = c("Pathogenic", "Benign")
)
final_all_var_opt$algorithm <- factor(
  final_all_var_opt$algorithm,
  levels = c(
    "SIFT", "Polyphen2", "Mutpred", "PROVEAN", "MutationTaster", "CADD",
    "Condel", "Mcap", "MetaLR", "MetaSVM", "REVEL", "Eigen", "DANN",
    "FATHMM", "LRT", "MutationAssessor", "VEST3", "Genocanyon"
  )
)

# Base heatmap: one tile per variant (y) and algorithm (x), filled by prediction.
p <- ggplot(
  final_all_var_opt,
  aes(y = id, x = algorithm, fill = as.factor(prediction))
) +
  geom_tile(aes()) +
  scale_fill_manual(
    values = c("light green", "orange"),
    labels = c("Tolerant", "Damaging")
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))
p+facet_grid(clinical_significance~.,scale="free")+theme(axis.text.y=element_blank())+theme(axis.ticks.y=element_blank())+ylab("Variants")+theme(axis.title.y=element_text(size=20))+xlab("Algorithms")+theme(axis.title.x=element_text(size=20))+theme(axis.text.x=element_text(angle=90,size=18,hjust=1,vjust=.5))+ guides(fill=guide_legend(title="Algorithm Predictions"))+ theme(legend.text = element_text(colour="black", size = 16))+ theme(legend.title = element_text(colour="black", size=16))+theme(legend.position=c(0.7,0.5))+theme(strip.text.y=element_text(size=20,face="bold"))+theme(legend.position="none")
Table 1
Generate data for Table 1: rows 1 and 2.
# Per ClinVar assertion: number of variants and number of variants whose
# predictions are unanimous (prop.concordant == 1).
final_table <- tol_dam_algorithms %>%
  group_by(clinical_significance) %>%
  summarise(
    count = n(),
    complete_concordance_num = sum(prop.concordant == 1, na.rm = TRUE)
  )

# Obtain the variants for which all algorithms' assertion was opposite to the
# assertion in ClinVar.
falsep <- with(
  tol_dam_algorithms,
  sum(dam == 0 & clinical_significance == "Pathogenic", na.rm = TRUE)
)
falseb <- with(
  tol_dam_algorithms,
  sum(tol == 0 & clinical_significance == "Benign", na.rm = TRUE)
)
# Benign first, then Pathogenic, to line up with the rows of final_table.
falseconc <- c(falseb, falsep)

# Net concordance excludes the false concordances; both rates are percentages
# of the group size rounded to two decimals.
final_table_all <- cbind(final_table, falseconc) %>%
  mutate(
    Net = complete_concordance_num - falseconc,
    Percent_concordant = round((Net / count) * 100, digits = 2),
    Percent_false_concordant = round((falseconc / count) * 100, digits = 2)
  ) %>%
  select(
    clinical_significance, count, Percent_concordant,
    falseconc, Percent_false_concordant
  )
colnames(final_table_all) <- c(
  "Clinical Significance", "Total variants", "Concordance(%)",
  "False Concordance", "False Concordance(%)"
)
# This table can also be drawn separately as follows:
formattable(final_table_all,list('Clinical Significance' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| Clinical Significance | Total variants | Concordance(%) | False Concordance | False Concordance(%) |
|---|---|---|---|---|
| Benign | 7346 | 5.20 | 57 | 0.78 |
| Pathogenic | 7473 | 39.21 | 2 | 0.03 |
Generate data for Table 1: rows 3 and 4.
# Same summary as rows 1 and 2, restricted to the ClinVar 2* or above variants.
final_table <- tol_dam_algorithms_2star %>%
  group_by(clinical_significance) %>%
  summarise(
    count = n(),
    complete_concordance_num = sum(prop.concordant == 1, na.rm = TRUE)
  )

# Variants for which every counted prediction contradicts the ClinVar assertion.
falsep <- with(
  tol_dam_algorithms_2star,
  sum(dam == 0 & clinical_significance == "Pathogenic", na.rm = TRUE)
)
falseb <- with(
  tol_dam_algorithms_2star,
  sum(tol == 0 & clinical_significance == "Benign", na.rm = TRUE)
)
# Benign first, then Pathogenic, to line up with the rows of final_table.
falseconc <- c(falseb, falsep)

final_table_twostars <- cbind(final_table, falseconc) %>%
  mutate(
    Net = complete_concordance_num - falseconc,
    Percent_concordant = round((Net / count) * 100, digits = 2),
    Percent_false_concordant = round((falseconc / count) * 100, 2)
  ) %>%
  select(
    clinical_significance, count, Percent_concordant,
    falseconc, Percent_false_concordant
  )
colnames(final_table_twostars) <- c(
  "Clinical Significance", "Total variants", "Concordance(%)",
  "False Concordance", "False Concordance(%)"
)
# This table can also be drawn separately as follows:
formattable(final_table_twostars,list('Clinical Significance' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| Clinical Significance | Total variants | Concordance(%) | False Concordance | False Concordance(%) |
|---|---|---|---|---|
| Benign | 1914 | 4.49 | 12 | 0.63 |
| Pathogenic | 1052 | 46.77 | 0 | 0.00 |
Generate data for Table 1: rows 5 and 6. Five commonly used algorithms as identified by literature search: CADD, Polyphen-2, SIFT, Provean, Mutationtaster
# Keep the annotation columns plus the five selected prediction columns,
# which land at positions 6 to 10.
df_top5 <- preds_all_var_bin %>%
  select(
    id:hgvs_p, SIFT_pred, Polyphen2_pred, PROVEAN_pred,
    CADD_pred, MutationTaster_pred
  )

# Per-variant counts of "1" (damaging) and "0" (tolerant) calls and the
# resulting concordance proportion (1 when there is no damaging call).
damaging <- mutate(
  df_top5,
  dam = apply(
    df_top5[, 6:10], 1,
    function(calls) length(grep("1", as.factor(calls)))
  )
)
tol_dam_algorithms_top5 <- damaging %>%
  mutate(
    tol = apply(
      df_top5[, 6:10], 1,
      function(calls) length(grep("0", as.factor(calls)))
    )
  ) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

final_table_top5 <- tol_dam_algorithms_top5 %>%
  group_by(clinical_significance) %>%
  summarise(
    count = n(),
    complete_concordance_num = sum(prop.concordant == 1, na.rm = TRUE)
  )

# Variants for which every counted prediction contradicts the ClinVar assertion.
falsep <- with(
  tol_dam_algorithms_top5,
  sum(dam == 0 & clinical_significance == "Pathogenic", na.rm = TRUE)
)
falseb <- with(
  tol_dam_algorithms_top5,
  sum(tol == 0 & clinical_significance == "Benign", na.rm = TRUE)
)
# Benign first, then Pathogenic, to line up with the rows of final_table_top5.
falseconc <- c(falseb, falsep)

final_table_top5 <- cbind(final_table_top5, falseconc) %>%
  mutate(
    Net = complete_concordance_num - falseconc,
    Percent_concordant = round((Net / count) * 100, digits = 2),
    Percent_false_concordant = round((falseconc / count) * 100, 2)
  ) %>%
  select(
    clinical_significance, count, Percent_concordant,
    falseconc, Percent_false_concordant
  )
colnames(final_table_top5) <- c(
  "Clinical Significance", "Total variants", "Concordance(%)",
  "False Concordance", "False Concordance(%)"
)
# This table can also be drawn separately as follows:
formattable(final_table_top5,list('Clinical Significance' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| Clinical Significance | Total variants | Concordance(%) | False Concordance | False Concordance(%) |
|---|---|---|---|---|
| Benign | 7346 | 33.54 | 815 | 11.09 |
| Pathogenic | 7473 | 79.00 | 68 | 0.91 |
Generate data for Table 1: rows 7 and 8. Three commonly used algorithms as identified by literature search: CADD, Polyphen-2, SIFT
# Keep the annotation columns plus the three selected prediction columns,
# which land at positions 6 to 8.
df_top3 <- preds_all_var_bin %>%
  select(id:hgvs_p, SIFT_pred, Polyphen2_pred, CADD_pred)

# Per-variant counts of "1" (damaging) and "0" (tolerant) calls and the
# resulting concordance proportion (1 when there is no damaging call).
damaging <- mutate(
  df_top3,
  dam = apply(
    df_top3[, 6:8], 1,
    function(calls) length(grep("1", as.factor(calls)))
  )
)
tol_dam_algorithms_top3 <- damaging %>%
  mutate(
    tol = apply(
      df_top3[, 6:8], 1,
      function(calls) length(grep("0", as.factor(calls)))
    )
  ) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

final_table_top3 <- tol_dam_algorithms_top3 %>%
  group_by(clinical_significance) %>%
  summarise(
    count = n(),
    complete_concordance_num = sum(prop.concordant == 1, na.rm = TRUE)
  )

# Variants for which every counted prediction contradicts the ClinVar assertion.
falsep <- with(
  tol_dam_algorithms_top3,
  sum(dam == 0 & clinical_significance == "Pathogenic", na.rm = TRUE)
)
falseb <- with(
  tol_dam_algorithms_top3,
  sum(tol == 0 & clinical_significance == "Benign", na.rm = TRUE)
)
# Benign first, then Pathogenic, to line up with the rows of final_table_top3.
falseconc <- c(falseb, falsep)

final_table_top3 <- cbind(final_table_top3, falseconc) %>%
  mutate(
    Net = complete_concordance_num - falseconc,
    Percent_concordant = round((Net / count) * 100, digits = 2),
    Percent_false_concordant = round((falseconc / count) * 100, 2)
  ) %>%
  select(
    clinical_significance, count, Percent_concordant,
    falseconc, Percent_false_concordant
  )
colnames(final_table_top3) <- c(
  "Clinical Significance", "Total variants", "Concordance(%)",
  "False Concordance", "False Concordance(%)"
)
# This table can also be drawn separately as follows:
formattable(final_table_top3,list('Clinical Significance' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| Clinical Significance | Total variants | Concordance(%) | False Concordance | False Concordance(%) |
|---|---|---|---|---|
| Benign | 7346 | 46.17 | 1340 | 18.24 |
| Pathogenic | 7473 | 84.87 | 156 | 2.09 |
Table 1: Compiling all the different combinations of algorithms in a single table. This is the table that is plotted in the manuscript.
# Tag each partial table with the variant set and the algorithm set it covers.
final_table_all <- final_table_all %>%
  mutate(Variants = "ClinVar * or above", Algorithms = "18")
final_table_twostars <- final_table_twostars %>%
  mutate(Variants = "ClinVar ** or above", Algorithms = "18")
final_table_top5 <- final_table_top5 %>%
  mutate(
    Variants = "ClinVar * or above",
    Algorithms = "Polyphen2;SIFT;CADD;PROVEAN;MTaster"
  )
final_table_top3 <- final_table_top3 %>%
  mutate(
    Variants = "ClinVar * or above",
    Algorithms = "Polyphen2;SIFT;CADD"
  )

# Stack the four partial tables and rename the first column for display.
final_table_fin <- rbind(
  final_table_all,
  final_table_twostars,
  final_table_top5,
  final_table_top3
)
colnames(final_table_fin)[1] <- "ClinVar Assertion"
# Draw the table
formattable(final_table_fin,list('ClinVar Assertion' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| ClinVar Assertion | Total variants | Concordance(%) | False Concordance | False Concordance(%) | Variants | Algorithms |
|---|---|---|---|---|---|---|
| Benign | 7346 | 5.20 | 57 | 0.78 | ClinVar * or above | 18 |
| Pathogenic | 7473 | 39.21 | 2 | 0.03 | ClinVar * or above | 18 |
| Benign | 1914 | 4.49 | 12 | 0.63 | ClinVar ** or above | 18 |
| Pathogenic | 1052 | 46.77 | 0 | 0.00 | ClinVar ** or above | 18 |
| Benign | 7346 | 33.54 | 815 | 11.09 | ClinVar * or above | Polyphen2;SIFT;CADD;PROVEAN;MTaster |
| Pathogenic | 7473 | 79.00 | 68 | 0.91 | ClinVar * or above | Polyphen2;SIFT;CADD;PROVEAN;MTaster |
| Benign | 7346 | 46.17 | 1340 | 18.24 | ClinVar * or above | Polyphen2;SIFT;CADD |
| Pathogenic | 7473 | 84.87 | 156 | 2.09 | ClinVar * or above | Polyphen2;SIFT;CADD |
Obtain the proportion of algorithms that are in complete concordance (for variants without missing data)
# Per variant of the complete-case set, count how many of the prediction
# columns (6 to 23) hold a "1" (damaging call).
damaging <- mutate(
  all_alg_ok_preds,
  dam = apply(
    all_alg_ok_preds[, 6:23],
    1,
    function(calls) length(grep("1", as.factor(calls)))
  )
)

# Add the matching count of "0" (tolerant) calls, then the share of damaging
# calls among the counted calls; a variant with no damaging call is set to 1.
tol_dam_algorithms_s1 <- damaging %>%
  mutate(
    tol = apply(
      all_alg_ok_preds[, 6:23],
      1,
      function(calls) length(grep("0", as.factor(calls)))
    )
  ) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))
# prop.concordant == 1 gives the variants with complete concordance (both True and False concordances)
# Additional Figure 1A: Plot the heatmap for concordance among algorithms with all variants ClinVar 1* or greater and without any missing data.
# Replace the names of columns 6 to 23 with the algorithm display names.
colnames(tol_dam_algorithms_s1)[6:23] <- c(
  "LRT", "MutationTaster", "PROVEAN", "SIFT", "FATHMM", "Polyphen2",
  "VEST3", "Mcap", "Mutpred", "Condel", "REVEL", "MutationAssessor",
  "MetaSVM", "MetaLR", "CADD", "DANN", "Eigen", "Genocanyon"
)

# Long format: one row per variant/algorithm pair. ClinVar_num is the character
# length of the assertion label and is only used to order the variant ids.
final_all_var_opt <- tol_dam_algorithms_s1 %>%
  gather(algorithm, prediction, c(LRT:Genocanyon)) %>%
  mutate(ClinVar_num = nchar(as.character(clinical_significance)))
final_all_var_opt$id <- reorder(
  final_all_var_opt$id,
  final_all_var_opt$ClinVar_num
)

# Fix the level order of the assertion and of the algorithms.
final_all_var_opt$clinical_significance <- factor(
  final_all_var_opt$clinical_significance,
  levels = c("Pathogenic", "Benign")
)
final_all_var_opt$algorithm <- factor(
  final_all_var_opt$algorithm,
  levels = c(
    "SIFT", "Polyphen2", "Mutpred", "PROVEAN", "MutationTaster", "CADD",
    "Condel", "Mcap", "MetaLR", "MetaSVM", "REVEL", "Eigen", "DANN",
    "FATHMM", "LRT", "MutationAssessor", "VEST3", "Genocanyon"
  )
)

# Base heatmap: one tile per variant (y) and algorithm (x), filled by prediction.
p <- ggplot(
  final_all_var_opt,
  aes(y = id, x = algorithm, fill = as.factor(prediction))
) +
  geom_tile() +
  scale_fill_manual(
    values = c("light green", "orange"),
    labels = c("Tolerant", "Damaging")
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))
p+facet_grid(clinical_significance~.,scale="free")+theme(axis.text.y=element_blank())+theme(axis.ticks.y=element_blank())+ylab("Variants")+theme(axis.title.y=element_text(size=20))+xlab("Algorithms")+theme(axis.title.x=element_text(size=20))+theme(axis.text.x=element_text(angle=90,size=18,hjust=1,vjust=.5))+ guides(fill=guide_legend(title="Algorithm Predictions"))+ theme(legend.text = element_text(colour="black", size = 16))+ theme(legend.title = element_text(colour="black", size=16))+theme(legend.position=c(0.7,0.5))+theme(strip.text.y=element_text(size=20,face="bold"))+theme(legend.position="none")
Additional Figure 1B: Plot the heatmap for concordance among algorithms with variants ClinVar 2* or greater and without any missing data.
# Filter the ClinVar 1* or above dataset, removing the variants whose
# review status contains "single" ("criteria, provided, single submitter").
tol_dam_algorithms_2star_s1 <- tol_dam_algorithms_s1 %>%
  filter(!grepl("single", review_status))

# Long format: one row per variant/algorithm pair. ClinVar_num is the character
# length of the assertion label and is only used to order the variant ids.
final_all_var_opt <- tol_dam_algorithms_2star_s1 %>%
  gather(algorithm, prediction, c(LRT:Genocanyon)) %>%
  mutate(ClinVar_num = nchar(as.character(clinical_significance)))
final_all_var_opt$id <- reorder(
  final_all_var_opt$id,
  final_all_var_opt$ClinVar_num
)

# Fix the level order of the assertion and of the algorithms.
final_all_var_opt$clinical_significance <- factor(
  final_all_var_opt$clinical_significance,
  levels = c("Pathogenic", "Benign")
)
final_all_var_opt$algorithm <- factor(
  final_all_var_opt$algorithm,
  levels = c(
    "SIFT", "Polyphen2", "Mutpred", "PROVEAN", "MutationTaster", "CADD",
    "Condel", "Mcap", "MetaLR", "MetaSVM", "REVEL", "Eigen", "DANN",
    "FATHMM", "LRT", "MutationAssessor", "VEST3", "Genocanyon"
  )
)

# Base heatmap: one tile per variant (y) and algorithm (x), filled by prediction.
p <- ggplot(
  final_all_var_opt,
  aes(y = id, x = algorithm, fill = as.factor(prediction))
) +
  geom_tile(aes()) +
  scale_fill_manual(
    values = c("light green", "orange"),
    labels = c("Tolerant", "Damaging")
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))
p+facet_grid(clinical_significance~.,scale="free")+theme(axis.text.y=element_blank())+theme(axis.ticks.y=element_blank())+ylab("Variants")+theme(axis.title.y=element_text(size=20))+xlab("Algorithms")+theme(axis.title.x=element_text(size=20))+theme(axis.text.x=element_text(angle=90,size=18,hjust=1,vjust=.5))+ guides(fill=guide_legend(title="Algorithm Predictions"))+ theme(legend.text = element_text(colour="black", size = 16))+ theme(legend.title = element_text(colour="black", size=16))+theme(legend.position=c(0.7,0.5))+theme(strip.text.y=element_text(size=20,face="bold"))+theme(legend.position="none")
Additional Table 1
Generate data for Additional Table 1: rows 1 and 2.
# Per ClinVar assertion (complete-case variants): number of variants and number
# of variants whose predictions are unanimous (prop.concordant == 1).
final_table <- tol_dam_algorithms_s1 %>%
  group_by(clinical_significance) %>%
  summarise(
    count = n(),
    complete_concordance_num = sum(prop.concordant == 1, na.rm = TRUE)
  )

# Obtain the variants for which all algorithms' assertion was opposite to the
# assertion in ClinVar.
falsep <- with(
  tol_dam_algorithms_s1,
  sum(dam == 0 & clinical_significance == "Pathogenic", na.rm = TRUE)
)
falseb <- with(
  tol_dam_algorithms_s1,
  sum(tol == 0 & clinical_significance == "Benign", na.rm = TRUE)
)
# Benign first, then Pathogenic, to line up with the rows of final_table.
falseconc <- c(falseb, falsep)

final_table_all <- cbind(final_table, falseconc) %>%
  mutate(
    Net = complete_concordance_num - falseconc,
    Percent_concordant = round((Net / count) * 100, digits = 2),
    Percent_false_concordant = round((falseconc / count) * 100, digits = 2)
  ) %>%
  select(
    clinical_significance, count, Percent_concordant,
    falseconc, Percent_false_concordant
  )
colnames(final_table_all) <- c(
  "Clinical Significance", "Total variants", "Concordance(%)",
  "False Concordance", "False Concordance(%)"
)
# This table can also be drawn separately as follows:
formattable(final_table_all,list('Clinical Significance' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| Clinical Significance | Total variants | Concordance(%) | False Concordance | False Concordance(%) |
|---|---|---|---|---|
| Benign | 2555 | 3.21 | 40 | 1.57 |
| Pathogenic | 5831 | 41.47 | 1 | 0.02 |
Generate data for Additional Table 1: rows 3 and 4.
# Same summary as rows 1 and 2, restricted to the ClinVar 2* or above
# complete-case variants.
final_table <- tol_dam_algorithms_2star_s1 %>%
  group_by(clinical_significance) %>%
  summarise(
    count = n(),
    complete_concordance_num = sum(prop.concordant == 1, na.rm = TRUE)
  )

# Variants for which every prediction contradicts the ClinVar assertion.
falsep <- with(
  tol_dam_algorithms_2star_s1,
  sum(dam == 0 & clinical_significance == "Pathogenic", na.rm = TRUE)
)
falseb <- with(
  tol_dam_algorithms_2star_s1,
  sum(tol == 0 & clinical_significance == "Benign", na.rm = TRUE)
)
# Benign first, then Pathogenic, to line up with the rows of final_table.
falseconc <- c(falseb, falsep)

final_table_twostars <- cbind(final_table, falseconc) %>%
  mutate(
    Net = complete_concordance_num - falseconc,
    Percent_concordant = round((Net / count) * 100, digits = 2),
    Percent_false_concordant = round((falseconc / count) * 100, 2)
  ) %>%
  select(
    clinical_significance, count, Percent_concordant,
    falseconc, Percent_false_concordant
  )
colnames(final_table_twostars) <- c(
  "Clinical Significance", "Total variants", "Concordance(%)",
  "False Concordance", "False Concordance(%)"
)
# This table can also be drawn separately as follows:
formattable(final_table_twostars,list('Clinical Significance' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| Clinical Significance | Total variants | Concordance(%) | False Concordance | False Concordance(%) |
|---|---|---|---|---|
| Benign | 470 | 2.13 | 11 | 2.34 |
| Pathogenic | 848 | 50.59 | 0 | 0.00 |
Generate data for Additional Table 1: rows 5 and 6. Five commonly used algorithms as identified by literature search: CADD, Polyphen-2, SIFT, Provean, Mutationtaster
# Complete-case variants: keep the annotation columns plus the five selected
# prediction columns, which land at positions 6 to 10.
df_top5 <- all_alg_ok_preds %>%
  select(
    id:hgvs_p, SIFT_pred, Polyphen2_pred, PROVEAN_pred,
    CADD_pred, MutationTaster_pred
  )

# Per-variant counts of "1" (damaging) and "0" (tolerant) calls and the
# resulting concordance proportion (1 when there is no damaging call).
damaging <- mutate(
  df_top5,
  dam = apply(
    df_top5[, 6:10], 1,
    function(calls) length(grep("1", as.factor(calls)))
  )
)
tol_dam_algorithms_top5_s1 <- damaging %>%
  mutate(
    tol = apply(
      df_top5[, 6:10], 1,
      function(calls) length(grep("0", as.factor(calls)))
    )
  ) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

final_table_top5 <- tol_dam_algorithms_top5_s1 %>%
  group_by(clinical_significance) %>%
  summarise(
    count = n(),
    complete_concordance_num = sum(prop.concordant == 1, na.rm = TRUE)
  )

# Variants for which every prediction contradicts the ClinVar assertion.
falsep <- with(
  tol_dam_algorithms_top5_s1,
  sum(dam == 0 & clinical_significance == "Pathogenic", na.rm = TRUE)
)
falseb <- with(
  tol_dam_algorithms_top5_s1,
  sum(tol == 0 & clinical_significance == "Benign", na.rm = TRUE)
)
# Benign first, then Pathogenic, to line up with the rows of final_table_top5.
falseconc <- c(falseb, falsep)

final_table_top5 <- cbind(final_table_top5, falseconc) %>%
  mutate(
    Net = complete_concordance_num - falseconc,
    Percent_concordant = round((Net / count) * 100, digits = 2),
    Percent_false_concordant = round((falseconc / count) * 100, 2)
  ) %>%
  select(
    clinical_significance, count, Percent_concordant,
    falseconc, Percent_false_concordant
  )
colnames(final_table_top5) <- c(
  "Clinical Significance", "Total variants", "Concordance(%)",
  "False Concordance", "False Concordance(%)"
)
# This table can also be drawn separately as follows:
formattable(final_table_top5,list('Clinical Significance' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| Clinical Significance | Total variants | Concordance(%) | False Concordance | False Concordance(%) |
|---|---|---|---|---|
| Benign | 2555 | 29.47 | 280 | 10.96 |
| Pathogenic | 5831 | 81.67 | 43 | 0.74 |
Generate data for Additional Table 1: rows 7 and 8. Three commonly used algorithms as identified by literature search: CADD, Polyphen-2, SIFT
# Complete-case variants: keep the annotation columns plus the three selected
# prediction columns, which land at positions 6 to 8.
df_top3 <- all_alg_ok_preds %>%
  select(id:hgvs_p, SIFT_pred, Polyphen2_pred, CADD_pred)

# Per-variant counts of "1" (damaging) and "0" (tolerant) calls and the
# resulting concordance proportion (1 when there is no damaging call).
damaging <- mutate(
  df_top3,
  dam = apply(
    df_top3[, 6:8], 1,
    function(calls) length(grep("1", as.factor(calls)))
  )
)
tol_dam_algorithms_top3_s1 <- damaging %>%
  mutate(
    tol = apply(
      df_top3[, 6:8], 1,
      function(calls) length(grep("0", as.factor(calls)))
    )
  ) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

final_table_top3 <- tol_dam_algorithms_top3_s1 %>%
  group_by(clinical_significance) %>%
  summarise(
    count = n(),
    complete_concordance_num = sum(prop.concordant == 1, na.rm = TRUE)
  )

# Variants for which every prediction contradicts the ClinVar assertion.
falsep <- with(
  tol_dam_algorithms_top3_s1,
  sum(dam == 0 & clinical_significance == "Pathogenic", na.rm = TRUE)
)
falseb <- with(
  tol_dam_algorithms_top3_s1,
  sum(tol == 0 & clinical_significance == "Benign", na.rm = TRUE)
)
# Benign first, then Pathogenic, to line up with the rows of final_table_top3.
falseconc <- c(falseb, falsep)

final_table_top3 <- cbind(final_table_top3, falseconc) %>%
  mutate(
    Net = complete_concordance_num - falseconc,
    Percent_concordant = round((Net / count) * 100, digits = 2),
    Percent_false_concordant = round((falseconc / count) * 100, 2)
  ) %>%
  select(
    clinical_significance, count, Percent_concordant,
    falseconc, Percent_false_concordant
  )
colnames(final_table_top3) <- c(
  "Clinical Significance", "Total variants", "Concordance(%)",
  "False Concordance", "False Concordance(%)"
)
# This table can also be drawn separately as follows:
formattable(final_table_top3,list('Clinical Significance' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| Clinical Significance | Total variants | Concordance(%) | False Concordance | False Concordance(%) |
|---|---|---|---|---|
| Benign | 2555 | 44.15 | 494 | 19.33 |
| Pathogenic | 5831 | 86.76 | 97 | 1.66 |
Additional Table 1: Compiling all the different combinations of algorithms in a single table. This is the table that is plotted in the manuscript.
# Tag each partial table with the variant set and the algorithm set it covers.
final_table_all <- final_table_all %>%
  mutate(Variants = "ClinVar * or above", Algorithms = "18")
final_table_twostars <- final_table_twostars %>%
  mutate(Variants = "ClinVar ** or above", Algorithms = "18")
final_table_top5 <- final_table_top5 %>%
  mutate(
    Variants = "ClinVar * or above",
    Algorithms = "Polyphen2;SIFT;CADD;PROVEAN;MTaster"
  )
final_table_top3 <- final_table_top3 %>%
  mutate(
    Variants = "ClinVar * or above",
    Algorithms = "Polyphen2;SIFT;CADD"
  )

# Stack the four partial tables and rename the first column for display.
final_table_fin <- rbind(
  final_table_all,
  final_table_twostars,
  final_table_top5,
  final_table_top3
)
colnames(final_table_fin)[1] <- "ClinVar Assertion"
# Draw the table
formattable(final_table_fin,list('ClinVar Assertion' = formatter("span",style = function(x) ifelse(x =="Benign", "color:green", "color:red"))))
| ClinVar Assertion | Total variants | Concordance(%) | False Concordance | False Concordance(%) | Variants | Algorithms |
|---|---|---|---|---|---|---|
| Benign | 2555 | 3.21 | 40 | 1.57 | ClinVar * or above | 18 |
| Pathogenic | 5831 | 41.47 | 1 | 0.02 | ClinVar * or above | 18 |
| Benign | 470 | 2.13 | 11 | 2.34 | ClinVar ** or above | 18 |
| Pathogenic | 848 | 50.59 | 0 | 0.00 | ClinVar ** or above | 18 |
| Benign | 2555 | 29.47 | 280 | 10.96 | ClinVar * or above | Polyphen2;SIFT;CADD;PROVEAN;MTaster |
| Pathogenic | 5831 | 81.67 | 43 | 0.74 | ClinVar * or above | Polyphen2;SIFT;CADD;PROVEAN;MTaster |
| Benign | 2555 | 44.15 | 494 | 19.33 | ClinVar * or above | Polyphen2;SIFT;CADD |
| Pathogenic | 5831 | 86.76 | 97 | 1.66 | ClinVar * or above | Polyphen2;SIFT;CADD |
Generating Supplemental data 2-5 for concordances among different combinations of algorithms for variants without any missing data. Used for Figure 2A, 2B and Table 2. Uncomment to run the code.
# df_for_combo<-all_alg_ok_preds %>%select(hgvs_p,clinical_significance,LRT_pred:Genocanyon_pred)
#
# df_fin<-data.frame(hgvs_p=df_for_combo$hgvs_p,clinsig=df_for_combo$clinical_significance,apply(df_for_combo[,3:20],2,function(x)as.numeric(as.character(x))))#convert to numeric
#
# df_fin_benign<-df_fin%>%filter(grepl("Benign",clinsig)) #select benign variant predictions
# df_fin_path<-df_fin%>%filter(grepl("Pathogenic",clinsig)) #select pathogenic variant predictions
# j_path2 = combn(x = df_fin_path[,3:20], m = 2, simplify = FALSE)
# sapply(j_path2, function(x) length(which(apply(x,1,sum) == 2 ))/NROW(x[complete.cases(x),]))->path_true_concordant2
# sapply(j_path2, function(x) length(which(apply(x,1,sum) ==0 ))/NROW(x[complete.cases(x),]))->path_false_concordant2
#
# df_path_2<-data.frame(True_concordance=path_true_concordant2,False_concordance=path_false_concordant2,Clinsig=rep("Pathogenic",length(j_path2)))
#
# grp_name2newpath<-data.frame(t(sapply(j_path2,function(x) colnames(x)[1:2])))
# grp_name2_fin_newpath<-cbind(grp=c(1:nrow(grp_name2newpath)),grp_name2newpath)
# df_2newpath<-cbind(grp_name2_fin_newpath,df_path_2)
#
# #Dataset: df_fin_benign = a subset of the 'preds_all_var_bin' dataframe of Benign variants with predictions from 18 algorithms as above.
# j_ben2 = combn(x = df_fin_benign[,3:20], m = 2, simplify = FALSE)
# sapply(j_ben2, function(x) length(which(apply(x,1,sum)==0))/NROW(x[complete.cases(x),]))->ben_true_concordant2
# sapply(j_ben2, function(x) length(which(apply(x,1,sum)==2))/NROW(x[complete.cases(x),]))->ben_false_concordant2
#
# df_ben_2<-data.frame(True_concordance=ben_true_concordant2,False_concordance=ben_false_concordant2,Clinsig=rep("Benign",length(j_ben2)))
#
#
# grp_name2newben<-data.frame(t(sapply(j_ben2,function(x) colnames(x)[1:2])))
# grp_name2_fin_newben<-cbind(grp=c(1:nrow(grp_name2newben)),grp_name2newben)
# df_2newben<-cbind(grp_name2_fin_newben,df_ben_2)
#
# df_2<-rbind(df_2newpath,df_2newben)
# df_2<-df_2%>%mutate(num.algorithm=2)
# #write.table(df_2,"~/Your_Path/Supplemental_data_2.txt")
#
# # #All combinations of 3 algorithms
# # #pathogenic variants
# j_path3 = combn(x = df_fin_path[,3:20], m = 3, simplify = FALSE)
# sapply(j_path3, function(x) length(which(apply(x,1,sum) == 3))/NROW(x[complete.cases(x),]))->path_true_concordant3
# sapply(j_path3, function(x) length(which(apply(x,1,sum) ==0))/NROW(x[complete.cases(x),]))->path_false_concordant3
#
# df_path_3<-data.frame(True_concordance=path_true_concordant3,False_concordance=path_false_concordant3,Clinsig=rep("Pathogenic",length(j_path3)))
#
# grp_name3newpath<-data.frame(t(sapply(j_path3,function(x) colnames(x)[1:3])))
# grp_name3_fin_newpath<-cbind(grp=c(1:nrow(grp_name3newpath)),grp_name3newpath)
# df_3newpath<-cbind(grp_name3_fin_newpath,df_path_3)
#
# j_ben3 = combn(x = df_fin_benign[,3:20], m = 3, simplify = FALSE)
# sapply(j_ben3, function(x) length(which(apply(x,1,sum)==0))/NROW(x[complete.cases(x),]))->ben_true_concordant3
# sapply(j_ben3, function(x) length(which(apply(x,1,sum)==3))/NROW(x[complete.cases(x),]))->ben_false_concordant3
#
# df_ben_3<-data.frame(True_concordance=ben_true_concordant3,False_concordance=ben_false_concordant3,Clinsig=rep("Benign",length(j_ben3)))
#
# grp_name3newben<-data.frame(t(sapply(j_ben3,function(x) colnames(x)[1:3])))
# grp_name3_fin_newben<-cbind(grp=c(1:nrow(grp_name3newben)),grp_name3newben)
# df_3newben<-cbind(grp_name3_fin_newben,df_ben_3)
#
# df_3<-rbind(df_3newpath,df_3newben)
# df_3<-df_3%>%mutate(num.algorithm=3)
# #write.table(df_3,"~/Your_Path/Supplemental_data_3.txt")
#
# # #All combinations of 4 algorithms
# j_path4 = combn(x = df_fin_path[,3:20], m = 4, simplify = FALSE)
# sapply(j_path4, function(x) length(which(apply(x,1,sum) == 4 ))/NROW(x[complete.cases(x),]))->path_true_concordant4
# sapply(j_path4, function(x) length(which(apply(x,1,sum) ==0 ))/NROW(x[complete.cases(x),]))->path_false_concordant4
#
# df_path_4<-data.frame(True_concordance=path_true_concordant4,False_concordance=path_false_concordant4,Clinsig=rep("Pathogenic",length(j_path4)))
#
# grp_name4newpath<-data.frame(t(sapply(j_path4,function(x) colnames(x)[1:4])))
# grp_name4_fin_newpath<-cbind(grp=c(1:nrow(grp_name4newpath)),grp_name4newpath)
# df_4newpath<-cbind(grp_name4_fin_newpath,df_path_4)
#
#
# j_ben4 = combn(x = df_fin_benign[,3:20], m = 4, simplify = FALSE)
# sapply(j_ben4, function(x) length(which(apply(x,1,sum)==0))/NROW(x[complete.cases(x),]))->ben_true_concordant4
# sapply(j_ben4, function(x) length(which(apply(x,1,sum)==4))/NROW(x[complete.cases(x),]))->ben_false_concordant4
#
# df_ben_4<-data.frame(True_concordance=ben_true_concordant4,False_concordance=ben_false_concordant4,Clinsig=rep("Benign",length(j_ben4)))
#
# grp_name4newben<-data.frame(t(sapply(j_ben4,function(x) colnames(x)[1:4])))
# grp_name4_fin_newben<-cbind(grp=c(1:nrow(grp_name4newben)),grp_name4newben)
# df_4newben<-cbind(grp_name4_fin_newben,df_ben_4)
#
# df_4<-rbind(df_4newpath,df_4newben)
# df_4<-df_4%>%mutate(num.algorithm=4)
# #write.table(df_4,"~/Your_Path/Supplemental_data_4.txt")
#
# # #All combinations of 5 algorithms
# j_path5 = combn(x = df_fin_path[,3:20], m = 5, simplify = FALSE)
# sapply(j_path5, function(x) length(which(apply(x,1,sum) == 5))/NROW(x[complete.cases(x),]))->path_true_concordant5
# sapply(j_path5, function(x) length(which(apply(x,1,sum) ==0))/NROW(x[complete.cases(x),]))->path_false_concordant5
#
# df_path_5<-data.frame(True_concordance=path_true_concordant5,False_concordance=path_false_concordant5,Clinsig=rep("Pathogenic",length(j_path5)))
#
# grp_name5newpath<-data.frame(t(sapply(j_path5,function(x) colnames(x)[1:5])))
# grp_name5_fin_newpath<-cbind(grp=c(1:nrow(grp_name5newpath)),grp_name5newpath)
# df_5newpath<-cbind(grp_name5_fin_newpath,df_path_5)
#
#
#
# j_ben5 = combn(x = df_fin_benign[,3:20], m = 5, simplify = FALSE)
# sapply(j_ben5, function(x) length(which(apply(x,1,sum)==0))/NROW(x[complete.cases(x),]))->ben_true_concordant5
# sapply(j_ben5, function(x) length(which(apply(x,1,sum)==5))/NROW(x[complete.cases(x),]))->ben_false_concordant5
#
# df_ben_5<-data.frame(True_concordance=ben_true_concordant5,False_concordance=ben_false_concordant5,Clinsig=rep("Benign",length(j_ben5)))
#
# grp_name5newben<-data.frame(t(sapply(j_ben5,function(x) colnames(x)[1:5])))
# grp_name5_fin_newben<-cbind(grp=c(1:nrow(grp_name5newben)),grp_name5newben)
# df_5newben<-cbind(grp_name5_fin_newben,df_ben_5)
#
# df_5<-rbind(df_5newpath,df_5newben)
# df_5<-df_5%>%mutate(num.algorithm=5)
# # #write.table(df_5,"~/Your_Path/Supplemental_data_5.txt")
Figure 2
This figure plots the concordance among different combinations of algorithms for benign and pathogenic variants. It also identifies algorithms more likely to be in concordance by hierarchical clustering.
Figure 2A: Plotting histogram of true concordance among pairs of algorithms.
# Import Supplemental data 2 (the file read below) to plot Figure 2A.
# The first line of the file is used as the column header.
df_for_plot <- read.table(
  file = "~/Your_Path/Supplemental_data_2.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)
ggplot(df_for_plot[,c(4,6)],aes(True_concordance))+geom_histogram(data=subset(df_for_plot[,c(4,6)],Clinsig=="Pathogenic"),fill="orange",size=1.5,alpha = .4,bins=20)+geom_histogram(data=subset(df_for_plot[,c(4,6)],Clinsig=="Benign"),fill="green",size=1.5,alpha = .4,bins=15)+theme(panel.border=element_rect(colour="black",size=2))+theme(axis.title.y=element_text(size=20))+theme(axis.title.x= element_text(size=20))+theme(axis.text.x= element_text(size=20,angle=90,hjust=1,vjust=.5))+theme(axis.text.y= element_text(size=20))+ylab("Number of pairs of algorithms")+xlab("Proportion of variants with concordant predictions")+theme(strip.text.x = element_text(size = 14, colour = "black",face=c('bold')))+scale_y_continuous(limits=c(0,50),expand=c(0.01,0))+scale_x_continuous(limits=c(0,1),expand=c(0.01,0))
Figure 2B: Plot True vs False concordance for various combinations of algorithms.
# Data: Import Supplemental data 3 through 5. These datasets contain the true and false concordances of various combinations of algorithms.
# Load the concordance tables for the three larger combination sizes.
df_3 <- read.delim("~/Your_Path/Supplemental_data_3.txt", stringsAsFactors = FALSE)
df_4 <- read.delim("~/Your_Path/Supplemental_data_4.txt", stringsAsFactors = FALSE)
df_5 <- read.delim("~/Your_Path/Supplemental_data_5.txt", stringsAsFactors = FALSE)

# Stack the four trailing columns of each table; the leading columns differ in
# number between the tables, hence the shifting column ranges.
final_table_for_plot_new <- rbind(
  df_3[, 5:8],
  df_4[, 6:9],
  df_5[, 7:10]
)

# Scatter of false vs. true concordance (as percentages), one panel per
# num.algorithm value, with marginal rugs: pathogenic on the top/right edges,
# benign on the bottom/left edges. Layer order is kept as originally written.
ggplot(
  final_table_for_plot_new,
  aes(x = 100 * False_concordance, y = 100 * True_concordance)
) +
  geom_point(aes(colour = Clinsig), size = 1, alpha = 0.4) +
  geom_rug(
    data = subset(final_table_for_plot_new, Clinsig == "Pathogenic"),
    aes(colour = Clinsig), alpha = 0.5, sides = "t"
  ) +
  geom_rug(
    data = subset(final_table_for_plot_new, Clinsig == "Benign"),
    aes(colour = Clinsig), alpha = 0.5, sides = "b"
  ) +
  geom_rug(
    data = subset(final_table_for_plot_new, Clinsig == "Pathogenic"),
    aes(colour = Clinsig), alpha = 0.5, sides = "r"
  ) +
  geom_rug(
    data = subset(final_table_for_plot_new, Clinsig == "Benign"),
    aes(colour = Clinsig), alpha = 0.5, sides = "l"
  ) +
  facet_grid(. ~ num.algorithm) +
  scale_colour_manual(values = c("light green", "orange")) +
  scale_y_continuous(limits = c(0, 100)) +
  xlab("False Concordance(%)") +
  ylab("True concordance(%)") +
  theme(
    axis.text.x = element_text(size = 16, angle = 90),
    axis.text.y = element_text(size = 16),
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    panel.border = element_rect(colour = "black", size = 2),
    legend.position = "none",
    strip.text.x = element_text(size = 20, face = "bold")
  )
# Figure 2C: Hierarchical clustering of algorithms.
# Supplemental_data_1 was processed , clustered and saved as follows.
# df<-read.delim("~/Your_Path/Supplemental_data_1.txt", stringsAsFactors=FALSE)
# df_final_hclust<-df %>% select(id,dplyr::contains("_score")) %>% select(-CADD_score,-Eigen_score) %>% mutate(Eigen_raw_score=as.numeric(as.character(Eigen_raw_score))) %>% mutate(PROVEAN_score=-1*PROVEAN_score_new,FATHMM_score=-1*FATHMM_score_new,SIFT_score=-1*SIFT_score_new,LRT_score=-1*LRT_score) %>% select(-SIFT_score_new,-FATHMM_score_new,-PROVEAN_score_new) %>% rename_(.dots=setNames(names(.),gsub("_score", "", names(.))))%>% rename_(.dots=setNames(names(.),gsub("_raw", "", names(.))))%>% rename_(.dots=setNames(names(.),gsub("_new", "", names(.)))) # changed contains to dplyr::contains .
# df_forpvclust<-df_final_hclust[,-1]
# pvclust(scale(df_forpvclust),method.hclust = "ward.D2",nboot = 1000)->fit
#saveRDS(fit,"~/Your_Path/Supplemental_data_16.rds")
# The following code was used for plotting Figure 2C. 'fit' is read back from Supplemental_data_16.rds.
# Load the saved clustering result.
fit <- readRDS("~/Your_Path/Supplemental_data_16.rds")

# Draw the dendrogram with hanging leaves, thicker branches and default-size
# labels.
fit %>%
  as.dendrogram() %>%
  hang.dendrogram() %>%
  dendextend::set("branches_lwd", 4) %>%
  dendextend::set("labels_cex", 1) %>%
  plot()

# Add the pvclust rectangles at alpha = 0.99 on top of the dendrogram.
pvrect(fit, alpha = 0.99)
# Figure 3
Performance of algorithms on different datasets.
Figure 3: AUCs of each algorithm on various datasets included in the manuscript.
#The AUCs for the ROC curve of different algorithms were generated using the following general code.
#Here 'dataset' refers to any of the datasets in the column labels of Figures 3A, 3B or Additional Figure 3. These correspond to Supplemental data 1, and Supplemental data 6 through Supplemental data 14 and Supplemental data 18 and 19, which were processed further and used in the general code below.
# library(OptimalCutpoints)
# library(data.table)
# #Getting AUCs
# auc_SpSe <- function(x) optimal.cutpoints(X = "Score", status = "labels", tag.healthy = 0, methods = "MaxSpSe", data=x,conf.level = 0.99)[1]$MaxSpSe$Global$measures.acc$AUC
# We used AUC for this manuscript. However the optimal.cutpoints function also provides a list with all possible cutoffs, sensitivity, specificity, predictive Values, the sample size for both healthy and diseased populations among other metrics.
# setDT(dataset)[,auc_SpSe(.SD), by=Algorithm]->measures_SpSe_dataset
#
# data.frame(measures_SpSe_dataset)->dauc1
# dauc1%>% group_by(Algorithm) %>% summarize(Max_AUC= nth(V1,3),Min_AUC=nth(V1,2),Mean_AUC= nth(V1,1))->dauc_Mean_CI_dataset
# The AUCs from the different datasets were compiled together in Supplemental data 15. Import this file to generate Figures 3A, 3B and Supplemental Figure 2.
#Import the compiled AUC file (Supplemental_data_15)
# Re-level the Algorithm column of `data` by mean AUC within a reference dataset.
#
# data      : data frame with columns Algorithm, geneclass and AUC.
# reference : geneclass value whose rows define the ordering.
# Returns a factor with one entry per row of `data`. Each row keeps its own
# algorithm label; only the level order changes (ascending mean AUC in the
# reference rows). Algorithms absent from the reference rows are appended after
# the ranked ones.
#
# Why not assign the reordered reference subset directly to data$Algorithm:
# that vector is shorter than the column, so `$<-` recycles it and overwrites
# every row's label. The labels are then only right if the file happens to be
# laid out in identical per-dataset blocks.
relevel_by_reference_auc <- function(data, reference = "ClinVar Status *") {
  algorithm <- as.character(data$Algorithm)
  ref_rows <- data[data$geneclass %in% reference, , drop = FALSE]
  if (nrow(ref_rows) == 0) {
    stop("No rows with geneclass '", reference, "' to order algorithms by.",
         call. = FALSE)
  }
  ranked <- reorder(
    factor(as.character(ref_rows$Algorithm)),
    ref_rows$AUC,
    FUN = mean
  )
  factor(algorithm, levels = union(levels(ranked), unique(algorithm)))
}

# Import the compiled AUC table and flag rows above/below the 0.9 threshold.
AUC <- read.delim("~/Your_Path/Supplemental_data_15.txt")
AUC <- AUC %>%
  mutate(Performance_AUC = ifelse(AUC > 0.9, "AUC>0.9", "AUC<0.9"))

# Label algorithms whose name matches the pattern below as metapredictors.
AUC_fin <- AUC %>%
  mutate(
    Type = ifelse(
      grepl("CADD|Condel|Mcap|REVEL|Meta|Eigen", Algorithm),
      "Metapredictor",
      "other"
    )
  )

# Datasets for addressing type 1 circularity.
AUC_fin_type1 <- AUC_fin %>%
  filter(grepl("ClinVar|trainset|REVEL|predict", geneclass))

# Datasets with different variant properties.
AUC_fin_bio <- AUC_fin %>%
  filter(grepl("Status|Dominant|Recessive|TSG|Oncogenes|constraint|Exclude", geneclass))

# Datasets for addressing type 2 circularity.
AUC_fin_type2 <- AUC_fin %>%
  filter(grepl("ClinVar|varibench|balanced", geneclass)) %>%
  filter(!grepl("new", geneclass)) %>%
  filter(!grepl("Sep2016|Dec2016", geneclass))

# Order algorithms by their mean AUC on "ClinVar Status *" and fix the facet
# order of the datasets. The level strings (including trailing spaces) must
# match the geneclass values in the input file exactly.
AUC_fin_bio$Algorithm <- relevel_by_reference_auc(AUC_fin_bio)
AUC_fin_bio$geneclass <- factor(
  AUC_fin_bio$geneclass,
  levels = c("ClinVar Status *", "ClinVar Status **", "Exclude LP and LB", "Dominant", "Recessive", "Oncogenes", "TSG", "High constraint ", "Low constraint ", "Medium constraint ")
)

AUC_fin_type1$Algorithm <- relevel_by_reference_auc(AUC_fin_type1)
AUC_fin_type1$geneclass <- factor(
  AUC_fin_type1$geneclass,
  levels = c("ClinVar Status *", "ClinVar Status **", "ClinVar Oct2015 to Dec2016", "ClinVar Sep2016 to Mar2017", "predictSNPselected", "REVEL_testset", "No MetaSVM/LR trainset", "No MetaSVM/LR trainset **")
)

AUC_fin_type2$Algorithm <- relevel_by_reference_auc(AUC_fin_type2)
AUC_fin_type2$geneclass <- factor(
  AUC_fin_type2$geneclass,
  levels = c("ClinVar Status *", "ClinVar Status **", "varibenchselected", "balanced *", "balanced **")
)

# Highlight the ensemble predictors (also referred to as Metapredictors) by
# coloring them green. One row per algorithm, in factor-level order, so the
# colour vector lines up with the algorithm axis.
alg <- AUC_fin_type1 %>%
  group_by(Algorithm, Type) %>%
  summarize()
colvec <- ifelse(alg$Type == "Metapredictor", "green", "black")
# Plot Figure 3A
# Draw the AUC-with-interval panel plot shared by Figures 3A, 3B and
# Additional Figure 3.
#
# data          : data frame with columns Algorithm, AUC, Min_AUC, Max_AUC,
#                 Performance_AUC and geneclass.
# label_colours : colours for the algorithm axis labels, one per algorithm in
#                 axis order.
# Returns the ggplot object: points with error bars per algorithm (flipped so
# algorithms run down the axis), one facet column per geneclass, and a dashed
# red reference line at AUC = 0.9.
plot_auc_by_dataset <- function(data, label_colours) {
  ggplot(
    data,
    aes(Algorithm, y = AUC, ymin = Min_AUC, ymax = Max_AUC, color = Performance_AUC)
  ) +
    geom_errorbar(width = .5) +
    geom_point(size = 2) +
    geom_hline(yintercept = 0.9, color = "red", linetype = "longdash") +
    scale_color_manual(values = c("salmon", "blue")) +
    coord_flip() +
    facet_grid(. ~ geneclass) +
    ylab("AUC") +
    ylim(0.36, 1) +
    theme(
      legend.text = element_text(size = 20, face = "bold"),
      legend.title = element_text(size = 20, face = "bold"),
      legend.position = "top",
      panel.border = element_rect(colour = "black", size = 2),
      axis.title.y = element_text(size = 20),
      axis.title.x = element_text(size = 20),
      axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5),
      axis.text.y = element_text(size = 20, colour = label_colours),
      strip.text.x = element_text(size = 14, face = "bold")
    )
}

plot_auc_by_dataset(AUC_fin_bio, colvec)
# Plot Figure 3B
plot_auc_by_dataset(AUC_fin_type1, colvec)
# Plot Additional Figure 3
plot_auc_by_dataset(AUC_fin_type2, colvec)
# Plot Additional Figure 2
#Data : AUC_fin from above.
# Order algorithms by descending mean AUC on the "ClinVar Status *" rows.
# The existing column is re-levelled rather than overwritten: assigning the
# (shorter) reordered reference subset to AUC_fin$Algorithm would be recycled
# over all rows and would only give correct labels if the file is laid out in
# identical per-dataset blocks. Algorithms absent from the reference rows are
# appended after the ranked ones.
auc_ref_rows <- AUC_fin[AUC_fin$geneclass %in% "ClinVar Status *", , drop = FALSE]
auc_ranked <- reorder(
  factor(as.character(auc_ref_rows$Algorithm)),
  auc_ref_rows$AUC,
  FUN = function(x) -mean(x)
)
AUC_fin$Algorithm <- factor(
  as.character(AUC_fin$Algorithm),
  levels = union(levels(auc_ranked), unique(as.character(AUC_fin$Algorithm)))
)

# One colour per dataset (geneclass).
# NOTE(review): scale_color_manual needs at least as many values as there are
# distinct geneclass values in AUC_fin; 18 are supplied here - confirm against
# the data.
my_col <- c("#CA0011", "#C26300", "#AEFA1E", "#25F989", "#0B4627", "#53D61F", "#377717", "#1B4F0D", "#D1C361", "#1A20CB", "red", "salmon", "brown", "#e1e11e", "gray", "black", "purple", "maroon")

# One boxplot per algorithm (each in its own facet) across all datasets, with
# the individual dataset AUCs jittered on top and a dashed red line at 0.8.
ggplot(AUC_fin, aes(x = Algorithm, y = AUC)) +
  geom_boxplot(outlier.shape = NA) +
  geom_hline(yintercept = 0.8, color = "red", linetype = "longdash") +
  geom_jitter(aes(color = geneclass), size = 3, alpha = 0.6, width = 0.2) +
  facet_grid(. ~ Algorithm, scales = "free") +
  scale_color_manual(values = my_col) +
  scale_y_continuous(expand = c(0, .01)) +
  ylab("AUC") +
  theme(
    legend.text = element_text(size = 15, face = "bold"),
    legend.title = element_text(size = 15, face = "bold"),
    panel.border = element_rect(colour = "black", size = 2),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 20),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 14, face = "bold")
  )