Description

The following code will generate the figures and tables in the manuscript titled “Evaluation of in silico algorithms for use with ACMG/AMP clinical variant interpretation guidelines” by Rajarshi Ghosh, Ninad Oak and Sharon E. Plon using the supplemental data of the manuscript. Libraries required:

suppressWarnings(suppressMessages(library(tidyverse)))#version 1.1.1
theme_set(theme_bw())
library(formattable)#version 0.2.0.1
library(pvclust)#version 2.0-0
suppressWarnings(suppressMessages(library(dendextend)))#version 1.4.0
suppressWarnings(suppressMessages(library(pROC)))#version 1.9.1
library(OptimalCutpoints)#version 1.1-3
suppressWarnings(suppressMessages(library(data.table)))#version 1.10.4

Figure 1

We generated a matrix of binary predictions (pathogenic or benign) for 14819 ClinVar variants with scores from 18 algorithms. The thresholds of pathogenicity were publicly available or were inferred as noted in Supplemental Table 1. Dataset used : Supplemental_data_1.

Import the Supplemental_data_1.txt file. This file contains 14819 variants in Clinvar annotated with scores and predictions using thresholds specified in Additional Table 1.

# Load the annotated ClinVar variant table; character columns are kept as strings rather than factors.
df<-read.delim("~/Your_Path/Supplemental_data_1.txt", stringsAsFactors=FALSE) # We have updated Supplemental_data_1.txt with the correct LRT predictions. The rest of the file is unchanged.

# Keep the identifier/annotation columns plus every "*_pred" prediction column.
preds <- select(df, id, clinical_significance:hgvs_p, ends_with("_pred"))

# Names of the prediction columns (positions 6 to 23 of `preds`).
vars <- names(preds)[6:23]

# Recode the predictions in two passes: "D" (Damaging) becomes 1, then
# "T" (Tolerant) becomes 0. gsub() returns character, so the recoded columns
# hold the strings "1" / "0".
preds_all_var_bin <- preds %>%
  mutate_each_(funs(gsub("D", 1, .)), vars) %>%
  mutate_each_(funs(gsub("T", 0, .)), vars)

# Rows with no missing value in clinical_significance (column 2) or in any of
# the prediction columns (6 to 23).
good_preds_all_var_bin <- complete.cases(preds_all_var_bin[, c(2, 6:23)])

# Variants without any missing data; this will generate a dataset of 8386 variants.
all_alg_ok_preds <- preds_all_var_bin[good_preds_all_var_bin, ]

Obtain the proportion of algorithms that are in complete concordance (dataset with 14819 variants)

# dam: per variant, the number of prediction columns (6 to 23) whose value
# contains "1" (damaging calls).
damaging <- mutate(
  preds_all_var_bin,
  dam = apply(
    preds_all_var_bin[, 6:23], 1,
    function(calls) length(grep("1", as.factor(calls)))
  )
)

# tol: per variant, the number of prediction columns whose value contains "0"
# (tolerant calls). prop.concordant is 1 when no algorithm called the variant
# damaging, otherwise the damaging fraction among the non-missing calls; a
# value of 1 marks complete concordance (both true and false concordances).
# NOTE(review): a variant with no calls at all (dam == 0 and tol == 0) also
# gets prop.concordant == 1 -- confirm this is intended.
tol_dam_algorithms <- damaging %>%
  mutate(
    tol = apply(
      preds_all_var_bin[, 6:23], 1,
      function(calls) length(grep("0", as.factor(calls)))
    )
  ) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

Figure 1A: Plot the heatmap for concordance among algorithms with all variants ClinVar 1* or greater.

# Figure 1A heatmap: one tile per (variant, algorithm), filled by the binary
# prediction, with one facet row per ClinVar assertion.

# Replace the names of the 18 prediction columns (6 to 23) with display names.
colnames(tol_dam_algorithms)[6:23] <- c(
  "LRT", "MutationTaster", "PROVEAN", "SIFT", "FATHMM", "Polyphen2", "VEST3",
  "Mcap", "Mutpred", "Condel", "REVEL", "MutationAssessor", "MetaSVM",
  "MetaLR", "CADD", "DANN", "Eigen", "Genocanyon"
)

# Long format: one row per variant/algorithm pair. ClinVar_num is the number of
# characters in the assertion label and is only used to order the variant ids.
final_all_var_opt <- tol_dam_algorithms %>%
  gather(algorithm, prediction, c(LRT:Genocanyon)) %>%
  mutate(ClinVar_num = nchar(as.character(clinical_significance)))
final_all_var_opt$id <- reorder(final_all_var_opt$id, final_all_var_opt$ClinVar_num)
final_all_var_opt$clinical_significance <- factor(
  final_all_var_opt$clinical_significance,
  levels = c("Pathogenic", "Benign")
)
# Fixed left-to-right order of the algorithms on the x axis.
final_all_var_opt$algorithm <- factor(
  final_all_var_opt$algorithm,
  levels = c(
    "SIFT", "Polyphen2", "Mutpred", "PROVEAN", "MutationTaster", "CADD",
    "Condel", "Mcap", "MetaLR", "MetaSVM", "REVEL", "Eigen", "DANN",
    "FATHMM", "LRT", "MutationAssessor", "VEST3", "Genocanyon"
  )
)

p <- ggplot(final_all_var_opt, aes(y = id, x = algorithm, fill = as.factor(prediction))) +
  geom_tile() +
  scale_fill_manual(values = c("light green", "orange"), labels = c("Tolerant", "Damaging")) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))

# `scales` is spelled out in full (the original relied on partial matching of
# `scale`). The legend is hidden; the earlier, overridden
# legend.position = c(0.7, 0.5) setting has been dropped.
p +
  facet_grid(clinical_significance ~ ., scales = "free") +
  ylab("Variants") +
  xlab("Algorithms") +
  guides(fill = guide_legend(title = "Algorithm Predictions")) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    axis.text.x = element_text(angle = 90, size = 18, hjust = 1, vjust = .5),
    legend.text = element_text(colour = "black", size = 16),
    legend.title = element_text(colour = "black", size = 16),
    strip.text.y = element_text(size = 20, face = "bold"),
    legend.position = "none"
  )

Figure 1B: Plot the heatmap for concordance among algorithms with all variants ClinVar 2* or greater.

# Figure 1B heatmap: same layout as Figure 1A, restricted to ClinVar 2* or greater.

# Filter the ClinVar 1* or above dataset, removing the variants whose review
# status contains "single" ("criteria provided, single submitter").
tol_dam_algorithms_2star <- tol_dam_algorithms %>%
  filter(!grepl("single", review_status))

# Long format: one row per variant/algorithm pair. ClinVar_num is the number of
# characters in the assertion label and is only used to order the variant ids.
final_all_var_opt <- tol_dam_algorithms_2star %>%
  gather(algorithm, prediction, c(LRT:Genocanyon)) %>%
  mutate(ClinVar_num = nchar(as.character(clinical_significance)))
final_all_var_opt$id <- reorder(final_all_var_opt$id, final_all_var_opt$ClinVar_num)
final_all_var_opt$clinical_significance <- factor(
  final_all_var_opt$clinical_significance,
  levels = c("Pathogenic", "Benign")
)
# Fixed left-to-right order of the algorithms on the x axis.
final_all_var_opt$algorithm <- factor(
  final_all_var_opt$algorithm,
  levels = c(
    "SIFT", "Polyphen2", "Mutpred", "PROVEAN", "MutationTaster", "CADD",
    "Condel", "Mcap", "MetaLR", "MetaSVM", "REVEL", "Eigen", "DANN",
    "FATHMM", "LRT", "MutationAssessor", "VEST3", "Genocanyon"
  )
)

p <- ggplot(final_all_var_opt, aes(y = id, x = algorithm, fill = as.factor(prediction))) +
  geom_tile() +
  scale_fill_manual(values = c("light green", "orange"), labels = c("Tolerant", "Damaging")) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))

# `scales` is spelled out in full (the original relied on partial matching of
# `scale`). The legend is hidden; the earlier, overridden
# legend.position = c(0.7, 0.5) setting has been dropped.
p +
  facet_grid(clinical_significance ~ ., scales = "free") +
  ylab("Variants") +
  xlab("Algorithms") +
  guides(fill = guide_legend(title = "Algorithm Predictions")) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    axis.text.x = element_text(angle = 90, size = 18, hjust = 1, vjust = .5),
    legend.text = element_text(colour = "black", size = 16),
    legend.title = element_text(colour = "black", size = 16),
    strip.text.y = element_text(size = 20, face = "bold"),
    legend.position = "none"
  )

Table 1

Generate data for Table 1: rows 1 and 2.

# Table 1, rows 1 and 2: all 18 algorithms, ClinVar 1* or above.
# Per ClinVar assertion: number of variants and number with prop.concordant == 1.
final_table <- tol_dam_algorithms %>%
  group_by(clinical_significance) %>%
  summarise(count = n(), complete_concordance_num = length(which(prop.concordant == 1)))

# Obtain the variants for which all algorithms' assertion was opposite to the
# assertion in ClinVar: Pathogenic with no damaging call, Benign with no
# tolerant call.
falsep <- length(which(tol_dam_algorithms$dam == 0 & tol_dam_algorithms$clinical_significance == "Pathogenic"))
falseb <- length(which(tol_dam_algorithms$tol == 0 & tol_dam_algorithms$clinical_significance == "Benign"))

# Align the false-concordance counts with the summary rows by assertion label
# rather than by row position, so a change in group order cannot attach a
# count to the wrong class.
falseconc <- unname(
  c(Benign = falseb, Pathogenic = falsep)[as.character(final_table$clinical_significance)]
)

# Net concordance excludes the false concordances; percentages are relative to
# the number of variants in each class.
final_table_all <- cbind(final_table, falseconc) %>%
  mutate(Net = complete_concordance_num - falseconc) %>%
  mutate(Percent_concordant = round((Net / count) * 100, digits = 2)) %>%
  mutate(Percent_false_concordant = round((falseconc / count) * 100, digits = 2)) %>%
  select(clinical_significance, count, Percent_concordant, falseconc, Percent_false_concordant)
colnames(final_table_all) <- c("Clinical Significance", "Total variants", "Concordance(%)", "False Concordance", "False Concordance(%)")

# This table can also be drawn separately as follows:
formattable(final_table_all, list("Clinical Significance" = formatter("span", style = function(x) ifelse(x == "Benign", "color:green", "color:red"))))

Clinical Significance	Total variants	Concordance(%)	False Concordance	False Concordance(%)
Benign	7346	5.20	57	0.78
Pathogenic	7473	39.21	2	0.03

Generate data for Table 1: rows 3 and 4.

# Table 1, rows 3 and 4: all 18 algorithms, ClinVar 2* or above.
# Per ClinVar assertion: number of variants and number with prop.concordant == 1.
final_table <- tol_dam_algorithms_2star %>%
  group_by(clinical_significance) %>%
  summarise(count = n(), complete_concordance_num = length(which(prop.concordant == 1)))

# False concordance: Pathogenic variants with no damaging call, Benign variants
# with no tolerant call.
falsep <- length(which(tol_dam_algorithms_2star$dam == 0 & tol_dam_algorithms_2star$clinical_significance == "Pathogenic"))
falseb <- length(which(tol_dam_algorithms_2star$tol == 0 & tol_dam_algorithms_2star$clinical_significance == "Benign"))

# Align the false-concordance counts with the summary rows by assertion label
# rather than by row position, so a change in group order cannot attach a
# count to the wrong class.
falseconc <- unname(
  c(Benign = falseb, Pathogenic = falsep)[as.character(final_table$clinical_significance)]
)

# Net concordance excludes the false concordances; percentages are relative to
# the number of variants in each class.
final_table_twostars <- cbind(final_table, falseconc) %>%
  mutate(Net = complete_concordance_num - falseconc) %>%
  mutate(Percent_concordant = round((Net / count) * 100, digits = 2)) %>%
  mutate(Percent_false_concordant = round((falseconc / count) * 100, 2)) %>%
  select(clinical_significance, count, Percent_concordant, falseconc, Percent_false_concordant)
colnames(final_table_twostars) <- c("Clinical Significance", "Total variants", "Concordance(%)", "False Concordance", "False Concordance(%)")

# This table can also be drawn separately as follows:
formattable(final_table_twostars, list("Clinical Significance" = formatter("span", style = function(x) ifelse(x == "Benign", "color:green", "color:red"))))

Clinical Significance	Total variants	Concordance(%)	False Concordance	False Concordance(%)
Benign	1914	4.49	12	0.63
Pathogenic	1052	46.77	0	0.00

Generate data for Table 1: rows 5 and 6. Five commonly used algorithms as identified by literature search: CADD, Polyphen-2, SIFT, Provean, Mutationtaster

# Table 1, rows 5 and 6: five commonly used algorithms, ClinVar 1* or above.
# Annotation columns (id to hgvs_p) plus the five selected prediction columns,
# which land in positions 6 to 10.
df_top5 <- preds_all_var_bin %>%
  select(id:hgvs_p, SIFT_pred, Polyphen2_pred, PROVEAN_pred, CADD_pred, MutationTaster_pred)

# dam / tol: per variant, number of the five calls containing "1" / "0".
damaging <- mutate(
  df_top5,
  dam = apply(df_top5[, 6:10], 1, function(x) length(grep("1", as.factor(x))))
)
tol_dam_algorithms_top5 <- damaging %>%
  mutate(tol = apply(df_top5[, 6:10], 1, function(x) length(grep("0", as.factor(x))))) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

# Per ClinVar assertion: number of variants and number with prop.concordant == 1.
final_table_top5 <- tol_dam_algorithms_top5 %>%
  group_by(clinical_significance) %>%
  summarise(count = n(), complete_concordance_num = length(which(prop.concordant == 1)))

# False concordance: Pathogenic variants with no damaging call, Benign variants
# with no tolerant call.
falsep <- length(which(tol_dam_algorithms_top5$dam == 0 & tol_dam_algorithms_top5$clinical_significance == "Pathogenic"))
falseb <- length(which(tol_dam_algorithms_top5$tol == 0 & tol_dam_algorithms_top5$clinical_significance == "Benign"))

# Align the false-concordance counts with the summary rows by assertion label
# rather than by row position, so a change in group order cannot attach a
# count to the wrong class.
falseconc <- unname(
  c(Benign = falseb, Pathogenic = falsep)[as.character(final_table_top5$clinical_significance)]
)

# Net concordance excludes the false concordances; percentages are relative to
# the number of variants in each class.
final_table_top5 <- cbind(final_table_top5, falseconc) %>%
  mutate(Net = complete_concordance_num - falseconc) %>%
  mutate(Percent_concordant = round((Net / count) * 100, digits = 2)) %>%
  mutate(Percent_false_concordant = round((falseconc / count) * 100, 2)) %>%
  select(clinical_significance, count, Percent_concordant, falseconc, Percent_false_concordant)
colnames(final_table_top5) <- c("Clinical Significance", "Total variants", "Concordance(%)", "False Concordance", "False Concordance(%)")

# This table can also be drawn separately as follows:
formattable(final_table_top5, list("Clinical Significance" = formatter("span", style = function(x) ifelse(x == "Benign", "color:green", "color:red"))))

Clinical Significance	Total variants	Concordance(%)	False Concordance	False Concordance(%)
Benign	7346	33.54	815	11.09
Pathogenic	7473	79.00	68	0.91

Generate data for Table 1: rows 7 and 8. Three commonly used algorithms as identified by literature search: CADD, Polyphen-2, SIFT

# Table 1, rows 7 and 8: three commonly used algorithms, ClinVar 1* or above.
# Annotation columns (id to hgvs_p) plus the three selected prediction columns,
# which land in positions 6 to 8.
df_top3 <- preds_all_var_bin %>%
  select(id:hgvs_p, SIFT_pred, Polyphen2_pred, CADD_pred)

# dam / tol: per variant, number of the three calls containing "1" / "0".
damaging <- mutate(
  df_top3,
  dam = apply(df_top3[, 6:8], 1, function(x) length(grep("1", as.factor(x))))
)
tol_dam_algorithms_top3 <- damaging %>%
  mutate(tol = apply(df_top3[, 6:8], 1, function(x) length(grep("0", as.factor(x))))) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

# Per ClinVar assertion: number of variants and number with prop.concordant == 1.
final_table_top3 <- tol_dam_algorithms_top3 %>%
  group_by(clinical_significance) %>%
  summarise(count = n(), complete_concordance_num = length(which(prop.concordant == 1)))

# False concordance: Pathogenic variants with no damaging call, Benign variants
# with no tolerant call.
falsep <- length(which(tol_dam_algorithms_top3$dam == 0 & tol_dam_algorithms_top3$clinical_significance == "Pathogenic"))
falseb <- length(which(tol_dam_algorithms_top3$tol == 0 & tol_dam_algorithms_top3$clinical_significance == "Benign"))

# Align the false-concordance counts with the summary rows by assertion label
# rather than by row position, so a change in group order cannot attach a
# count to the wrong class.
falseconc <- unname(
  c(Benign = falseb, Pathogenic = falsep)[as.character(final_table_top3$clinical_significance)]
)

# Net concordance excludes the false concordances; percentages are relative to
# the number of variants in each class.
final_table_top3 <- cbind(final_table_top3, falseconc) %>%
  mutate(Net = complete_concordance_num - falseconc) %>%
  mutate(Percent_concordant = round((Net / count) * 100, digits = 2)) %>%
  mutate(Percent_false_concordant = round((falseconc / count) * 100, 2)) %>%
  select(clinical_significance, count, Percent_concordant, falseconc, Percent_false_concordant)
colnames(final_table_top3) <- c("Clinical Significance", "Total variants", "Concordance(%)", "False Concordance", "False Concordance(%)")

# This table can also be drawn separately as follows:
formattable(final_table_top3, list("Clinical Significance" = formatter("span", style = function(x) ifelse(x == "Benign", "color:green", "color:red"))))

Clinical Significance	Total variants	Concordance(%)	False Concordance	False Concordance(%)
Benign	7346	46.17	1340	18.24
Pathogenic	7473	84.87	156	2.09

Table 1: Compiling all the different combinations of algorithms in a single table. This is the table that is plotted in the manuscript.

# Tag each partial table with the variant set and the algorithm set it covers.
final_table_all <- mutate(
  final_table_all,
  Variants = "ClinVar * or above", Algorithms = "18"
)
final_table_twostars <- mutate(
  final_table_twostars,
  Variants = "ClinVar ** or above", Algorithms = "18"
)
final_table_top5 <- mutate(
  final_table_top5,
  Variants = "ClinVar * or above", Algorithms = "Polyphen2;SIFT;CADD;PROVEAN;MTaster"
)
final_table_top3 <- mutate(
  final_table_top3,
  Variants = "ClinVar * or above", Algorithms = "Polyphen2;SIFT;CADD"
)

# Stack the four partial tables and relabel the first column.
final_table_fin <- rbind(
  final_table_all,
  final_table_twostars,
  final_table_top5,
  final_table_top3
)
colnames(final_table_fin)[1] <- "ClinVar Assertion"

# Draw the table: "Benign" in green, every other assertion in red.
formattable(
  final_table_fin,
  list(
    "ClinVar Assertion" = formatter(
      "span",
      style = function(assertion) ifelse(assertion == "Benign", "color:green", "color:red")
    )
  )
)

ClinVar Assertion	Total variants	Concordance(%)	False Concordance	False Concordance(%)	Variants	Algorithms
Benign	7346	5.20	57	0.78	ClinVar * or above	18
Pathogenic	7473	39.21	2	0.03	ClinVar * or above	18
Benign	1914	4.49	12	0.63	ClinVar ** or above	18
Pathogenic	1052	46.77	0	0.00	ClinVar ** or above	18
Benign	7346	33.54	815	11.09	ClinVar * or above	Polyphen2;SIFT;CADD;PROVEAN;MTaster
Pathogenic	7473	79.00	68	0.91	ClinVar * or above	Polyphen2;SIFT;CADD;PROVEAN;MTaster
Benign	7346	46.17	1340	18.24	ClinVar * or above	Polyphen2;SIFT;CADD
Pathogenic	7473	84.87	156	2.09	ClinVar * or above	Polyphen2;SIFT;CADD

Obtain the proportion of algorithms that are in complete concordance (for variants without missing data)

# dam: per variant, the number of prediction columns (6 to 23) whose value
# contains "1" (damaging calls), on the dataset without missing data.
damaging <- mutate(
  all_alg_ok_preds,
  dam = apply(
    all_alg_ok_preds[, 6:23], 1,
    function(calls) length(grep("1", as.factor(calls)))
  )
)

# tol: per variant, the number of prediction columns whose value contains "0"
# (tolerant calls). prop.concordant is 1 when no algorithm called the variant
# damaging, otherwise the damaging fraction of the calls; a value of 1 marks
# complete concordance (both true and false concordances).
tol_dam_algorithms_s1 <- damaging %>%
  mutate(
    tol = apply(
      all_alg_ok_preds[, 6:23], 1,
      function(calls) length(grep("0", as.factor(calls)))
    )
  ) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

Additional Figure 1A: Plot the heatmap for concordance among algorithms with all variants ClinVar 1* or greater and without any missing data.

# Additional Figure 1A heatmap: one tile per (variant, algorithm), filled by
# the binary prediction, for variants without any missing data.

# Replace the names of the 18 prediction columns (6 to 23) with display names.
colnames(tol_dam_algorithms_s1)[6:23] <- c(
  "LRT", "MutationTaster", "PROVEAN", "SIFT", "FATHMM", "Polyphen2", "VEST3",
  "Mcap", "Mutpred", "Condel", "REVEL", "MutationAssessor", "MetaSVM",
  "MetaLR", "CADD", "DANN", "Eigen", "Genocanyon"
)

# Long format: one row per variant/algorithm pair. ClinVar_num is the number of
# characters in the assertion label and is only used to order the variant ids.
final_all_var_opt <- tol_dam_algorithms_s1 %>%
  gather(algorithm, prediction, c(LRT:Genocanyon)) %>%
  mutate(ClinVar_num = nchar(as.character(clinical_significance)))
final_all_var_opt$id <- reorder(final_all_var_opt$id, final_all_var_opt$ClinVar_num)
final_all_var_opt$clinical_significance <- factor(
  final_all_var_opt$clinical_significance,
  levels = c("Pathogenic", "Benign")
)
# Fixed left-to-right order of the algorithms on the x axis.
final_all_var_opt$algorithm <- factor(
  final_all_var_opt$algorithm,
  levels = c(
    "SIFT", "Polyphen2", "Mutpred", "PROVEAN", "MutationTaster", "CADD",
    "Condel", "Mcap", "MetaLR", "MetaSVM", "REVEL", "Eigen", "DANN",
    "FATHMM", "LRT", "MutationAssessor", "VEST3", "Genocanyon"
  )
)

p <- ggplot(final_all_var_opt, aes(y = id, x = algorithm, fill = as.factor(prediction))) +
  geom_tile() +
  scale_fill_manual(values = c("light green", "orange"), labels = c("Tolerant", "Damaging")) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))

# `scales` is spelled out in full (the original relied on partial matching of
# `scale`). The legend is hidden; the earlier, overridden
# legend.position = c(0.7, 0.5) setting has been dropped.
p +
  facet_grid(clinical_significance ~ ., scales = "free") +
  ylab("Variants") +
  xlab("Algorithms") +
  guides(fill = guide_legend(title = "Algorithm Predictions")) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    axis.text.x = element_text(angle = 90, size = 18, hjust = 1, vjust = .5),
    legend.text = element_text(colour = "black", size = 16),
    legend.title = element_text(colour = "black", size = 16),
    strip.text.y = element_text(size = 20, face = "bold"),
    legend.position = "none"
  )

Additional Figure 1B: Plot the heatmap for concordance among algorithms with variants ClinVar 2* or greater and without any missing data.

# Additional Figure 1B heatmap: same layout as Additional Figure 1A, restricted
# to ClinVar 2* or greater (variants without any missing data).

# Filter the ClinVar 1* or above dataset, removing the variants whose review
# status contains "single" ("criteria provided, single submitter").
tol_dam_algorithms_2star_s1 <- tol_dam_algorithms_s1 %>%
  filter(!grepl("single", review_status))

# Long format: one row per variant/algorithm pair. ClinVar_num is the number of
# characters in the assertion label and is only used to order the variant ids.
final_all_var_opt <- tol_dam_algorithms_2star_s1 %>%
  gather(algorithm, prediction, c(LRT:Genocanyon)) %>%
  mutate(ClinVar_num = nchar(as.character(clinical_significance)))
final_all_var_opt$id <- reorder(final_all_var_opt$id, final_all_var_opt$ClinVar_num)
final_all_var_opt$clinical_significance <- factor(
  final_all_var_opt$clinical_significance,
  levels = c("Pathogenic", "Benign")
)
# Fixed left-to-right order of the algorithms on the x axis.
final_all_var_opt$algorithm <- factor(
  final_all_var_opt$algorithm,
  levels = c(
    "SIFT", "Polyphen2", "Mutpred", "PROVEAN", "MutationTaster", "CADD",
    "Condel", "Mcap", "MetaLR", "MetaSVM", "REVEL", "Eigen", "DANN",
    "FATHMM", "LRT", "MutationAssessor", "VEST3", "Genocanyon"
  )
)

p <- ggplot(final_all_var_opt, aes(y = id, x = algorithm, fill = as.factor(prediction))) +
  geom_tile() +
  scale_fill_manual(values = c("light green", "orange"), labels = c("Tolerant", "Damaging")) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))

# `scales` is spelled out in full (the original relied on partial matching of
# `scale`). The legend is hidden; the earlier, overridden
# legend.position = c(0.7, 0.5) setting has been dropped.
p +
  facet_grid(clinical_significance ~ ., scales = "free") +
  ylab("Variants") +
  xlab("Algorithms") +
  guides(fill = guide_legend(title = "Algorithm Predictions")) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    axis.text.x = element_text(angle = 90, size = 18, hjust = 1, vjust = .5),
    legend.text = element_text(colour = "black", size = 16),
    legend.title = element_text(colour = "black", size = 16),
    strip.text.y = element_text(size = 20, face = "bold"),
    legend.position = "none"
  )

Additional Table 1

Generate data for Additional Table 1: rows 1 and 2.

# Additional Table 1, rows 1 and 2: all 18 algorithms, ClinVar 1* or above,
# variants without missing data.
# Per ClinVar assertion: number of variants and number with prop.concordant == 1.
final_table <- tol_dam_algorithms_s1 %>%
  group_by(clinical_significance) %>%
  summarise(count = n(), complete_concordance_num = length(which(prop.concordant == 1)))

# Obtain the variants for which all algorithms' assertion was opposite to the
# assertion in ClinVar: Pathogenic with no damaging call, Benign with no
# tolerant call.
falsep <- length(which(tol_dam_algorithms_s1$dam == 0 & tol_dam_algorithms_s1$clinical_significance == "Pathogenic"))
falseb <- length(which(tol_dam_algorithms_s1$tol == 0 & tol_dam_algorithms_s1$clinical_significance == "Benign"))

# Align the false-concordance counts with the summary rows by assertion label
# rather than by row position, so a change in group order cannot attach a
# count to the wrong class.
falseconc <- unname(
  c(Benign = falseb, Pathogenic = falsep)[as.character(final_table$clinical_significance)]
)

# Net concordance excludes the false concordances; percentages are relative to
# the number of variants in each class.
final_table_all <- cbind(final_table, falseconc) %>%
  mutate(Net = complete_concordance_num - falseconc) %>%
  mutate(Percent_concordant = round((Net / count) * 100, digits = 2)) %>%
  mutate(Percent_false_concordant = round((falseconc / count) * 100, digits = 2)) %>%
  select(clinical_significance, count, Percent_concordant, falseconc, Percent_false_concordant)
colnames(final_table_all) <- c("Clinical Significance", "Total variants", "Concordance(%)", "False Concordance", "False Concordance(%)")

# This table can also be drawn separately as follows:
formattable(final_table_all, list("Clinical Significance" = formatter("span", style = function(x) ifelse(x == "Benign", "color:green", "color:red"))))

Clinical Significance	Total variants	Concordance(%)	False Concordance	False Concordance(%)
Benign	2555	3.21	40	1.57
Pathogenic	5831	41.47	1	0.02

Generate data for Additional Table 1: rows 3 and 4.

# Additional Table 1, rows 3 and 4: all 18 algorithms, ClinVar 2* or above,
# variants without missing data.
# Per ClinVar assertion: number of variants and number with prop.concordant == 1.
final_table <- tol_dam_algorithms_2star_s1 %>%
  group_by(clinical_significance) %>%
  summarise(count = n(), complete_concordance_num = length(which(prop.concordant == 1)))

# False concordance: Pathogenic variants with no damaging call, Benign variants
# with no tolerant call.
falsep <- length(which(tol_dam_algorithms_2star_s1$dam == 0 & tol_dam_algorithms_2star_s1$clinical_significance == "Pathogenic"))
falseb <- length(which(tol_dam_algorithms_2star_s1$tol == 0 & tol_dam_algorithms_2star_s1$clinical_significance == "Benign"))

# Align the false-concordance counts with the summary rows by assertion label
# rather than by row position, so a change in group order cannot attach a
# count to the wrong class.
falseconc <- unname(
  c(Benign = falseb, Pathogenic = falsep)[as.character(final_table$clinical_significance)]
)

# Net concordance excludes the false concordances; percentages are relative to
# the number of variants in each class.
final_table_twostars <- cbind(final_table, falseconc) %>%
  mutate(Net = complete_concordance_num - falseconc) %>%
  mutate(Percent_concordant = round((Net / count) * 100, digits = 2)) %>%
  mutate(Percent_false_concordant = round((falseconc / count) * 100, 2)) %>%
  select(clinical_significance, count, Percent_concordant, falseconc, Percent_false_concordant)
colnames(final_table_twostars) <- c("Clinical Significance", "Total variants", "Concordance(%)", "False Concordance", "False Concordance(%)")

# This table can also be drawn separately as follows:
formattable(final_table_twostars, list("Clinical Significance" = formatter("span", style = function(x) ifelse(x == "Benign", "color:green", "color:red"))))

Clinical Significance	Total variants	Concordance(%)	False Concordance	False Concordance(%)
Benign	470	2.13	11	2.34
Pathogenic	848	50.59	0	0.00

Generate data for Additional Table 1: rows 5 and 6. Five commonly used algorithms as identified by literature search: CADD, Polyphen-2, SIFT, Provean, Mutationtaster

# Additional Table 1, rows 5 and 6: five commonly used algorithms, ClinVar 1*
# or above, variants without missing data.
# Annotation columns (id to hgvs_p) plus the five selected prediction columns,
# which land in positions 6 to 10.
df_top5 <- all_alg_ok_preds %>%
  select(id:hgvs_p, SIFT_pred, Polyphen2_pred, PROVEAN_pred, CADD_pred, MutationTaster_pred)

# dam / tol: per variant, number of the five calls containing "1" / "0".
damaging <- mutate(
  df_top5,
  dam = apply(df_top5[, 6:10], 1, function(x) length(grep("1", as.factor(x))))
)
tol_dam_algorithms_top5_s1 <- damaging %>%
  mutate(tol = apply(df_top5[, 6:10], 1, function(x) length(grep("0", as.factor(x))))) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

# Per ClinVar assertion: number of variants and number with prop.concordant == 1.
final_table_top5 <- tol_dam_algorithms_top5_s1 %>%
  group_by(clinical_significance) %>%
  summarise(count = n(), complete_concordance_num = length(which(prop.concordant == 1)))

# False concordance: Pathogenic variants with no damaging call, Benign variants
# with no tolerant call.
falsep <- length(which(tol_dam_algorithms_top5_s1$dam == 0 & tol_dam_algorithms_top5_s1$clinical_significance == "Pathogenic"))
falseb <- length(which(tol_dam_algorithms_top5_s1$tol == 0 & tol_dam_algorithms_top5_s1$clinical_significance == "Benign"))

# Align the false-concordance counts with the summary rows by assertion label
# rather than by row position, so a change in group order cannot attach a
# count to the wrong class.
falseconc <- unname(
  c(Benign = falseb, Pathogenic = falsep)[as.character(final_table_top5$clinical_significance)]
)

# Net concordance excludes the false concordances; percentages are relative to
# the number of variants in each class.
final_table_top5 <- cbind(final_table_top5, falseconc) %>%
  mutate(Net = complete_concordance_num - falseconc) %>%
  mutate(Percent_concordant = round((Net / count) * 100, digits = 2)) %>%
  mutate(Percent_false_concordant = round((falseconc / count) * 100, 2)) %>%
  select(clinical_significance, count, Percent_concordant, falseconc, Percent_false_concordant)
colnames(final_table_top5) <- c("Clinical Significance", "Total variants", "Concordance(%)", "False Concordance", "False Concordance(%)")

# This table can also be drawn separately as follows:
formattable(final_table_top5, list("Clinical Significance" = formatter("span", style = function(x) ifelse(x == "Benign", "color:green", "color:red"))))

Clinical Significance	Total variants	Concordance(%)	False Concordance	False Concordance(%)
Benign	2555	29.47	280	10.96
Pathogenic	5831	81.67	43	0.74

Generate data for Additional Table 1: rows 7 and 8. Three commonly used algorithms as identified by literature search: CADD, Polyphen-2, SIFT

# Additional Table 1, rows 7 and 8: three commonly used algorithms, ClinVar 1*
# or above, variants without missing data.
# Annotation columns (id to hgvs_p) plus the three selected prediction columns,
# which land in positions 6 to 8.
df_top3 <- all_alg_ok_preds %>%
  select(id:hgvs_p, SIFT_pred, Polyphen2_pred, CADD_pred)

# dam / tol: per variant, number of the three calls containing "1" / "0".
damaging <- mutate(
  df_top3,
  dam = apply(df_top3[, 6:8], 1, function(x) length(grep("1", as.factor(x))))
)
tol_dam_algorithms_top3_s1 <- damaging %>%
  mutate(tol = apply(df_top3[, 6:8], 1, function(x) length(grep("0", as.factor(x))))) %>%
  mutate(prop.concordant = ifelse(dam == 0, 1, dam / (dam + tol)))

# Per ClinVar assertion: number of variants and number with prop.concordant == 1.
final_table_top3 <- tol_dam_algorithms_top3_s1 %>%
  group_by(clinical_significance) %>%
  summarise(count = n(), complete_concordance_num = length(which(prop.concordant == 1)))

# False concordance: Pathogenic variants with no damaging call, Benign variants
# with no tolerant call.
falsep <- length(which(tol_dam_algorithms_top3_s1$dam == 0 & tol_dam_algorithms_top3_s1$clinical_significance == "Pathogenic"))
falseb <- length(which(tol_dam_algorithms_top3_s1$tol == 0 & tol_dam_algorithms_top3_s1$clinical_significance == "Benign"))

# Align the false-concordance counts with the summary rows by assertion label
# rather than by row position, so a change in group order cannot attach a
# count to the wrong class.
falseconc <- unname(
  c(Benign = falseb, Pathogenic = falsep)[as.character(final_table_top3$clinical_significance)]
)

# Net concordance excludes the false concordances; percentages are relative to
# the number of variants in each class.
final_table_top3 <- cbind(final_table_top3, falseconc) %>%
  mutate(Net = complete_concordance_num - falseconc) %>%
  mutate(Percent_concordant = round((Net / count) * 100, digits = 2)) %>%
  mutate(Percent_false_concordant = round((falseconc / count) * 100, 2)) %>%
  select(clinical_significance, count, Percent_concordant, falseconc, Percent_false_concordant)
colnames(final_table_top3) <- c("Clinical Significance", "Total variants", "Concordance(%)", "False Concordance", "False Concordance(%)")

# This table can also be drawn separately as follows:
formattable(final_table_top3, list("Clinical Significance" = formatter("span", style = function(x) ifelse(x == "Benign", "color:green", "color:red"))))

Clinical Significance	Total variants	Concordance(%)	False Concordance	False Concordance(%)
Benign	2555	44.15	494	19.33
Pathogenic	5831	86.76	97	1.66

Additional Table 1: Compiling all the different combinations of algorithms in a single table. This is the table that is plotted in the manuscript.

# Tag each partial table with the variant set and the algorithm set it covers.
final_table_all <- mutate(
  final_table_all,
  Variants = "ClinVar * or above", Algorithms = "18"
)
final_table_twostars <- mutate(
  final_table_twostars,
  Variants = "ClinVar ** or above", Algorithms = "18"
)
final_table_top5 <- mutate(
  final_table_top5,
  Variants = "ClinVar * or above", Algorithms = "Polyphen2;SIFT;CADD;PROVEAN;MTaster"
)
final_table_top3 <- mutate(
  final_table_top3,
  Variants = "ClinVar * or above", Algorithms = "Polyphen2;SIFT;CADD"
)

# Stack the four partial tables and relabel the first column.
final_table_fin <- rbind(
  final_table_all,
  final_table_twostars,
  final_table_top5,
  final_table_top3
)
colnames(final_table_fin)[1] <- "ClinVar Assertion"

# Draw the table: "Benign" in green, every other assertion in red.
formattable(
  final_table_fin,
  list(
    "ClinVar Assertion" = formatter(
      "span",
      style = function(assertion) ifelse(assertion == "Benign", "color:green", "color:red")
    )
  )
)

ClinVar Assertion	Total variants	Concordance(%)	False Concordance	False Concordance(%)	Variants	Algorithms
Benign	2555	3.21	40	1.57	ClinVar * or above	18
Pathogenic	5831	41.47	1	0.02	ClinVar * or above	18
Benign	470	2.13	11	2.34	ClinVar ** or above	18
Pathogenic	848	50.59	0	0.00	ClinVar ** or above	18
Benign	2555	29.47	280	10.96	ClinVar * or above	Polyphen2;SIFT;CADD;PROVEAN;MTaster
Pathogenic	5831	81.67	43	0.74	ClinVar * or above	Polyphen2;SIFT;CADD;PROVEAN;MTaster
Benign	2555	44.15	494	19.33	ClinVar * or above	Polyphen2;SIFT;CADD
Pathogenic	5831	86.76	97	1.66	ClinVar * or above	Polyphen2;SIFT;CADD

Generating Supplemental data 2-5 for concordances among different combinations of algorithms for variants without any missing data. Used for Figure 2A, 2B and Table 2. Uncomment to run the code.

# df_for_combo<-all_alg_ok_preds %>%select(hgvs_p,clinical_significance,LRT_pred:Genocanyon_pred)
# 
# df_fin<-data.frame(hgvs_p=df_for_combo$hgvs_p,clinsig=df_for_combo$clinical_significance,apply(df_for_combo[,3:20],2,function(x)as.numeric(as.character(x))))#convert to numeric 
# 
# df_fin_benign<-df_fin%>%filter(grepl("Benign",clinsig)) #select benign variant predictions
# df_fin_path<-df_fin%>%filter(grepl("Pathogenic",clinsig)) #select pathogenic variant predictions
# j_path2 = combn(x = df_fin_path[,3:20], m = 2, simplify = FALSE)
# sapply(j_path2, function(x) length(which(apply(x,1,sum) == 2 ))/NROW(x[complete.cases(x),]))->path_true_concordant2 
# sapply(j_path2, function(x) length(which(apply(x,1,sum) ==0 ))/NROW(x[complete.cases(x),]))->path_false_concordant2
# 
# df_path_2<-data.frame(True_concordance=path_true_concordant2,False_concordance=path_false_concordant2,Clinsig=rep("Pathogenic",length(j_path2)))
# 
# grp_name2newpath<-data.frame(t(sapply(j_path2,function(x) colnames(x)[1:2])))
# grp_name2_fin_newpath<-cbind(grp=c(1:nrow(grp_name2newpath)),grp_name2newpath)
# df_2newpath<-cbind(grp_name2_fin_newpath,df_path_2)
# 
# #Dataset: df_fin_benign = a subset of the 'all_alg_ok_preds' dataframe (variants without missing data) containing the Benign variants with predictions from 18 algorithms as above.
# j_ben2 = combn(x = df_fin_benign[,3:20], m = 2, simplify = FALSE)
# sapply(j_ben2, function(x) length(which(apply(x,1,sum)==0))/NROW(x[complete.cases(x),]))->ben_true_concordant2
# sapply(j_ben2, function(x) length(which(apply(x,1,sum)==2))/NROW(x[complete.cases(x),]))->ben_false_concordant2
# 
# df_ben_2<-data.frame(True_concordance=ben_true_concordant2,False_concordance=ben_false_concordant2,Clinsig=rep("Benign",length(j_ben2)))
# 
# 
# grp_name2newben<-data.frame(t(sapply(j_ben2,function(x) colnames(x)[1:2])))
# grp_name2_fin_newben<-cbind(grp=c(1:nrow(grp_name2newben)),grp_name2newben)
# df_2newben<-cbind(grp_name2_fin_newben,df_ben_2)
# 
# df_2<-rbind(df_2newpath,df_2newben)
# df_2<-df_2%>%mutate(num.algorithm=2)
# #write.table(df_2,"~/Your_Path/Supplemental_data_2.txt")
# 
# # #All combinations of 3 algorithms
# # #pathogenic variants
# j_path3 = combn(x = df_fin_path[,3:20], m = 3, simplify = FALSE)
# sapply(j_path3, function(x) length(which(apply(x,1,sum) == 3))/NROW(x[complete.cases(x),]))->path_true_concordant3
# sapply(j_path3, function(x) length(which(apply(x,1,sum) ==0))/NROW(x[complete.cases(x),]))->path_false_concordant3
# 
# df_path_3<-data.frame(True_concordance=path_true_concordant3,False_concordance=path_false_concordant3,Clinsig=rep("Pathogenic",length(j_path3)))
# 
# grp_name3newpath<-data.frame(t(sapply(j_path3,function(x) colnames(x)[1:3])))
# grp_name3_fin_newpath<-cbind(grp=c(1:nrow(grp_name3newpath)),grp_name3newpath)
# df_3newpath<-cbind(grp_name3_fin_newpath,df_path_3)
# 
# j_ben3 = combn(x = df_fin_benign[,3:20], m = 3, simplify = FALSE)
# sapply(j_ben3, function(x) length(which(apply(x,1,sum)==0))/NROW(x[complete.cases(x),]))->ben_true_concordant3
# sapply(j_ben3, function(x) length(which(apply(x,1,sum)==3))/NROW(x[complete.cases(x),]))->ben_false_concordant3
# 
# df_ben_3<-data.frame(True_concordance=ben_true_concordant3,False_concordance=ben_false_concordant3,Clinsig=rep("Benign",length(j_ben3)))
# 
# grp_name3newben<-data.frame(t(sapply(j_ben3,function(x) colnames(x)[1:3])))
# grp_name3_fin_newben<-cbind(grp=c(1:nrow(grp_name3newben)),grp_name3newben)
# df_3newben<-cbind(grp_name3_fin_newben,df_ben_3)
# 
# df_3<-rbind(df_3newpath,df_3newben)
# df_3<-df_3%>%mutate(num.algorithm=3)
# #write.table(df_3,"~/Your_Path/Supplemental_data_3.txt")
# 
# # #All combinations of 4 algorithms
# j_path4 = combn(x = df_fin_path[,3:20], m = 4, simplify = FALSE)
# sapply(j_path4, function(x) length(which(apply(x,1,sum) == 4 ))/NROW(x[complete.cases(x),]))->path_true_concordant4
# sapply(j_path4, function(x) length(which(apply(x,1,sum) ==0 ))/NROW(x[complete.cases(x),]))->path_false_concordant4
# 
# df_path_4<-data.frame(True_concordance=path_true_concordant4,False_concordance=path_false_concordant4,Clinsig=rep("Pathogenic",length(j_path4)))
# 
# grp_name4newpath<-data.frame(t(sapply(j_path4,function(x) colnames(x)[1:4])))
# grp_name4_fin_newpath<-cbind(grp=c(1:nrow(grp_name4newpath)),grp_name4newpath)
# df_4newpath<-cbind(grp_name4_fin_newpath,df_path_4)
# 
# 
# j_ben4 = combn(x = df_fin_benign[,3:20], m = 4, simplify = FALSE)
# sapply(j_ben4, function(x) length(which(apply(x,1,sum)==0))/NROW(x[complete.cases(x),]))->ben_true_concordant4
# sapply(j_ben4, function(x) length(which(apply(x,1,sum)==4))/NROW(x[complete.cases(x),]))->ben_false_concordant4
# 
# df_ben_4<-data.frame(True_concordance=ben_true_concordant4,False_concordance=ben_false_concordant4,Clinsig=rep("Benign",length(j_ben4)))
# 
# grp_name4newben<-data.frame(t(sapply(j_ben4,function(x) colnames(x)[1:4])))
# grp_name4_fin_newben<-cbind(grp=c(1:nrow(grp_name4newben)),grp_name4newben)
# df_4newben<-cbind(grp_name4_fin_newben,df_ben_4)
# 
# df_4<-rbind(df_4newpath,df_4newben)
# df_4<-df_4%>%mutate(num.algorithm=4)
# #write.table(df_4,"~/Your_Path/Supplemental_data_4.txt")
# 
# # #All combinations of 5 algorithms
# j_path5 = combn(x = df_fin_path[,3:20], m = 5, simplify = FALSE)
# sapply(j_path5, function(x) length(which(apply(x,1,sum) == 5))/NROW(x[complete.cases(x),]))->path_true_concordant5
# sapply(j_path5, function(x) length(which(apply(x,1,sum) ==0))/NROW(x[complete.cases(x),]))->path_false_concordant5
# 
# df_path_5<-data.frame(True_concordance=path_true_concordant5,False_concordance=path_false_concordant5,Clinsig=rep("Pathogenic",length(j_path5)))
# 
# grp_name5newpath<-data.frame(t(sapply(j_path5,function(x) colnames(x)[1:5])))
# grp_name5_fin_newpath<-cbind(grp=c(1:nrow(grp_name5newpath)),grp_name5newpath)
# df_5newpath<-cbind(grp_name5_fin_newpath,df_path_5)
# 
# 
# 
# j_ben5 = combn(x = df_fin_benign[,3:20], m = 5, simplify = FALSE)
# sapply(j_ben5, function(x) length(which(apply(x,1,sum)==0))/NROW(x[complete.cases(x),]))->ben_true_concordant5
# sapply(j_ben5, function(x) length(which(apply(x,1,sum)==5))/NROW(x[complete.cases(x),]))->ben_false_concordant5
# 
# df_ben_5<-data.frame(True_concordance=ben_true_concordant5,False_concordance=ben_false_concordant5,Clinsig=rep("Benign",length(j_ben5)))
# 
# grp_name5newben<-data.frame(t(sapply(j_ben5,function(x) colnames(x)[1:5])))
# grp_name5_fin_newben<-cbind(grp=c(1:nrow(grp_name5newben)),grp_name5newben)
# df_5newben<-cbind(grp_name5_fin_newben,df_ben_5)
# 
# df_5<-rbind(df_5newpath,df_5newben)
# df_5<-df_5%>%mutate(num.algorithm=5)
# #write.table(df_5,"~/Your_Path/Supplemental_data_5.txt")

Figure 2

This figure plots the concordance among different combinations of algorithms for benign and pathogenic variants. It also identifies algorithms more likely to be in concordance by hierarchical clustering.

Figure 2A: Plotting histogram of true concordance among pairs of algorithms.

# Import Supplemental data 2 (concordance for pairs of algorithms) to plot Figure 2A.
df_for_plot <- read.table("~/Your_Path/Supplemental_data_2.txt", stringsAsFactors = FALSE, header = TRUE)

# Select the two columns the histogram needs by name rather than by position,
# so the plot does not depend on the column layout of the file.
df_for_hist <- df_for_plot[, c("True_concordance", "Clinsig")]

# Overlaid histograms of true concordance: Pathogenic pairs in orange (20 bins),
# Benign pairs in green (15 bins).
ggplot(df_for_hist, aes(True_concordance)) +
  geom_histogram(
    data = subset(df_for_hist, Clinsig == "Pathogenic"),
    fill = "orange", size = 1.5, alpha = .4, bins = 20
  ) +
  geom_histogram(
    data = subset(df_for_hist, Clinsig == "Benign"),
    fill = "green", size = 1.5, alpha = .4, bins = 15
  ) +
  ylab("Number of pairs of algorithms") +
  xlab("Proportion of variants with concordant predictions") +
  scale_y_continuous(limits = c(0, 50), expand = c(0.01, 0)) +
  scale_x_continuous(limits = c(0, 1), expand = c(0.01, 0)) +
  theme(
    panel.border = element_rect(colour = "black", size = 2),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    axis.text.x = element_text(size = 20, angle = 90, hjust = 1, vjust = .5),
    axis.text.y = element_text(size = 20),
    strip.text.x = element_text(size = 14, colour = "black", face = "bold")
  )

Figure 2B: Plot True vs False concordance for various combinations of algorithms.

# Data: import Supplemental data 3 through 5. These datasets contain the true
# and false concordances of various combinations of algorithms.
df_3 <- read.delim("~/Your_Path/Supplemental_data_3.txt", stringsAsFactors = FALSE)
df_4 <- read.delim("~/Your_Path/Supplemental_data_4.txt", stringsAsFactors = FALSE)
df_5 <- read.delim("~/Your_Path/Supplemental_data_5.txt", stringsAsFactors = FALSE)

# Stack the columns needed for the plot. They are selected by name (the
# original used positional slices 5:8, 6:9 and 7:10) so the result does not
# depend on how many algorithm-name columns precede them in each file.
concordance_cols <- c("True_concordance", "False_concordance", "Clinsig", "num.algorithm")
final_table_for_plot_new <- rbind(
  df_3[, concordance_cols],
  df_4[, concordance_cols],
  df_5[, concordance_cols]
)

pathogenic_rows <- subset(final_table_for_plot_new, Clinsig == "Pathogenic")
benign_rows <- subset(final_table_for_plot_new, Clinsig == "Benign")

# True vs. false concordance (in %), one panel per number of algorithms.
# Marginal rugs: pathogenic on the top/right edges, benign on the bottom/left.
ggplot(final_table_for_plot_new, aes(x = 100 * False_concordance, y = 100 * True_concordance)) +
  geom_point(size = 1, alpha = 0.4, aes(colour = Clinsig)) +
  scale_colour_manual(values = c("light green", "orange")) +
  xlab("False Concordance(%)") +
  scale_y_continuous(limits = c(0, 100)) +
  ylab("True concordance(%)") +
  facet_grid(. ~ num.algorithm) +
  geom_rug(data = pathogenic_rows, aes(colour = Clinsig), alpha = 0.5, sides = "tr") +
  geom_rug(data = benign_rows, aes(colour = Clinsig), alpha = 0.5, sides = "bl") +
  theme(
    axis.text.x = element_text(size = 16, angle = 90),
    axis.text.y = element_text(size = 16),
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    panel.border = element_rect(colour = "black", size = 2),
    legend.position = "none",
    strip.text.x = element_text(size = 20, face = "bold")
  )

Figure 2C: Hierarchical clustering of algorithms.

# Supplemental_data_1 was processed, clustered and saved as follows.
# df<-read.delim("~/Your_Path/Supplemental_data_1.txt", stringsAsFactors=FALSE)
# df_final_hclust<-df %>% select(id,dplyr::contains("_score")) %>% select(-CADD_score,-Eigen_score) %>% mutate(Eigen_raw_score=as.numeric(as.character(Eigen_raw_score))) %>% mutate(PROVEAN_score=-1*PROVEAN_score_new,FATHMM_score=-1*FATHMM_score_new,SIFT_score=-1*SIFT_score_new,LRT_score=-1*LRT_score) %>% select(-SIFT_score_new,-FATHMM_score_new,-PROVEAN_score_new) %>% rename_(.dots=setNames(names(.),gsub("_score", "", names(.))))%>% rename_(.dots=setNames(names(.),gsub("_raw", "", names(.))))%>% rename_(.dots=setNames(names(.),gsub("_new", "", names(.)))) # changed contains to dplyr::contains .
# df_forpvclust<-df_final_hclust[,-1]
# pvclust(scale(df_forpvclust),method.hclust = "ward.D2",nboot = 1000)->fit
# saveRDS(fit,"~/Your_Path/Supplemental_data_16.rds")

# The following code was used for plotting Figure 2C. 'fit' (the pvclust
# result) is loaded from Supplemental_data_16.rds, the same file name used by
# the saveRDS() call above.
fit <- readRDS("~/Your_Path/Supplemental_data_16.rds")

# Draw the dendrogram with thicker branches and default-sized labels.
as.dendrogram(fit) %>%
  hang.dendrogram() %>%
  dendextend::set("branches_lwd", 4) %>%
  dendextend::set("labels_cex", 1) %>%
  plot()

# Add pvclust's highlight rectangles at alpha = 0.99.
pvrect(fit, alpha = 0.99)

Figure 3

Performance of algorithms on different datasets.

Figure 3: AUCs of each algorithm on various datasets included in the manuscript.

#The AUCs for the ROC curve of different algorithms were generated using the following general code. 
#Here 'dataset' refers to any of the datasets in the column labels of Figures 3A, 3B or Additional Figure 3. These correspond to Supplemental data 1, and Supplemental data 6 through Supplemental data 14 and Supplemental data 18 and 19, which were processed further and used in the general code below.

# library(OptimalCutpoints)
# library(data.table)
# #Getting AUCs
# auc_SpSe <- function(x) optimal.cutpoints(X = "Score", status = "labels", tag.healthy = 0, methods = "MaxSpSe", data=x,conf.level = 0.99)[1]$MaxSpSe$Global$measures.acc$AUC
# We used AUC for this manuscript. However the optimal.cutpoints function also provides a list with all possible cutoffs, sensitivity, specificity, predictive Values,  the sample size for both healthy and diseased populations among other metrics.
# setDT(dataset)[,auc_SpSe(.SD), by=Algorithm]->measures_SpSe_dataset
# 
# data.frame(measures_SpSe_dataset)->dauc1
# dauc1%>% group_by(Algorithm) %>% summarize(Max_AUC= nth(V1,3),Min_AUC=nth(V1,2),Mean_AUC= nth(V1,1))->dauc_Mean_CI_dataset
# The AUCs from the different datasets were compiled together in Supplemental data 15. Import this file to generate Figures 3A, 3B, Additional Figure 3 and Additional Figure 2.

# Import the compiled AUC file (Supplemental_data_15).
AUC <- read.delim("~/Your_Path/Supplemental_data_15.txt")

# Label each row by whether its AUC exceeds 0.9 (drives the point colour in
# the Figure 3 plots).
AUC <- AUC %>%
  mutate(Performance_AUC = ifelse(AUC > 0.9, "AUC>0.9", "AUC<0.9"))

# Tag algorithms whose name matches one of the patterns below as
# "Metapredictor"; everything else is "other".
AUC_fin <- AUC %>%
  mutate(Type = ifelse(grepl("CADD|Condel|Mcap|REVEL|Meta|Eigen", Algorithm), "Metapredictor", "other"))

# Datasets for addressing type 1 circularity.
AUC_fin_type1 <- AUC_fin %>%
  filter(grepl("ClinVar|trainset|REVEL|predict", geneclass))
# Datasets with different variant properties.
AUC_fin_bio <- AUC_fin %>%
  filter(grepl("Status|Dominant|Recessive|TSG|Oncogenes|constraint|Exclude", geneclass))
# Datasets for addressing type 2 circularity.
AUC_fin_type2 <- AUC_fin %>%
  filter(grepl("ClinVar|varibench|balanced", geneclass)) %>%
  filter(!grepl("new", geneclass)) %>%
  filter(!grepl("Sep2016|Dec2016", geneclass))

# Turn the Algorithm column of `auc_tbl` into a factor whose levels are sorted
# by increasing mean AUC on the `reference` dataset (a value of `geneclass`).
#
# The original code assigned the reordered Algorithm vector of the reference
# subset to the whole column, which relies on silent recycling and on every
# dataset listing the algorithms in exactly the same row order. Here each row
# keeps its own Algorithm value and only the factor levels are changed.
# Algorithms absent from the reference dataset are appended after the ranked
# ones so that no value becomes NA.
order_algorithms_by_reference_auc <- function(auc_tbl, reference = "ClinVar Status *") {
  ref_rows <- subset(auc_tbl, geneclass == reference)
  ref_scores <- tapply(ref_rows$AUC, as.character(ref_rows$Algorithm), mean)
  ranked <- names(sort(ref_scores))
  unranked <- setdiff(unique(as.character(auc_tbl$Algorithm)), ranked)
  auc_tbl$Algorithm <- factor(as.character(auc_tbl$Algorithm), levels = c(ranked, unranked))
  auc_tbl
}

AUC_fin_bio <- order_algorithms_by_reference_auc(AUC_fin_bio)
AUC_fin_bio$geneclass<-factor(AUC_fin_bio$geneclass,levels=c("ClinVar Status *","ClinVar Status **","Exclude LP and LB","Dominant","Recessive" ,"Oncogenes","TSG","High  constraint ",  "Low  constraint ",   "Medium constraint "))

AUC_fin_type1 <- order_algorithms_by_reference_auc(AUC_fin_type1)
AUC_fin_type1$geneclass<-factor(AUC_fin_type1$geneclass,levels=c("ClinVar Status *","ClinVar Status **","ClinVar Oct2015 to Dec2016","ClinVar Sep2016 to Mar2017","predictSNPselected","REVEL_testset","No MetaSVM/LR trainset","No MetaSVM/LR trainset **"))

AUC_fin_type2 <- order_algorithms_by_reference_auc(AUC_fin_type2)
AUC_fin_type2$geneclass<-factor(AUC_fin_type2$geneclass,levels=c("ClinVar Status *","ClinVar Status **","varibenchselected","balanced *","balanced **"))

# Highlight the ensemble predictors (also referred to as Metapredictors) by
# coloring them green. `alg` has one row per Algorithm/Type pair, in factor
# level order, so `colvec` lines up with the algorithm axis labels.
alg <- AUC_fin_type1 %>%
  group_by(Algorithm, Type) %>%
  summarize()
colvec <- ifelse(alg$Type == "Metapredictor", "green", "black")

Plot Figure 3A

# Figure 3A: per-algorithm AUC (point) with its Min_AUC-Max_AUC range (error
# bar), one panel per dataset in AUC_fin_bio. The dashed red line marks
# AUC = 0.9; axis labels are coloured with `colvec`.
ggplot(
  AUC_fin_bio,
  aes(x = Algorithm, y = AUC, ymin = Min_AUC, ymax = Max_AUC, color = Performance_AUC)
) +
  geom_errorbar(width = .5) +
  geom_point(size = 2) +
  scale_color_manual(values = c("salmon", "blue")) +
  coord_flip() +
  ylab("AUC") +
  facet_grid(. ~ geneclass) +
  geom_hline(yintercept = 0.9, color = "red", linetype = "longdash") +
  ylim(0.36, 1) +
  theme(
    legend.text = element_text(size = 20, face = "bold"),
    legend.title = element_text(size = 20, face = "bold"),
    legend.position = "top",
    panel.border = element_rect(colour = "black", size = 2),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5),
    axis.text.y = element_text(size = 20, colour = colvec),
    strip.text.x = element_text(size = 14, face = "bold")
  )

Plot Figure 3B

# Figure 3B: per-algorithm AUC (point) with its Min_AUC-Max_AUC range (error
# bar), one panel per dataset in AUC_fin_type1. The dashed red line marks
# AUC = 0.9; axis labels are coloured with `colvec`.
ggplot(
  AUC_fin_type1,
  aes(x = Algorithm, y = AUC, ymin = Min_AUC, ymax = Max_AUC, color = Performance_AUC)
) +
  geom_errorbar(width = .5) +
  geom_point(size = 2) +
  scale_color_manual(values = c("salmon", "blue")) +
  coord_flip() +
  ylab("AUC") +
  facet_grid(. ~ geneclass) +
  geom_hline(yintercept = 0.9, color = "red", linetype = "longdash") +
  ylim(0.36, 1) +
  theme(
    legend.text = element_text(size = 20, face = "bold"),
    legend.title = element_text(size = 20, face = "bold"),
    legend.position = "top",
    panel.border = element_rect(colour = "black", size = 2),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5),
    axis.text.y = element_text(size = 20, colour = colvec),
    strip.text.x = element_text(size = 14, face = "bold")
  )

Plot Additional Figure 3

# Additional Figure 3: per-algorithm AUC (point) with its Min_AUC-Max_AUC
# range (error bar), one panel per dataset in AUC_fin_type2. The dashed red
# line marks AUC = 0.9; axis labels are coloured with `colvec`.
ggplot(
  AUC_fin_type2,
  aes(x = Algorithm, y = AUC, ymin = Min_AUC, ymax = Max_AUC, color = Performance_AUC)
) +
  geom_errorbar(width = .5) +
  geom_point(size = 2) +
  scale_color_manual(values = c("salmon", "blue")) +
  coord_flip() +
  ylab("AUC") +
  facet_grid(. ~ geneclass) +
  geom_hline(yintercept = 0.9, color = "red", linetype = "longdash") +
  ylim(0.36, 1) +
  theme(
    legend.text = element_text(size = 20, face = "bold"),
    legend.title = element_text(size = 20, face = "bold"),
    legend.position = "top",
    panel.border = element_rect(colour = "black", size = 2),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5),
    axis.text.y = element_text(size = 20, colour = colvec),
    strip.text.x = element_text(size = 14, face = "bold")
  )

Plot Additional Figure 2

# Data: AUC_fin from above.
# Order the algorithms by decreasing mean AUC on the "ClinVar Status *"
# dataset. Only the factor levels are changed; each row keeps its own
# Algorithm value. (The original assigned the reordered Algorithm vector of
# the reference subset to the whole column, which relies on silent recycling
# and on every dataset listing the algorithms in the same row order.)
# Algorithms absent from the reference dataset are appended after the ranked
# ones so that no value becomes NA.
ref_auc <- subset(AUC_fin, geneclass == "ClinVar Status *")
ref_scores <- tapply(ref_auc$AUC, as.character(ref_auc$Algorithm), function(x) -mean(x))
ranked_algorithms <- names(sort(ref_scores))
unranked_algorithms <- setdiff(unique(as.character(AUC_fin$Algorithm)), ranked_algorithms)
AUC_fin$Algorithm <- factor(
  as.character(AUC_fin$Algorithm),
  levels = c(ranked_algorithms, unranked_algorithms)
)

# One colour per dataset (geneclass) for the jittered points.
my_col<-c("#CA0011","#C26300","#AEFA1E","#25F989","#0B4627","#53D61F","#377717","#1B4F0D","#D1C361","#1A20CB","red","salmon","brown","#e1e11e","gray","black","purple","maroon")

# One box plot per algorithm (each in its own facet) with the per-dataset AUCs
# overlaid as jittered points. The dashed red line marks AUC = 0.8.
# `scales` is spelled out in full instead of relying on partial matching of
# `scale`.
ggplot(AUC_fin, aes(x = Algorithm, y = AUC)) +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(. ~ Algorithm, scales = "free") +
  geom_hline(yintercept = 0.8, color = "red", linetype = "longdash") +
  geom_jitter(aes(color = geneclass), size = 3, alpha = 0.6, width = 0.2) +
  scale_color_manual(values = my_col) +
  scale_y_continuous(expand = c(0, .01)) +
  ylab("AUC") +
  theme(
    legend.text = element_text(size = 15, face = "bold"),
    legend.title = element_text(size = 15, face = "bold"),
    panel.border = element_rect(colour = "black", size = 2),
    axis.title.y = element_text(size = 20),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 20),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 14, face = "bold")
  )

Supplemental file for generating figures and tables

Rajarshi Ghosh

2017-08-20