---
title: "Colon Cancer Bin"
author: "José Tamez-Peña"
date: "Feb, 2017"
output:
  word_document:
    fig_height: 6
    fig_width: 8
    toc: yes
---

Colon Data Set Analysis

The Libraries

library("epiR")
library("FRESA.CAD")
library(network)
library(GGally)
library("e1071")
library("gplots")
library("randomForest")
library(rpart)

The Parameters and Data Sets

# Resampling configuration shared by the modelling and plotting steps.
Loops   <- 10    # handed to FRESA.Model as CVfolds and to plotModels.ROC as theCVfolds
Repeats <- 5     # handed to FRESA.Model as repeats
filter  <- 0.05  # handed to FRESA.Model as filter.p.value

# Alternative location of the same file:
#ColonData <- read.delim("./Colon/cancerColonb.txt")
ColonData <- read.delim(file = "./cancerColonb.txt")

# The first column supplies the row labels; every remaining column forms the
# analysis table.
Colon <- ColonData[, -1]
row.names(Colon) <- ColonData[[1]]

Modeling with FRESA.CAD



# The results file name encodes the run settings:
# ColonModelBin_<Loops>_<Repeats>_<filter>_res.RDATA
filename <- paste(
  "ColonModelBin",
  Loops,
  Repeats,
  sprintf("%5.4f", filter),
  "res.RDATA",
  sep = "_"
)

# Fit the model with svm as the user-supplied fitting function; system.time()
# reports how long the fit took.
system.time(
  ColonModelBIN <- FRESA.Model(
    formula = Class ~ 1,
    Colon,
    CVfolds = Loops,
    repeats = Repeats,
    filter.p.value = filter,
    usrFitFun = svm
  )
)

# Store the fitted object on disk under the name built above.
save(ColonModelBIN, file = filename)

# To reuse a stored run instead of refitting:
#load(file=paste("./Colon/",filename,sep=""))

Summary Tables

# Render the coefficient table of the B:SWiMS model summary with four digits.
pander::pander(
  summary(ColonModelBIN$BSWiMS.model)$coefficients,
  digits = 4
)
Table continues below
  Estimate lower OR upper u.Accuracy r.Accuracy
H08393 0.0009409 1 1.001 1.001 0.8 0.8083
R36977 0.0008331 1.001 1.001 1.001 0.8083 0.8333
M22382 0.0001797 1 1 1 0.8083 0.7583
R87126 -0.0001431 0.9998 0.9999 0.9999 0.8083 0.8
M63391 -4.661e-05 0.9999 1 1 0.8333 0.8083
X63629 0.001393 1.001 1.001 1.002 0.7958 0.7792
H40095 8.335e-05 1 1 1 0.7625 0.6833
M76378.2 -0.0001263 0.9998 0.9999 0.9999 0.7583 0.8083
T47377 8.949e-05 1 1 1 0.725 0.775
J02854 -0.0002822 0.9997 0.9997 0.9998 0.7917 0.5958
X12671 0.0001964 1 1 1 0.75 0.6833
M76378 -0.0001534 0.9998 0.9998 0.9999 0.7583 0.725
J05032 0.00108 1.001 1.001 1.001 0.6625 0.6667
U09564 0.00496 1.004 1.005 1.006 0.7167 0.6583
U30825 0.001089 1.001 1.001 1.001 0.6958 0.7333
M76378.1 -5.291e-05 0.9999 0.9999 1 0.775 0.725
R84411 0.0004562 1 1 1.001 0.6875 0.6792
Z50753 -0.0004939 0.9993 0.9995 0.9997 0.7792 0.7958
T71025 -0.000118 0.9998 0.9999 0.9999 0.7333 0.6958
R10066 0.0002187 1 1 1 0.7458 0.6958
T92451 -7.543e-05 0.9999 0.9999 0.9999 0.7417 0.625
T62947 0.001664 1.001 1.002 1.002 0.725 0.7583
H43887 -3.067e-05 1 1 1 0.6833 0.7625
M36634 -0.001084 0.9986 0.9989 0.9992 0.6667 0.7
H55916 0.001342 1.001 1.001 1.002 0.625 0.7417
H06524 -0.001936 0.9977 0.9981 0.9984 0.7292 0.6208
H11084 0.0007596 1.001 1.001 1.001 0.7 0.6667
X12369 -0.0001579 0.9998 0.9998 0.9999 0.6667 0.6625
H64489 -4.463e-05 0.9999 1 1 0.6958 0.7458
T90350 0.001369 1.001 1.001 1.002 0.5958 0.7917
U14631 -0.0002922 0.9996 0.9997 0.9998 0.6792 0.6875
L12723 0.005501 1.004 1.006 1.007 0.6708 0.6292
M59040 0.003246 1.003 1.003 1.004 0.6292 0.6417
L07648 -0.0125 0.9862 0.9876 0.9889 0.6417 0.5667
H20709 -0.0002639 0.9997 0.9997 0.9998 0.6417 0.6292
D14812 0.001434 1.001 1.001 1.002 0.6208 0.7292
D25217 -0.0002523 0.9996 0.9997 0.9998 0.6833 0.75
H09719 0.03764 1.033 1.038 1.044 0.5667 0.6417
R88740 -0.001422 0.9982 0.9986 0.9989 0.6292 0.6708
T47383 -0.004492 0.9944 0.9955 0.9967 0.6583 0.7167
Table continues below
  full.Accuracy u.AUC r.AUC full.AUC IDI NRI
H08393 0.9083 0.8 0.8083 0.9083 0.1657 0.65
R36977 0.9333 0.8083 0.8333 0.9333 0.2888 1.117
M22382 0.8958 0.8083 0.7583 0.8958 0.2701 1.35
R87126 0.9083 0.8083 0.8 0.9083 0.2334 1.083
M63391 0.9333 0.8333 0.8083 0.9333 0.2553 1.333
X63629 0.9125 0.7958 0.7792 0.9125 0.3178 1.367
H40095 0.8667 0.7625 0.6833 0.8667 0.1962 0.65
M76378.2 0.8958 0.7583 0.8083 0.8958 0.2107 1.117
T47377 0.8167 0.725 0.775 0.8167 0.2125 0.8333
J02854 0.8583 0.7917 0.5958 0.8583 0.4804 1.383
X12671 0.8167 0.75 0.6833 0.8167 0.3981 1.233
M76378 0.8583 0.7583 0.725 0.8583 0.3779 1.217
J05032 0.8875 0.6625 0.6667 0.8875 0.3514 1.167
U09564 0.9083 0.7167 0.6583 0.9083 0.6029 1.65
U30825 0.8875 0.6958 0.7333 0.8875 0.3623 1.183
M76378.1 0.8167 0.775 0.725 0.8167 0.2283 1.133
R84411 0.8833 0.6875 0.6792 0.8833 0.4443 1.367
Z50753 0.9125 0.7792 0.7958 0.9125 0.3286 1.35
T71025 0.8875 0.7333 0.6958 0.8875 0.3236 1.283
R10066 0.85 0.7458 0.6958 0.85 0.2976 1.117
T92451 0.8625 0.7417 0.625 0.8625 0.3462 1.267
T62947 0.8583 0.725 0.7583 0.8583 0.2875 1.2
H43887 0.8667 0.6833 0.7625 0.8667 0.1141 0.5167
M36634 0.9 0.6667 0.7 0.9 0.3755 1.433
H55916 0.8625 0.625 0.7417 0.8625 0.3094 0.9833
H06524 0.8958 0.7292 0.6208 0.8958 0.5765 1.783
H11084 0.9 0.7 0.6667 0.9 0.3538 1.317
X12369 0.8875 0.6667 0.6625 0.8875 0.2874 1.317
H64489 0.85 0.6958 0.7458 0.85 0.2767 1.183
T90350 0.8583 0.5958 0.7917 0.8583 0.2377 1.383
U14631 0.8833 0.6792 0.6875 0.8833 0.3135 1.133
L12723 0.8417 0.6708 0.6292 0.8417 0.4916 1.367
M59040 0.8708 0.6292 0.6417 0.8708 0.5457 1.367
L07648 0.9542 0.6417 0.5667 0.9542 0.8099 1.767
H20709 0.8708 0.6417 0.6292 0.8708 0.4972 1.433
D14812 0.8958 0.6208 0.7292 0.8958 0.5032 1.5
D25217 0.8167 0.6833 0.75 0.8167 0.2311 1.217
H09719 0.9542 0.5667 0.6417 0.9542 0.7232 1.817
R88740 0.8417 0.6292 0.6708 0.8417 0.4519 1.267
T47383 0.9083 0.6583 0.7167 0.9083 0.4121 1.383
  z.IDI z.NRI
H08393 3.863 3.108
R36977 5.972 6.21
M22382 5.67 8.486
R87126 5 5.852
M63391 5.335 8.405
X63629 6.037 8.641
H40095 4.539 3.298
M76378.2 4.731 6.401
T47377 4.824 4.338
J02854 8.601 8.692
X12671 7.285 7.177
M76378 7.015 7.286
J05032 6.788 6.496
U09564 11.3 14.11
U30825 7.036 6.65
M76378.1 4.957 6.686
R84411 8.188 8.473
Z50753 6.15 8.403
T71025 6.254 7.735
R10066 5.706 6.32
T92451 6.724 7.676
T62947 5.634 7.15
H43887 3.626 2.599
M36634 7.729 9.517
H55916 6.218 5.504
H06524 10.29 17.79
H11084 7.534 8.609
X12369 5.994 8.189
H64489 5.435 6.777
T90350 5.089 8.675
U14631 6.335 6.464
L12723 8.801 8.413
M59040 9.989 8.489
L07648 18.1 16.95
H20709 9.247 9.434
D14812 8.866 10.29
D25217 5.014 6.93
H09719 14.13 19.93
R88740 8.064 7.479
T47383 7.661 9.4

B:SWiMS Heat Map Plots

# Remember the current graphics settings so they can be restored afterwards,
# then switch to a single-panel layout.
opg <- par(no.readonly = TRUE)
par(mfrow = c(1, 1))

# Heat map of Colon for the entries of the bagging analysis' RelativeFrequency
# element, with "Class" as the outcome column.
hm <- heatMaps(
  ColonModelBIN$BSWiMS.model$baggingAnalysis$RelativeFrequency,
  Outcome = "Class",
  data = Colon,
  hCluster = TRUE,
  Scale = TRUE,
  xlab = "Subject ID",
  transpose = TRUE,
  title = "B:SWIMS Features"
)
#> [1] 2

# Put the saved graphics settings back.
par(opg)

ROC Plots

# Accumulators for the cross-validation summary. One row per classifier is
# appended to each table, in the order the ROC curves are drawn.
AccCITable <- NULL
BErrorCITable <- NULL

# Draw the ROC curve of every classifier from its test predictions. list()
# evaluates its arguments left to right, so the plots appear in this order:
# LASSO, KNN, B:SWiMS, Ensemble B:SWiMS.
rocResults <- list(
  plotModels.ROC(ColonModelBIN$cvObject$LASSO.testPredictions, theCVfolds = Loops, main = "LASSO", cex = 0.90),
  plotModels.ROC(ColonModelBIN$cvObject$KNN.testPrediction, theCVfolds = Loops, main = "KNN", cex = 0.90),
  plotModels.ROC(ColonModelBIN$cvObject$Models.testPrediction, theCVfolds = Loops, predictor = "Prediction", main = "B:SWiMS", cex = 0.90),
  plotModels.ROC(ColonModelBIN$cvObject$Models.testPrediction, theCVfolds = Loops, predictor = "Ensemble.B.SWiMS", main = "Ensemble B:SWiMS ", cex = 0.90)
)

# Derive the diagnostic statistics from each prediction table and append them.
# rp and ci are left holding the values of the last classifier.
for (rp in rocResults) {
  ci <- epi.tests(rp$predictionTable)
  AccCITable <- rbind(AccCITable, ci$elements$diag.acc)
  # Balanced error = 1 - (sensitivity + specificity) / 2.
  # NOTE(review): if the sensitivity/specificity elements carry lower and upper
  # confidence bounds, 1 - x reverses their order -- confirm how the table is
  # consumed.
  BErrorCITable <- rbind(BErrorCITable, 1 - 0.5 * (ci$elements$sensitivity + ci$elements$specificity))
}

Support Vector Machine (SVM) Analysis


# Shift both user-fit-function prediction columns down by 0.5 before plotting.
# NOTE(review): this edits ColonModelBIN in place, so running this section a
# second time shifts the columns again -- confirm that is acceptable.
ColonModelBIN$cvObject$Models.testPrediction$usrFitFunction_Sel <-
  ColonModelBIN$cvObject$Models.testPrediction$usrFitFunction_Sel - 0.5
ColonModelBIN$cvObject$Models.testPrediction$usrFitFunction <-
  ColonModelBIN$cvObject$Models.testPrediction$usrFitFunction - 0.5

# Each entry pairs a prediction column with its plot title. For every pair the
# ROC curve is drawn and one row is appended to the accuracy and
# balanced-error tables.
for (svmRun in list(c("usrFitFunction", "Filtered:SVM"),
                    c("usrFitFunction_Sel", "B:SWiMS/SVM"))) {
  rp <- plotModels.ROC(
    ColonModelBIN$cvObject$Models.testPrediction,
    theCVfolds = Loops,
    predictor = svmRun[1],
    main = svmRun[2],
    cex = 0.90
  )

  ci <- epi.tests(rp$predictionTable)
  AccCITable <- rbind(AccCITable, ci$elements$diag.acc)
  # Balanced error = 1 - (sensitivity + specificity) / 2.
  BErrorCITable <- rbind(
    BErrorCITable,
    1 - 0.5 * (ci$elements$sensitivity + ci$elements$specificity)
  )
}

Barplots of Accuracy and Balanced Error


# Labels for the rows of AccCITable and BErrorCITable, in the order the rows
# were appended: LASSO, KNN, B:SWiMS, its ensemble, then the two SVM variants.
CVthesets <- c("LASSO","KNN","B:SWiMS","B:SWiMS Ensemble","SVM:Filtered","SVM:BSWIMS")

# The labels are matched to the table rows by position only, so stop early if
# the tables do not hold exactly one row per label (for example after
# re-running a single section and appending rows twice).
if (NROW(AccCITable) != length(CVthesets) || NROW(BErrorCITable) != length(CVthesets)) {
  stop("AccCITable and BErrorCITable must each have one row per entry of CVthesets",
       call. = FALSE)
}

# Bar plot of the accuracy rows, legend in the bottom-right corner.
bp <- barPlotCiError(
  as.matrix(AccCITable),
  metricname = "Accuracy",
  thesets = CVthesets,
  themethod = "CV",
  main = "Accuracy",
  args.legend = list(x = "bottomright")
)

# Bar plot of the balanced-error rows, legend in the top-right corner.
bp <- barPlotCiError(
  as.matrix(BErrorCITable),
  metricname = "Balanced Error",
  thesets = CVthesets,
  themethod = "CV",
  main = "Balanced Error",
  args.legend = list(x = "topright")
)

B:SWiMS Feature Plots

# Bag all B:SWiMS formulas collected during cross-validation into one LOGIT
# model of Class.
baggColonBSWiMS <- baggedModel(
  ColonModelBIN$cvObject$allBSWiMSFormulas.list,
  Colon,
  type = "LOGIT",
  Outcome = "Class"
)
#> 
#> Num. Models: 946  To Test: 159  TopFreq: 50  Thrf: 1  Removed: 43 
#> ..............................................................................................

# Number of formulas per cross-validation run (Loops * Repeats runs in total);
# used below to scale the formula network.
cf <- length(ColonModelBIN$cvObject$allBSWiMSFormulas.list) / (Loops * Repeats)

# Frequency threshold: a quarter of the number of cross-validation runs.
frac <- 0.25 * Loops * Repeats

# Candidate features are the names of coefEvolution after its first two
# entries; keep those whose frequency reaches the threshold.
namestoShow <- names(baggColonBSWiMS$coefEvolution)[-(1:2)]
namestoShow <- namestoShow[baggColonBSWiMS$frequencyTable[namestoShow] >= frac]

barplot(
  baggColonBSWiMS$frequencyTable[namestoShow],
  las = 2,
  cex.axis = 1.0,
  cex.names = 0.75,
  main = "B:SWiMS Feature Frequency"
)

# Heat map of the scaled formula network restricted to the kept features.
gplots::heatmap.2(
  cf * baggColonBSWiMS$formulaNetwork[namestoShow, namestoShow],
  trace = "none",
  mar = c(10, 10),
  main = "B:SWiMS Formula Network"
)

# Network graph of at most 11 features.
# NOTE(review): the graph takes the first fnshow rows/columns by position,
# whereas the heat map above selects by name -- confirm both refer to the same
# features.
fnshow <- min(11, length(namestoShow))
n <- network::network(
  cf * baggColonBSWiMS$formulaNetwork[1:fnshow, 1:fnshow],
  directed = FALSE,
  ignore.eval = FALSE,
  names.eval = "weights"
)

ggnet2(
  n,
  label = TRUE,
  size = "degree",
  size.cut = 3,
  size.min = 1,
  mode = "circle",
  edge.label = "weights",
  edge.label.size = 4
)

LASSO Feature Plots

# Bag the LASSO variable sets collected during cross-validation into one LOGIT
# model of Class.
baggColonLASSO <- baggedModel(
  ColonModelBIN$cvObject$LASSOVariables,
  Colon,
  type = "LOGIT",
  Outcome = "Class"
)
#> 
#> Num. Models: 51  To Test: 35  TopFreq: 48  Thrf: 1  Removed: 9 
#> .....

# Count the features whose frequency reaches frac (set in the B:SWiMS feature
# section) and cap the network graph at 11 features.
# NOTE(review): taking the first toshow entries presumes frequencyTable is
# sorted by decreasing frequency -- confirm.
toshow <- sum(baggColonLASSO$frequencyTable >= frac)
fnshow <- min(11, length(baggColonLASSO$frequencyTable))

barplot(
  baggColonLASSO$frequencyTable[1:toshow],
  las = 2,
  cex.axis = 1.0,
  cex.names = 0.75,
  main = "LASSO Feature Frequency"
)

# Heat map of the formula network for the first toshow features.
gplots::heatmap.2(
  baggColonLASSO$formulaNetwork[1:toshow, 1:toshow],
  trace = "none",
  mar = c(10, 10),
  main = "LASSO Formula Network"
)

# Network graph of the first fnshow features.
n <- network::network(
  baggColonLASSO$formulaNetwork[1:fnshow, 1:fnshow],
  directed = FALSE,
  ignore.eval = FALSE,
  names.eval = "weights"
)

ggnet2(
  n,
  label = TRUE,
  size = "degree",
  size.cut = 3,
  size.min = 1,
  mode = "circle",
  edge.label = "weights",
  edge.label.size = 4
)

Venn Diagrams

Here I explore which features are shared among the univariate analysis, the LASSO models, and the B:SWiMS models.


# Univariate set: features whose upper-tail p-value of ZUni stays below 0.05
# after Benjamini-Hochberg adjustment. lower.tail = FALSE computes the tail
# directly and avoids the precision loss of 1 - pnorm(z) for large z.
pvalues <- p.adjust(pnorm(ColonModelBIN$univariateAnalysis$ZUni, lower.tail = FALSE), "BH")
# which() drops NA p-values so they cannot introduce NA feature names.
topunivec <- as.character(ColonModelBIN$univariateAnalysis$Name[which(pvalues < 0.05)])

# B:SWiMS and LASSO sets: features whose bagged frequency reaches frac (set in
# the B:SWiMS feature section). `>=` matches the threshold used for the
# feature-frequency plots.
tob <- baggColonBSWiMS$frequencyTable >= frac
topBSwims <- as.character(names(baggColonBSWiMS$frequencyTable[tob]))
tob <- baggColonLASSO$frequencyTable >= frac
topLASSO <- as.character(names(baggColonLASSO$frequencyTable[tob]))

featurelist <- list(Univariate = topunivec, CVLASSO = topLASSO, BSWIMS = topBSwims)
vend <- gplots::venn(featurelist)
vgroups <- attr(vend, "intersections")

# List the features common to all three sets in the centre of the diagram.
# `[[` avoids partial name matching, and the guard skips legend() when the
# triple intersection is absent or empty (legend() fails on a zero-length
# label vector).
commonFeatures <- vgroups[["Univariate:CVLASSO:BSWIMS"]]
if (length(commonFeatures) > 0) {
  legend("center", commonFeatures, cex = 0.75)
}