1 Data Decorrelation Options

Here I’ll show the impact of decorrelating high-dimensional data sets.

library("FRESA.CAD")
library(whitening)

1.1 The ARCENE Data Set


# Read the training and validation tables with read.delim() defaults.
trainLabeled <- read.delim("./trainSet.txt")
validLabeled <- read.delim("./arcene_valid.txt")

# Recode the outcome as 0/1: strictly positive labels become 1, the rest 0.
trainLabeled$Labels <- 1 * (trainLabeled$Labels > 0)
validLabeled$Labels <- 1 * (validLabeled$Labels > 0)

# Coerce every column to numeric. lapply() always yields one element per
# column, and assigning through `[]` keeps the data-frame shape and the column
# names. The previous sapply()/1:ncol() form depended on sapply() simplifying
# to a matrix, which it does not do for degenerate inputs (e.g. zero rows).
trainLabeled[] <- lapply(trainLabeled, as.numeric)
validLabeled[] <- lapply(validLabeled, as.numeric)

# Class counts in the training set.
pander::pander(table(trainLabeled$Labels))
0 1
56 44
# Class counts in the validation set.
pander::pander(table(validLabeled[["Labels"]]))
0 1
56 44

1.2 Arcene Decorrelation Train and Test

Remove highly correlated features and set up the training and testing sets.


# The labelled tables become the working training and testing sets.
trainingSet <- trainLabeled
testingSet <- validLabeled

# Candidate predictors: every column name except the outcome.
vartoAdjust <- colnames(trainingSet)
vartoAdjust <- vartoAdjust[!(vartoAdjust %in% c("Labels"))]

# correlated_Remove() sees the training data only; its result is used below as
# the set of feature columns to keep in both sets.
noncorrelated <- correlated_Remove(data = trainingSet, fnames = vartoAdjust)

trainingSet <- trainingSet[, c("Labels", noncorrelated)]
testingSet <- testingSet[, c("Labels", noncorrelated)]

# Prefix the test row names with "T_" before the two sets are stacked.
rownames(testingSet) <- paste0("T_", rownames(testingSet))

# Stacked train + test table, plus the row names that identify each part.
ArceneSet <- rbind(trainingSet, testingSet)
trainIDS <- rownames(trainingSet)
testIDS <- rownames(testingSet)

# Copy of the training set with the outcome stored as a factor.
trainingSetc <- trainingSet
trainingSetc$Labels <- as.factor(trainingSetc$Labels)

1.2.1 PCA decorrelation


# Fit a centred and scaled PCA on the training predictors only.
pcaData <- prcomp(
  trainingSet[, noncorrelated],
  center = TRUE,
  scale. = TRUE
)

# Training scores come straight from the fit; the testing rows are projected
# with predict(), so they reuse the training rotation, centring and scaling.
pcaTraining <- as.data.frame(
  cbind(Labels = trainingSet$Labels, pcaData$x)
)
pcaTesting <- as.data.frame(
  cbind(
    Labels = testingSet$Labels,
    predict(pcaData, testingSet[, noncorrelated])
  )
)

# Factor-outcome copy of the PCA training scores.
pcaTrainingc <- pcaTraining
pcaTrainingc$Labels <- as.factor(pcaTrainingc$Labels)

#COVarcene <- cov(trainingSet)
#W.ZCAcor = whiteningMatrix(COVarcene, method="ZCA-cor")

1.2.2 Linear decorrelation


# Decorrelate the stacked table with type "NZLM", using the training set as
# the reference data and "Labels" as the outcome.
LMDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 0.05,
  type = "NZLM"
)
#> 5761 :66 :( 500 ){4906 :}[]( 1328 ){3509 :}[]( 1343 ){2117 :}[]( 921 ){905 :}[]( 426 ){205 :}[]( 247 ){61 :}[]( 180 ){27 :}[]( 151 ){18 :}[]( 141 ){6 :}[]( 137 ){2 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]
# Drop the "featureMatrix" attribute from the result.
attr(LMDecorrelated, "featureMatrix") <- NULL

# Split back into train / test rows by row name; the training copy carries a
# factor outcome.
dataTrainLMDecorrelatedc <- LMDecorrelated[trainIDS, ]
dataTrainLMDecorrelatedc$Labels <- as.factor(dataTrainLMDecorrelatedc$Labels)
dataTestLMDecorrelated <- LMDecorrelated[testIDS, ]

1.2.3 Linear decorrelation without the outcome


# Same "NZLM" decorrelation, but run on the predictor columns only: no Outcome
# is passed and the Labels column is excluded from both inputs.
LM2Decorrelated <- featureDecorrelation(
  ArceneSet[, noncorrelated],
  refdata = trainingSet[, noncorrelated],
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "NZLM"
)
#> 5761 :0 :( 500 ){4939 :}[]( 1321 ){3361 :}[]( 1328 ){1985 :}[]( 962 ){857 :}[]( 514 ){196 :}[]( 336 ){62 :}[]( 267 ){18 :}[]( 244 ){12 :}[]( 234 ){2 :}[]( 233 ){1 :}[]( 232 ){0 :}[]
# Drop the "featureMatrix" attribute, then re-attach the outcome that was left
# out of the call above.
attr(LM2Decorrelated, "featureMatrix") <- NULL
LM2Decorrelated$Labels <- ArceneSet$Labels

# Split back into train / test rows by row name; the training copy carries a
# factor outcome.
dataTrainLM2Decorrelatedc <- LM2Decorrelated[trainIDS, ]
dataTrainLM2Decorrelatedc$Labels <- as.factor(dataTrainLM2Decorrelatedc$Labels)
dataTestLM2Decorrelated <- LM2Decorrelated[testIDS, ]

1.2.4 Loess decorrelation


# Decorrelate the stacked table with type "LOESS", using the training set as
# the reference data and "Labels" as the outcome.
# `degree` is spelled out in full: the earlier `degre` could only take effect
# through partial argument matching further down the call chain.
# NOTE(review): degree/span/family are presumably forwarded to stats::loess;
# confirm against the FRESA.CAD documentation.
LOESSDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "LOESS",
  degree = 2,
  span = 1.0,
  family = "symmetric"
)
#> 5761 :66 :( 500 ){4900 :}[]( 1372 ){3167 :}[]( 1376 ){1773 :}[]( 870 ){601 :}[]( 462 ){160 :}[]( 361 ){78 :}[]( 325 ){57 :}[]( 300 ){56 :}[]( 289 ){42 :}[]( 284 ){37 :}[]( 285 ){36 :}[]( 284 ){35 :}[]( 284 ){34 :}[]( 284 ){33 :}[]( 283 ){32 :}[]( 282 ){30 :}[]( 282 ){30 :}[]( 281 ){29 :}[]( 281 ){28 :}[]( 281 ){28 :}[]
# Drop the "featureMatrix" attribute from the result.
attr(LOESSDecorrelated, "featureMatrix") <- NULL


# Split back into train / test rows by row name; the training copy carries a
# factor outcome.
dataTrainDecorrelatedc <- LOESSDecorrelated[trainIDS, ]
dataTestDecorrelated <- LOESSDecorrelated[testIDS, ]
dataTrainDecorrelatedc$Labels <- as.factor(dataTrainDecorrelatedc$Labels)

# Pairwise scatter plots of one feature (V1476) under LOESS, LM and raw values.
plot(as.data.frame(cbind(
  LOESS = LOESSDecorrelated$V1476,
  lm = LMDecorrelated$V1476,
  raw = ArceneSet$V1476
)))

1.2.5 RLM decorrelation


# Decorrelate the stacked table with type "RLM", using the training set as the
# reference data and "Labels" as the outcome.
RLMDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "RLM"
)
#> 5761 :66 :( 500 ){4805 :}[]( 1335 ){3383 :}[]( 1413 ){2003 :}[]( 960 ){890 :}[]( 526 ){365 :}[]( 387 ){237 :}[]( 338 ){224 :}[]( 306 ){207 :}[]( 301 ){203 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]
# Drop the "featureMatrix" attribute from the result.
attr(RLMDecorrelated, "featureMatrix") <- NULL

# Split back into train / test rows by row name; the training copy carries a
# factor outcome.
dataTrainRLMDecorrelatedc <- RLMDecorrelated[trainIDS, ]
dataTrainRLMDecorrelatedc$Labels <- as.factor(dataTrainRLMDecorrelatedc$Labels)
dataTestRLMDecorrelated <- RLMDecorrelated[testIDS, ]

1.2.6 SPLINE decorrelation


# Decorrelate the stacked table with type "SPLINE" (nknots = 4), using the
# training set as the reference data and "Labels" as the outcome.
SPLINEDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "SPLINE",
  nknots = 4
)
#> 5761 :66 :( 500 ){4906 :}[]( 1376 ){3324 :}[]( 1368 ){1883 :}[]( 822 ){616 :}[]( 388 ){124 :}[]( 275 ){39 :}[]( 226 ){17 :}[]( 206 ){8 :}[]( 200 ){1 :}[]( 199 ){0 :}[]
# Drop the "featureMatrix" attribute from the result.
attr(SPLINEDecorrelated, "featureMatrix") <- NULL

# Split back into train / test rows by row name; the training copy carries a
# factor outcome.
dataTrainSPLINEDecorrelatedc <- SPLINEDecorrelated[trainIDS, ]
dataTrainSPLINEDecorrelatedc$Labels <- as.factor(dataTrainSPLINEDecorrelatedc$Labels)
dataTestSPLINEDecorrelated <- SPLINEDecorrelated[testIDS, ]

# Pairwise scatter plots of one feature (V1476) under SPLINE, RLM and raw values.
plot(as.data.frame(cbind(
  spline = SPLINEDecorrelated$V1476,
  rlm = RLMDecorrelated$V1476,
  raw = ArceneSet$V1476
)))

1.2.7 MARS decorrelation


# Decorrelate the stacked table with type "MARS" (nk = 3), using the training
# set as the reference data and "Labels" as the outcome.
MARSDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "MARS",
  nk = 3
)
#> 5761 :66 :( 500 ){4904 :}[]( 1412 ){3228 :}[]( 1366 ){1666 :}[]( 753 ){463 :}[]( 402 ){85 :}[]( 304 ){33 :}[]( 262 ){12 :}[]( 245 ){9 :}[]( 237 ){0 :}[]
# Drop the "featureMatrix" attribute from the result.
attr(MARSDecorrelated, "featureMatrix") <- NULL

# Split back into train / test rows by row name; the training copy carries a
# factor outcome.
dataTrainMARSDecorrelatedc <- MARSDecorrelated[trainIDS, ]
dataTrainMARSDecorrelatedc$Labels <- as.factor(dataTrainMARSDecorrelatedc$Labels)
# The spelling "MARSecorrelated" is kept on purpose: the MARS SVM evaluation
# code reads the test frame under exactly this name.
dataTestMARSecorrelated <- MARSDecorrelated[testIDS, ]

1.3 Filtered SVM and Decorrelation Comparison


# Snapshot every writable graphics parameter BEFORE changing anything, so that
# par(op) restores the complete state. Passing pty= in the same call as
# no.readonly made par() return only the previous pty value, which left cex
# (and any layout changed by heatmap.2) un-restored by par(op).
op <- par(no.readonly = TRUE)
#par(mfrow = c(2,2),cex = 0.5);
par(pty = "m", cex = 0.5)

# Filtered SVM fitted on the raw (correlation-pruned) training set.
# NOTE(review): the filter keeps at most `limit` features at the given pvalue,
# presumably; confirm against the filteredFit() documentation.
RAWFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  trainingSetc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)

# Absolute Spearman correlation among the selected features, with the diagonal
# zeroed so self-correlation does not dominate the colour scale.
cmat <- abs(cor(
  trainingSetc[, RAWFilteredSVM_ml$selectedfeatures],
  method = "spearman"
))
diag(cmat) <- 0

gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Raw Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)


# Test-set performance: observed labels next to the model predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(testingSet$Labels, predict(RAWFilteredSVM_ml, testingSet)),
  "RAW Filtered:SVM",
  cex = 0.97
)

# Restore the graphics state captured at the top of this block.
par(op)

# Filtered SVM fitted on the PCA scores of the training set. This fit uses a
# looser filter (pvalue = 0.20, limit = 0.2) than the other models.
PCAFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  pcaTrainingc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.20, limit = 0.2),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Show which components survived the filter.
PCAFilteredSVM_ml$selectedfeatures

# Test-set performance: observed labels next to the model predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(pcaTesting$Labels, predict(PCAFilteredSVM_ml, pcaTesting)),
  "PCA Filtered:SVM",
  cex = 0.97
)

# Re-apply the saved graphics parameters.
par(op)




# Filtered SVM fitted on the LM-decorrelated training set.
LMFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainLMDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)

# Absolute Spearman correlation among the selected features (diagonal zeroed).
cmat <- abs(cor(
  dataTrainLMDecorrelatedc[, LMFilteredSVM_ml$selectedfeatures],
  method = "spearman"
))
diag(cmat) <- 0

gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)

# Test-set performance: observed labels next to the model predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestLMDecorrelated$Labels,
    predict(LMFilteredSVM_ml, dataTestLMDecorrelated)
  ),
  "LM Decorrelated Filtered:SVM",
  cex = 0.97
)

# Re-apply the saved graphics parameters.
par(op)


# Filtered SVM fitted on the LM2 (outcome-free) decorrelated training set.
LM2FilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainLM2Decorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)

# Absolute Spearman correlation among the selected features (diagonal zeroed).
# The correlation is taken on the LM2 training frame, i.e. the data this model
# was fitted on; the LM frame used previously holds different feature values
# and need not contain every selected column.
cmat <- abs(cor(
  dataTrainLM2Decorrelatedc[, LM2FilteredSVM_ml$selectedfeatures],
  method = "spearman"
))
diag(cmat) <- 0

gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)

# Test-set performance: observed labels next to the model predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestLM2Decorrelated$Labels,
    predict(LM2FilteredSVM_ml, dataTestLM2Decorrelated)
  ),
  "LM2 Decorrelated Filtered:SVM",
  cex = 0.97
)

# Re-apply the saved graphics parameters.
par(op)




# Filtered SVM fitted on the RLM-decorrelated training set.
RLMFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainRLMDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)

# Absolute Spearman correlation among the selected features (diagonal zeroed).
# Computed on the RLM training frame, matching the other heatmaps in this
# comparison; the test frame was used here before.
cmat <- abs(cor(
  dataTrainRLMDecorrelatedc[, RLMFilteredSVM_ml$selectedfeatures],
  method = "spearman"
))
diag(cmat) <- 0

gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)

# Test-set performance: observed labels next to the model predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestRLMDecorrelated$Labels,
    predict(RLMFilteredSVM_ml, dataTestRLMDecorrelated)
  ),
  "RLM Decorrelated Filtered:SVM",
  cex = 0.97
)

# Re-apply the saved graphics parameters.
par(op)

# Filtered SVM fitted on the LOESS-decorrelated training set.
LOESSFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)

# Absolute Spearman correlation among the selected features (diagonal zeroed).
# Taken on the LOESS training frame (dataTrainDecorrelatedc), the data this
# model was fitted on, rather than the LM frame used previously.
cmat <- abs(cor(
  dataTrainDecorrelatedc[, LOESSFilteredSVM_ml$selectedfeatures],
  method = "spearman"
))
diag(cmat) <- 0

gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)

# Test-set performance: observed labels next to the model predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestDecorrelated$Labels,
    predict(LOESSFilteredSVM_ml, dataTestDecorrelated)
  ),
  "LOESS Decorrelated Filtered:SVM",
  cex = 0.97
)

# Re-apply the saved graphics parameters.
par(op)



# Filtered SVM fitted on the SPLINE-decorrelated training set.
SPLINEFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainSPLINEDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)

# Absolute Spearman correlation among the selected features (diagonal zeroed).
# Taken on the SPLINE training frame, the data this model was fitted on,
# rather than the LM frame used previously.
cmat <- abs(cor(
  dataTrainSPLINEDecorrelatedc[, SPLINEFilteredSVM_ml$selectedfeatures],
  method = "spearman"
))
diag(cmat) <- 0
gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)

# Test-set performance: observed labels next to the model predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestSPLINEDecorrelated$Labels,
    predict(SPLINEFilteredSVM_ml, dataTestSPLINEDecorrelated)
  ),
  "SPLINE Decorrelated Filtered:SVM",
  cex = 0.97
)

# Re-apply the saved graphics parameters.
par(op)

# Filtered SVM fitted on the MARS-decorrelated training set.
MARSFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainMARSDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)

# Absolute Spearman correlation among the selected features (diagonal zeroed).
# Taken on the MARS training frame, the data this model was fitted on, rather
# than the LM frame used previously.
cmat <- abs(cor(
  dataTrainMARSDecorrelatedc[, MARSFilteredSVM_ml$selectedfeatures],
  method = "spearman"
))
diag(cmat) <- 0

gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)

# Test-set performance: observed labels next to the model predictions.
# The test frame is read under its existing (misspelled) name.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestMARSecorrelated$Labels,
    predict(MARSFilteredSVM_ml, dataTestMARSecorrelated)
  ),
  "MARS Decorrelated Filtered:SVM",
  cex = 0.97
)

# Re-apply the saved graphics parameters.
par(op)