Here I’ll show the impact of decorrelating high-dimensional data sets.
library("FRESA.CAD")
library(whitening)
# Load the training and validation tables (read.delim reads tab-delimited text).
trainLabeled <- read.delim("./trainSet.txt")
validLabeled <- read.delim("./arcene_valid.txt")
# Recode the outcome as 0/1: strictly positive labels become 1, the rest 0.
trainLabeled$Labels <- 1 * (trainLabeled$Labels > 0)
validLabeled$Labels <- 1 * (validLabeled$Labels > 0)
# Coerce every column to numeric in place. `df[] <- lapply(...)` keeps the
# data-frame shape and column names whatever the number of rows or columns,
# whereas sapply() changes its return type with the shape of its input and
# 1:ncol() misbehaves on a zero-column table.
trainLabeled[] <- lapply(trainLabeled, as.numeric)
validLabeled[] <- lapply(validLabeled, as.numeric)
# Class counts of the training set.
pander::pander(table(trainLabeled$Labels))
| 0 | 1 |
|---|---|
| 56 | 44 |
# Class counts of the validation set.
pander::pander(table(validLabeled[["Labels"]]))
| 0 | 1 |
|---|---|
| 56 | 44 |
Removing highly correlated features and setting up the training and testing sets
# Start from the loaded tables: train on the labelled training data and
# evaluate on the validation data.
trainingSet <- trainLabeled
testingSet <- validLabeled
# Candidate predictors: every column except the outcome.
vartoAdjust <- colnames(trainingSet)
vartoAdjust <- vartoAdjust[vartoAdjust != "Labels"]
# correlated_Remove() is run on the training set only; the features it returns
# are then kept (together with the outcome) in both sets.
noncorrelated <- correlated_Remove(data = trainingSet, fnames = vartoAdjust)
trainingSet <- trainingSet[, c("Labels", noncorrelated)]
testingSet <- testingSet[, c("Labels", noncorrelated)]
# Prefix the test row names with "T_" so they stay distinguishable from the
# training row names once both sets are stacked.
rownames(testingSet) <- paste0("T_", rownames(testingSet))
# Row identifiers used later to split the stacked table back into train/test.
trainIDS <- rownames(trainingSet)
testIDS <- rownames(testingSet)
# Stacked table: training rows first, then test rows.
ArceneSet <- rbind(trainingSet, testingSet)
# Copy of the training set with the outcome stored as a factor.
trainingSetc <- trainingSet
trainingSetc[["Labels"]] <- as.factor(trainingSet[["Labels"]])
# PCA fitted on the training features only, with centring and scaling enabled.
pcaData <- prcomp(
  trainingSet[, noncorrelated],
  center = TRUE,
  scale. = TRUE
)
# Training scores, with the outcome as the first column.
pcaTraining <- as.data.frame(
  cbind(Labels = trainingSet[["Labels"]], pcaData$x)
)
# Test features projected with the training-set PCA, outcome as first column.
pcaTesting <- as.data.frame(
  cbind(
    Labels = testingSet[["Labels"]],
    predict(pcaData, testingSet[, noncorrelated])
  )
)
# Copy of the training scores with the outcome stored as a factor.
pcaTrainingc <- pcaTraining
pcaTrainingc[["Labels"]] <- as.factor(pcaTraining[["Labels"]])
# Disabled whitening experiment, kept for reference.
#COVarcene <- cov(trainingSet)
#W.ZCAcor = whiteningMatrix(COVarcene, method="ZCA-cor")
# Decorrelate the stacked train + test table with type "NZLM", passing the
# outcome column and the training set as reference data.
# NOTE(review): presumably refdata restricts estimation to the training rows;
# confirm against the FRESA.CAD documentation.
LMDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 0.05,
  type = "NZLM"
)
#> 5761 :66 :( 500 ){4906 :}[]( 1328 ){3509 :}[]( 1343 ){2117 :}[]( 921 ){905 :}[]( 426 ){205 :}[]( 247 ){61 :}[]( 180 ){27 :}[]( 151 ){18 :}[]( 141 ){6 :}[]( 137 ){2 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]( 136 ){1 :}[]
# Drop the "featureMatrix" attribute before subsetting.
attr(LMDecorrelated, "featureMatrix") <- NULL
# Split back into test and train rows by row name.
dataTestLMDecorrelated <- LMDecorrelated[testIDS, ]
dataTrainLMDecorrelatedc <- LMDecorrelated[trainIDS, ]
# The training copy carries the outcome as a factor.
dataTrainLMDecorrelatedc[["Labels"]] <- as.factor(
  dataTrainLMDecorrelatedc[["Labels"]]
)
# Second "NZLM" run: only the feature columns are passed (no Outcome argument)
# and unipvalue is 1.0e-4.
LM2Decorrelated <- featureDecorrelation(
  ArceneSet[, noncorrelated],
  refdata = trainingSet[, noncorrelated],
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "NZLM"
)
#> 5761 :0 :( 500 ){4939 :}[]( 1321 ){3361 :}[]( 1328 ){1985 :}[]( 962 ){857 :}[]( 514 ){196 :}[]( 336 ){62 :}[]( 267 ){18 :}[]( 244 ){12 :}[]( 234 ){2 :}[]( 233 ){1 :}[]( 232 ){0 :}[]
# Drop the "featureMatrix" attribute before subsetting.
attr(LM2Decorrelated, "featureMatrix") <- NULL
# Re-attach the outcome, which was left out of the call above.
# NOTE(review): assumes the returned rows are in the same order as ArceneSet.
LM2Decorrelated[["Labels"]] <- ArceneSet[["Labels"]]
# Split back into test and train rows by row name.
dataTestLM2Decorrelated <- LM2Decorrelated[testIDS, ]
dataTrainLM2Decorrelatedc <- LM2Decorrelated[trainIDS, ]
# The training copy carries the outcome as a factor.
dataTrainLM2Decorrelatedc[["Labels"]] <- as.factor(
  dataTrainLM2Decorrelatedc[["Labels"]]
)
# Decorrelate the stacked table with type "LOESS".
# NOTE(review): the extra argument is spelled `degre`, not `degree`; whether it
# reaches the underlying smoother depends on how featureDecorrelation forwards
# its extra arguments - confirm before renaming.
LOESSDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "LOESS",
  degre = 2,
  span = 1.0,
  family = "symmetric"
)
#> 5761 :66 :( 500 ){4900 :}[]( 1372 ){3167 :}[]( 1376 ){1773 :}[]( 870 ){601 :}[]( 462 ){160 :}[]( 361 ){78 :}[]( 325 ){57 :}[]( 300 ){56 :}[]( 289 ){42 :}[]( 284 ){37 :}[]( 285 ){36 :}[]( 284 ){35 :}[]( 284 ){34 :}[]( 284 ){33 :}[]( 283 ){32 :}[]( 282 ){30 :}[]( 282 ){30 :}[]( 281 ){29 :}[]( 281 ){28 :}[]( 281 ){28 :}[]
# Drop the "featureMatrix" attribute before subsetting.
attr(LOESSDecorrelated, "featureMatrix") <- NULL
# Split back into test and train rows by row name.
dataTestDecorrelated <- LOESSDecorrelated[testIDS, ]
dataTrainDecorrelatedc <- LOESSDecorrelated[trainIDS, ]
# The training copy carries the outcome as a factor.
dataTrainDecorrelatedc[["Labels"]] <- as.factor(
  dataTrainDecorrelatedc[["Labels"]]
)
# Scatter-plot matrix of feature V1476: LOESS vs. LM decorrelated vs. raw.
plot(
  as.data.frame(
    cbind(
      LOESS = LOESSDecorrelated$V1476,
      lm = LMDecorrelated$V1476,
      raw = ArceneSet$V1476
    )
  )
)
# Decorrelate the stacked table with type "RLM", using the training set as
# reference data.
RLMDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "RLM"
)
#> 5761 :66 :( 500 ){4805 :}[]( 1335 ){3383 :}[]( 1413 ){2003 :}[]( 960 ){890 :}[]( 526 ){365 :}[]( 387 ){237 :}[]( 338 ){224 :}[]( 306 ){207 :}[]( 301 ){203 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]( 299 ){200 :}[]
# Drop the "featureMatrix" attribute before subsetting.
attr(RLMDecorrelated, "featureMatrix") <- NULL
# Split back into test and train rows by row name.
dataTestRLMDecorrelated <- RLMDecorrelated[testIDS, ]
dataTrainRLMDecorrelatedc <- RLMDecorrelated[trainIDS, ]
# The training copy carries the outcome as a factor.
dataTrainRLMDecorrelatedc[["Labels"]] <- as.factor(
  dataTrainRLMDecorrelatedc[["Labels"]]
)
# Decorrelate the stacked table with type "SPLINE" (nknots = 4), using the
# training set as reference data.
SPLINEDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "SPLINE",
  nknots = 4
)
#> 5761 :66 :( 500 ){4906 :}[]( 1376 ){3324 :}[]( 1368 ){1883 :}[]( 822 ){616 :}[]( 388 ){124 :}[]( 275 ){39 :}[]( 226 ){17 :}[]( 206 ){8 :}[]( 200 ){1 :}[]( 199 ){0 :}[]
# Drop the "featureMatrix" attribute before subsetting.
attr(SPLINEDecorrelated, "featureMatrix") <- NULL
# Split back into test and train rows by row name.
dataTestSPLINEDecorrelated <- SPLINEDecorrelated[testIDS, ]
dataTrainSPLINEDecorrelatedc <- SPLINEDecorrelated[trainIDS, ]
# The training copy carries the outcome as a factor.
dataTrainSPLINEDecorrelatedc[["Labels"]] <- as.factor(
  dataTrainSPLINEDecorrelatedc[["Labels"]]
)
# Scatter-plot matrix of feature V1476: SPLINE vs. RLM decorrelated vs. raw.
plot(
  as.data.frame(
    cbind(
      spline = SPLINEDecorrelated$V1476,
      rlm = RLMDecorrelated$V1476,
      raw = ArceneSet$V1476
    )
  )
)
# Decorrelate the stacked table with type "MARS" (nk = 3), using the training
# set as reference data.
MARSDecorrelated <- featureDecorrelation(
  ArceneSet,
  Outcome = "Labels",
  refdata = trainingSet,
  thr = 0.80,
  unipvalue = 1.0e-4,
  type = "MARS",
  nk = 3
)
#> 5761 :66 :( 500 ){4904 :}[]( 1412 ){3228 :}[]( 1366 ){1666 :}[]( 753 ){463 :}[]( 402 ){85 :}[]( 304 ){33 :}[]( 262 ){12 :}[]( 245 ){9 :}[]( 237 ){0 :}[]
# Drop the "featureMatrix" attribute before subsetting.
attr(MARSDecorrelated, "featureMatrix") <- NULL
# Split back into test and train rows by row name.
# The test-set name is spelled `dataTestMARSecorrelated` (no "D"); the MARS
# evaluation further down uses that same spelling, so it is kept as is.
dataTestMARSecorrelated <- MARSDecorrelated[testIDS, ]
dataTrainMARSDecorrelatedc <- MARSDecorrelated[trainIDS, ]
# The training copy carries the outcome as a factor.
dataTrainMARSDecorrelatedc[["Labels"]] <- as.factor(
  dataTrainMARSDecorrelatedc[["Labels"]]
)
# Set the plot type and keep the value returned by par() for later par(op).
# NOTE(review): because a parameter (pty) is set in this call, par() returns
# only the previous value of pty, so par(op) restores pty but not the cex
# changed below - confirm that this is intended.
op <- par(no.readonly = TRUE, pty = "m")
#par(mfrow = c(2,2),cex = 0.5);
par(cex = 0.5)
# Filtered SVM fitted on the raw (correlation-pruned) training set.
RAWFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  trainingSetc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Absolute Spearman correlation among the selected features (training data).
cmat <- cor(
  trainingSetc[, RAWFilteredSVM_ml$selectedfeatures],
  method = "spearman"
)
cmat <- abs(cmat)
# Zero the diagonal (self-correlations) before plotting.
diag(cmat) <- 0
gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Raw Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)
# Test-set evaluation: column 1 = observed labels, column 2 = predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(testingSet$Labels, predict(RAWFilteredSVM_ml, testingSet)),
  "RAW Filtered:SVM",
  cex = 0.97
)
par(op)
# Filtered SVM fitted on the PCA scores of the training set.
# NOTE(review): the filter settings differ from the other fits in this file
# (pvalue = 0.20, fractional limit = 0.2); presumably deliberate - confirm.
PCAFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  pcaTrainingc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.20, limit = 0.2),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Print the components retained by the filter.
PCAFilteredSVM_ml$selectedfeatures
# Test-set evaluation: column 1 = observed labels, column 2 = predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(pcaTesting$Labels, predict(PCAFilteredSVM_ml, pcaTesting)),
  "PCA Filtered:SVM",
  cex = 0.97
)
par(op)
# Filtered SVM fitted on the LM-decorrelated training set.
LMFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainLMDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Absolute Spearman correlation among the selected features (training data).
cmat <- cor(
  dataTrainLMDecorrelatedc[, LMFilteredSVM_ml$selectedfeatures],
  method = "spearman"
)
cmat <- abs(cmat)
# Zero the diagonal (self-correlations) before plotting.
diag(cmat) <- 0
gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)
# Test-set evaluation: column 1 = observed labels, column 2 = predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestLMDecorrelated$Labels,
    predict(LMFilteredSVM_ml, dataTestLMDecorrelated)
  ),
  "LM Decorrelated Filtered:SVM",
  cex = 0.97
)
par(op)
# Filtered SVM fitted on the LM2-decorrelated training set.
LM2FilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainLM2Decorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Absolute Spearman correlation among the selected features. It must be
# computed on the LM2 training data the model was fitted on, not on the
# LM-decorrelated set, which is a different transformation of the features.
cmat <- abs(
  cor(
    dataTrainLM2Decorrelatedc[, LM2FilteredSVM_ml$selectedfeatures],
    method = "spearman"
  )
)
# Zero the diagonal (self-correlations) before plotting.
diag(cmat) <- 0
gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)
# Test-set evaluation: column 1 = observed labels, column 2 = predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestLM2Decorrelated$Labels,
    predict(LM2FilteredSVM_ml, dataTestLM2Decorrelated)
  ),
  "LM2 Decorrelated Filtered:SVM",
  cex = 0.97
)
par(op)
# Filtered SVM fitted on the RLM-decorrelated training set.
RLMFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainRLMDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Absolute Spearman correlation among the selected features, computed on the
# RLM training data (the data the model was fitted on) so this heatmap is
# comparable with the other heatmaps in this file, which use training rows.
cmat <- abs(
  cor(
    dataTrainRLMDecorrelatedc[, RLMFilteredSVM_ml$selectedfeatures],
    method = "spearman"
  )
)
# Zero the diagonal (self-correlations) before plotting.
diag(cmat) <- 0
gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)
# Test-set evaluation: column 1 = observed labels, column 2 = predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestRLMDecorrelated$Labels,
    predict(RLMFilteredSVM_ml, dataTestRLMDecorrelated)
  ),
  "RLM Decorrelated Filtered:SVM",
  cex = 0.97
)
par(op)
# Filtered SVM fitted on the LOESS-decorrelated training set.
LOESSFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Absolute Spearman correlation among the selected features. It must be
# computed on the LOESS training data the model was fitted on, not on the
# LM-decorrelated set, which is a different transformation of the features.
cmat <- abs(
  cor(
    dataTrainDecorrelatedc[, LOESSFilteredSVM_ml$selectedfeatures],
    method = "spearman"
  )
)
# Zero the diagonal (self-correlations) before plotting.
diag(cmat) <- 0
gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)
# Test-set evaluation: column 1 = observed labels, column 2 = predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestDecorrelated$Labels,
    predict(LOESSFilteredSVM_ml, dataTestDecorrelated)
  ),
  "LOESS Decorrelated Filtered:SVM",
  cex = 0.97
)
par(op)
# Filtered SVM fitted on the SPLINE-decorrelated training set.
SPLINEFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainSPLINEDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Absolute Spearman correlation among the selected features. It must be
# computed on the SPLINE training data the model was fitted on, not on the
# LM-decorrelated set, which is a different transformation of the features.
cmat <- abs(
  cor(
    dataTrainSPLINEDecorrelatedc[, SPLINEFilteredSVM_ml$selectedfeatures],
    method = "spearman"
  )
)
# Zero the diagonal (self-correlations) before plotting.
diag(cmat) <- 0
gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)
# Test-set evaluation: column 1 = observed labels, column 2 = predictions.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestSPLINEDecorrelated$Labels,
    predict(SPLINEFilteredSVM_ml, dataTestSPLINEDecorrelated)
  ),
  "SPLINE Decorrelated Filtered:SVM",
  cex = 0.97
)
par(op)
# Filtered SVM fitted on the MARS-decorrelated training set.
MARSFilteredSVM_ml <- filteredFit(
  Labels ~ .,
  dataTrainMARSDecorrelatedc,
  fitmethod = e1071::svm,
  filtermethod.control = list(pvalue = 0.05, limit = 150),
  Scale = "OrderLogit",
  probability = TRUE,
  scale = FALSE
)
# Absolute Spearman correlation among the selected features. It must be
# computed on the MARS training data the model was fitted on, not on the
# LM-decorrelated set, which is a different transformation of the features.
cmat <- abs(
  cor(
    dataTrainMARSDecorrelatedc[, MARSFilteredSVM_ml$selectedfeatures],
    method = "spearman"
  )
)
# Zero the diagonal (self-correlations) before plotting.
diag(cmat) <- 0
gplots::heatmap.2(
  cmat,
  trace = "none",
  mar = c(7, 13),
  main = "Selected Features",
  cexRow = 0.5,
  cexCol = 0.75
)
# Test-set evaluation: column 1 = observed labels, column 2 = predictions.
# `dataTestMARSecorrelated` (no "D") is the name under which the MARS test
# rows were stored earlier in this file.
bsFilteredSVM <- predictionStats_binary(
  cbind(
    dataTestMARSecorrelated$Labels,
    predict(MARSFilteredSVM_ml, dataTestMARSecorrelated)
  ),
  "MARS Decorrelated Filtered:SVM",
  cex = 0.97
)
par(op)