UPSTM Decorrelation on the multiple features Dataset

This document describes the use of the FRESA.CAD::IDeA() function that runs the feature correlation analysis (IDeA) algorithm on the mfeat data set:

https://archive.ics.uci.edu/ml/datasets/Multiple+Features

M. van Breukelen, R.P.W. Duin, D.M.J. Tax, and J.E. den Hartog, Handwritten digit recognition by combined classifiers, Kybernetika, vol. 34, no. 4, 1998, 381-386.

This script uses FRESA.CAD and mlbench R packages:

knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")
## Loading required package: Rcpp
## Loading required package: stringr
## Loading required package: miscTools
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(mlbench)

op <- par(no.readonly = TRUE)

I’ll load the mfeat data set

mfeat <- read.delim("../Data/mfeat.txt", stringsAsFactors=TRUE)
mfeat$ID <- NULL
print(table(mfeat$Class))
#> 
#>   0   1   2   3   4   5   6   7   8   9 
#> 200 200 200 200 200 200 200 200 200 200

Setting some variables for downstream analysis

studyName = "mfeat"
datasetframe <- mfeat
Outcome <- "Class"

trainFraction = 0.50
correlationThreshold = 0.6
featnames <- colnames(datasetframe)[colnames(datasetframe) != Outcome]

Setting the Training and Testing sets


tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")
All
0 1 2 3 4 5 6 7 8 9
200 200 200 200 200 200 200 200 200 200
pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
Training
0 1 2 3 4 5 6 7 8 9
100 100 100 100 100 100 100 100 100 100
pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")
Testing
0 1 2 3 4 5 6 7 8 9
100 100 100 100 100 100 100 100 100 100

IDeA with Default Parameters

The default parameters will compute the transformation matrix with a maximum correlation goal of 0.8 using fast matrix multiplication with Pearson correlation and linear models estimation.

Default Parameters: thr=0.80,method=“fast”,type=“LM”



system.time(decorrelatedDF <- IDeA(datasetframe_train))

user system elapsed 1.25 0.08 1.31 ### Returned Data Frame Specifications

The IDeA function returns a data frame with the following specifications for the column names:

The output basis after transformation will be named after the original names:

The Unlatered Features

## The unaltered features set 1: 
## 1) Uncorrelated to any other feature or
## 2) Factors or
## 3) features with not enough unique values.
pander::pander(colnames(decorrelatedDF)[colnames(decorrelatedDF) %in% colnames(datasetframe_train)])

Class, FAC_1, FAC_5, FAC_6, FAC_8, FAC_9, FAC_10, FAC_11, FAC_12, FAC_13, FAC_14, FAC_18, FAC_20, FAC_21, FAC_23, FAC_24, FAC_25, FAC_27, FAC_28, FAC_31, FAC_32, FAC_33, FAC_34, FAC_35, FAC_36, FAC_37, FAC_38, FAC_42, FAC_43, FAC_45, FAC_46, FAC_47, FAC_48, FAC_49, FAC_60, FAC_61, FAC_62, FAC_64, FAC_66, FAC_69, FAC_71, FAC_73, FAC_74, FAC_75, FAC_77, FAC_79, FAC_80, FAC_82, FAC_84, FAC_85, FAC_86, FAC_92, FAC_94, FAC_95, FAC_96, FAC_97, FAC_99, FAC_100, FAC_101, FAC_104, FAC_106, FAC_107, FAC_108, FAC_109, FAC_110, FAC_111, FAC_113, FAC_116, FAC_119, FAC_121, FAC_122, FAC_127, FAC_130, FAC_131, FAC_133, FAC_139, FAC_145, FAC_158, FAC_178, FAC_180, FAC_182, FAC_205, FAC_206, FAC_210, FAC_211, FAC_213, FAC_214, FAC_215, FO_1, FO_2, FO_3, FO_4, FO_5, FO_6, FO_7, FO_8, FO_9, FO_10, FO_11, FO_12, FO_13, FO_14, FO_15, FO_16, FO_17, FO_18, FO_19, FO_20, FO_21, FO_22, FO_23, FO_24, FO_25, FO_26, FO_27, FO_28, FO_29, FO_30, FO_31, FO_32, FO_33, FO_34, FO_35, FO_36, FO_37, FO_38, FO_39, FO_40, FO_41, FO_42, FO_43, FO_44, FO_45, FO_46, FO_47, FO_48, FO_49, FO_50, FO_51, FO_52, FO_53, FO_54, FO_55, FO_56, FO_57, FO_58, FO_59, FO_60, FO_61, FO_62, FO_63, FO_64, FO_65, FO_66, FO_67, FO_68, FO_69, FO_70, FO_71, FO_72, FO_73, FO_74, FO_75, FO_76, KAR_2, KAR_3, KAR_4, KAR_5, KAR_6, KAR_7, KAR_8, KAR_9, KAR_10, KAR_11, KAR_12, KAR_13, KAR_14, KAR_15, KAR_16, KAR_17, KAR_18, KAR_19, KAR_20, KAR_21, KAR_22, KAR_23, KAR_24, KAR_25, KAR_26, KAR_27, KAR_28, KAR_29, KAR_30, KAR_31, KAR_32, KAR_33, KAR_34, KAR_35, KAR_36, KAR_37, KAR_38, KAR_39, KAR_40, KAR_41, KAR_42, KAR_43, KAR_44, KAR_45, KAR_46, KAR_47, KAR_48, KAR_49, KAR_50, KAR_51, KAR_52, KAR_53, KAR_54, KAR_55, KAR_56, KAR_57, KAR_58, KAR_59, KAR_60, KAR_61, KAR_62, KAR_63, KAR_64, MOR_1, MOR_2, MOR_3, MOR_4, ZER_1, ZER_2, ZER_4, ZER_5, ZER_6, ZER_7, ZER_8, ZER_9, ZER_12, ZER_13, ZER_15, ZER_18, ZER_19, ZER_21, ZER_23, ZER_24, ZER_28, ZER_29, ZER_31, ZER_35, ZER_37, ZER_39, ZER_40, ZER_42 and ZER_43

The Latent Basis

## The discovered latent features. i.e. Decorrelated by the IDeA
pander::pander(colnames(decorrelatedDF)[str_detect(colnames(decorrelatedDF),"La_")])

La_FAC_2, La_FAC_3, La_FAC_4, La_FAC_7, La_FAC_15, La_FAC_16, La_FAC_17, La_FAC_19, La_FAC_22, La_FAC_26, La_FAC_29, La_FAC_30, La_FAC_39, La_FAC_40, La_FAC_41, La_FAC_44, La_FAC_50, La_FAC_51, La_FAC_52, La_FAC_53, La_FAC_54, La_FAC_55, La_FAC_56, La_FAC_57, La_FAC_58, La_FAC_59, La_FAC_63, La_FAC_65, La_FAC_67, La_FAC_68, La_FAC_70, La_FAC_72, La_FAC_76, La_FAC_78, La_FAC_81, La_FAC_83, La_FAC_87, La_FAC_88, La_FAC_89, La_FAC_90, La_FAC_91, La_FAC_93, La_FAC_98, La_FAC_102, La_FAC_103, La_FAC_105, La_FAC_112, La_FAC_114, La_FAC_115, La_FAC_117, La_FAC_118, La_FAC_120, La_FAC_123, La_FAC_124, La_FAC_125, La_FAC_126, La_FAC_128, La_FAC_129, La_FAC_132, La_FAC_134, La_FAC_135, La_FAC_136, La_FAC_137, La_FAC_138, La_FAC_140, La_FAC_141, La_FAC_142, La_FAC_143, La_FAC_144, La_FAC_146, La_FAC_147, La_FAC_148, La_FAC_149, La_FAC_150, La_FAC_151, La_FAC_152, La_FAC_153, La_FAC_154, La_FAC_155, La_FAC_156, La_FAC_157, La_FAC_159, La_FAC_160, La_FAC_161, La_FAC_162, La_FAC_163, La_FAC_164, La_FAC_165, La_FAC_166, La_FAC_167, La_FAC_168, La_FAC_169, La_FAC_170, La_FAC_171, La_FAC_172, La_FAC_173, La_FAC_174, La_FAC_175, La_FAC_176, La_FAC_177, La_FAC_179, La_FAC_181, La_FAC_183, La_FAC_184, La_FAC_185, La_FAC_186, La_FAC_187, La_FAC_188, La_FAC_189, La_FAC_190, La_FAC_191, La_FAC_192, La_FAC_193, La_FAC_194, La_FAC_195, La_FAC_196, La_FAC_197, La_FAC_198, La_FAC_199, La_FAC_200, La_FAC_201, La_FAC_202, La_FAC_203, La_FAC_204, La_FAC_207, La_FAC_208, La_FAC_209, La_FAC_212, La_FAC_216, La_KAR_1, La_MOR_5, La_MOR_6, La_ZER_3, La_ZER_10, La_ZER_11, La_ZER_14, La_ZER_16, La_ZER_17, La_ZER_20, La_ZER_22, La_ZER_25, La_ZER_26, La_ZER_27, La_ZER_30, La_ZER_32, La_ZER_33, La_ZER_34, La_ZER_36, La_ZER_38, La_ZER_41, La_ZER_44, La_ZER_45, La_ZER_46 and La_ZER_47

The Discovered Latent Variables

The returned decorrelated object can be analyzed by the getLatentCoefficients() function to return the set of discovered latent variables with their corresponding formula.

latentlist <- getLatentCoefficients(decorrelatedDF)
pander::pander(head(latentlist))
  • La_FAC_2:

    FAC_2 FAC_38
    1 0.7623
  • La_FAC_3:

    FAC_3 FAC_111
    1 -0.9091
  • La_FAC_4:

    FAC_4 FAC_100
    1 -0.8614
  • La_FAC_7:

    FAC_7 FAC_111
    1 -0.06728
  • La_FAC_15:

    FAC_15 FAC_75
    1 -0.8605
  • La_FAC_16:

    FAC_16 FAC_100
    1 -0.6102

Attributes of the Returned Dataframe.

Furthermore, the returned data frame will have the following main attributes:

  1. Transformation matrix: “UPSTM

  2. Feature Analysis:

    1. TotalAdjustments

    2. fsocre

UPSTM

The “UPSTM attribute stores the spatial transformation matrix. The matrix only includes continuous features that had some correlation greater than the user specified threshold



## The Spatial Transformation Matrix:
UPSTM <- attr(decorrelatedDF,"UPSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(UPSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("UPSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="UPSTM Feature", ylab="Input Feature")


pander::pander(t(colnames(UPSTM)),caption="New names of decorrelated matrix")
New names of decorrelated matrix (continued below)
La_FAC_2 La_FAC_3 La_FAC_4 FAC_5 FAC_6 La_FAC_7 FAC_8 FAC_9
Table continues below
FAC_10 FAC_11 FAC_12 FAC_13 FAC_14 La_FAC_15 La_FAC_16 La_FAC_17
Table continues below
FAC_18 La_FAC_19 FAC_20 La_FAC_22 FAC_23 FAC_24 La_FAC_26
Table continues below
La_FAC_29 La_FAC_30 FAC_32 FAC_33 FAC_34 FAC_35 FAC_36 FAC_38
Table continues below
La_FAC_39 La_FAC_40 La_FAC_41 La_FAC_44 FAC_45 FAC_46 FAC_47
Table continues below
FAC_49 La_FAC_50 La_FAC_51 La_FAC_52 La_FAC_53 La_FAC_54 La_FAC_55
Table continues below
La_FAC_56 La_FAC_57 La_FAC_58 La_FAC_59 FAC_60 FAC_61 La_FAC_63
Table continues below
FAC_64 La_FAC_65 FAC_66 La_FAC_67 La_FAC_68 FAC_69 La_FAC_70
Table continues below
FAC_71 La_FAC_72 FAC_75 La_FAC_76 FAC_77 La_FAC_78 FAC_79
Table continues below
La_FAC_81 FAC_82 La_FAC_83 FAC_84 FAC_86 La_FAC_87 La_FAC_88
Table continues below
La_FAC_89 La_FAC_90 La_FAC_91 FAC_92 La_FAC_93 FAC_94 FAC_96
Table continues below
FAC_97 La_FAC_98 FAC_99 FAC_100 FAC_101 La_FAC_102 La_FAC_103
Table continues below
FAC_104 La_FAC_105 FAC_106 FAC_107 FAC_109 FAC_111 La_FAC_112
Table continues below
FAC_113 La_FAC_114 La_FAC_115 La_FAC_117 La_FAC_118 FAC_119
Table continues below
La_FAC_120 La_FAC_123 La_FAC_124 La_FAC_125 La_FAC_126 FAC_127
Table continues below
La_FAC_128 La_FAC_129 FAC_130 La_FAC_132 La_FAC_134 La_FAC_135
Table continues below
La_FAC_136 La_FAC_137 La_FAC_138 FAC_139 La_FAC_140 La_FAC_141
Table continues below
La_FAC_142 La_FAC_143 La_FAC_144 La_FAC_146 La_FAC_147 La_FAC_148
Table continues below
La_FAC_149 La_FAC_150 La_FAC_151 La_FAC_152 La_FAC_153 La_FAC_154
Table continues below
La_FAC_155 La_FAC_156 La_FAC_157 FAC_158 La_FAC_159 La_FAC_160
Table continues below
La_FAC_161 La_FAC_162 La_FAC_163 La_FAC_164 La_FAC_165 La_FAC_166
Table continues below
La_FAC_167 La_FAC_168 La_FAC_169 La_FAC_170 La_FAC_171 La_FAC_172
Table continues below
La_FAC_173 La_FAC_174 La_FAC_175 La_FAC_176 La_FAC_177 La_FAC_179
Table continues below
FAC_180 La_FAC_181 La_FAC_183 La_FAC_184 La_FAC_185 La_FAC_186
Table continues below
La_FAC_187 La_FAC_188 La_FAC_189 La_FAC_190 La_FAC_191 La_FAC_192
Table continues below
La_FAC_193 La_FAC_194 La_FAC_195 La_FAC_196 La_FAC_197 La_FAC_198
Table continues below
La_FAC_199 La_FAC_200 La_FAC_201 La_FAC_202 La_FAC_203 La_FAC_204
Table continues below
La_FAC_207 La_FAC_208 La_FAC_209 La_FAC_212 La_FAC_216 La_KAR_1
Table continues below
MOR_4 La_MOR_5 La_MOR_6 ZER_2 La_ZER_3 ZER_4 ZER_9 La_ZER_10
Table continues below
La_ZER_11 La_ZER_14 ZER_15 La_ZER_16 La_ZER_17 La_ZER_20 ZER_21
Table continues below
La_ZER_22 La_ZER_25 La_ZER_26 La_ZER_27 La_ZER_30 ZER_31 La_ZER_32
Table continues below
La_ZER_33 La_ZER_34 ZER_35 La_ZER_36 La_ZER_38 ZER_39 ZER_40
La_ZER_41 ZER_42 La_ZER_44 La_ZER_45 La_ZER_46 La_ZER_47

The Feature Index Score

The IDeA analysis of the data features are stored in three attributes: “TotalAdjustments”, and “fscore”.

  • TotalAdjustments” returns named vector of with the number of times a variable was decorrelated by the IDeA.

  • fscore” : returns a named vector with the total feature score, \(F_j\), of the analyzed features. The score is equal to number of times the features was uses as independent feature minus the “TotalAdjustments”.


#pander::pander(t(attr(decorrelatedDF,"TotalAdjustments")),caption="Total Adjustments")

#pander::pander(t(attr(decorrelatedDF,"fscore")),caption="The Score")

totA <- attr(decorrelatedDF,"TotalAdjustments") 
totA <- totA[order(-totA)];
totA <- totA[totA>0];
barplot(totA,las=2,cex.names = 0.5)


fscore <- attr(decorrelatedDF,"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.5)

Testing Set Decorrelation

To decorrelate any new data set use the predictDecorrelate() function

decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)

Heat Maps of the Correlation Matrices

Here are the heat maps of the correlation matrices before and after decorrelation on the testing set



featnames <- rownames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")



featnames <- colnames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
cormat[is.na(cormat)] <- 0;
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")

IDeA with Options

The following code are examples of running IDeA() function with options:


datasetframeDecor<-list();

# With default parameters
datasetframeDecor[[1]] <- decorrelatedDF
decortype <- list();
decortype[[1]] <- "Default"


# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- IDeA(
  datasetframe_train,
  thr = correlationThreshold
  ))

user system elapsed 2.19 0.07 2.19


# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- IDeA(
  datasetframe_train,
  type="RLM",
  method="pearson"))

user system elapsed 6.92 0.22 7.04



# Change the maximum correlation goal, and change to Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- IDeA(
  datasetframe_train,
  type="LM",
  method="spearman"))

user system elapsed 5.48 0.11 5.50


# Change the maximum correlation goal, and set Spearman correlation with robust liner model
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- IDeA(
  datasetframe_train,
  type="RLM",
  method="spearman"))

user system elapsed 9.25 0.15 9.37



# The following are for outcome driven learning

# Set the target class for association learning
decortype[[6]] <- "Driven_Default"
system.time(datasetframeDecor[[6]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome))

user system elapsed 1.61 0.11 1.74



# Change the maximum correlation goal
decortype[[7]] <- "Driven_AtThr"
system.time(datasetframeDecor[[7]] <- IDeA(
  datasetframe_train,
  thr = correlationThreshold,
  Outcome=Outcome))

user system elapsed 2.56 0.13 2.55


# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[8]] <- "Driven_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

user system elapsed 7.31 0.09 7.25


# Change the maximum correlation goal, and change to Spearman correlation
decortype[[9]] <- "Driven_LM_Spearman"
system.time(datasetframeDecor[[9]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

user system elapsed 5.68 0.13 5.75


# Change the maximum correlation goal, and set to Spearman correlation with robust liner model
decortype[[10]] <- "Driven_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))

user system elapsed 8.37 0.07 8.42



# With user defined basis 
decortype[[11]] <- "Driven_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))

user system elapsed 8.92 0.04 8.89

Machine Learning and UPSTM

Train a simple NB model on the raw data set

mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=TRUE,
                     normalize=FALSE
                   )

Training using the decorrelated data

mNBDecor <- filteredFit(paste(Outcome,"~."),
                   decorrelatedDF,
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

Selected Raw Features


vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=45,
               xlab="Raw Features",
               ylab="Samples"
               )

Selected decorrelated Features


vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- decorrelatedDF;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=35,
               xlab="Decorrelated Features",
               ylab="Samples"
               )

To make predictions we need to transform the testing set. This is done using the FRESA.CAD::predictDecorrelate() function


# Transform the testing set
decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)

Once we have the transformed testing dataset we can make a side by side comparison of predictions


# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")

classNames <- as.character(classNames)
par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("UPSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}

Raw : 0 PCA : 0 UPSTM : 0 Raw : 1 PCA : 1 UPSTM : 1 Raw : 2 PCA : 2 UPSTM : 2 Raw : 3 PCA : 3 UPSTM : 3 Raw : 4 PCA : 4 UPSTM : 4 Raw : 5 PCA : 5 UPSTM : 5 Raw : 6 PCA : 6 UPSTM : 6 Raw : 7 PCA : 7 UPSTM : 7 Raw : 8 PCA : 8 UPSTM : 8 Raw : 9 PCA : 9 UPSTM : 9

meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

Training and Prediction on all Decorrelations Sets

par(mfrow=c(4,4))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.35)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

Default:0 Default:1 Default:2 Default:3 Default:4 Default:5 Default:6 Default:7 Default:8 Default:9 AtThr:0 AtThr:1 AtThr:2 AtThr:3 AtThr:4 AtThr:5 AtThr:6 AtThr:7 AtThr:8 AtThr:9 RLM_Pearson:0 RLM_Pearson:1 RLM_Pearson:2 RLM_Pearson:3 RLM_Pearson:4 RLM_Pearson:5 RLM_Pearson:6 RLM_Pearson:7 RLM_Pearson:8 RLM_Pearson:9 LM_Spearman:0 LM_Spearman:1 LM_Spearman:2 LM_Spearman:3 LM_Spearman:4 LM_Spearman:5 LM_Spearman:6 LM_Spearman:7 LM_Spearman:8 LM_Spearman:9 RLM_Spearman:0 RLM_Spearman:1 RLM_Spearman:2 RLM_Spearman:3 RLM_Spearman:4 RLM_Spearman:5 RLM_Spearman:6 RLM_Spearman:7 RLM_Spearman:8 RLM_Spearman:9 Driven_Default:0 Driven_Default:1 Driven_Default:2 Driven_Default:3 Driven_Default:4 Driven_Default:5 Driven_Default:6 Driven_Default:7 Driven_Default:8 Driven_Default:9 Driven_AtThr:0 Driven_AtThr:1 Driven_AtThr:2 Driven_AtThr:3 Driven_AtThr:4 Driven_AtThr:5 Driven_AtThr:6 Driven_AtThr:7 Driven_AtThr:8 Driven_AtThr:9 Driven_RLM_Pearson:0 Driven_RLM_Pearson:1 Driven_RLM_Pearson:2 Driven_RLM_Pearson:3 Driven_RLM_Pearson:4 Driven_RLM_Pearson:5 Driven_RLM_Pearson:6 Driven_RLM_Pearson:7 Driven_RLM_Pearson:8 Driven_RLM_Pearson:9 Driven_LM_Spearman:0 Driven_LM_Spearman:1 Driven_LM_Spearman:2 Driven_LM_Spearman:3 Driven_LM_Spearman:4 Driven_LM_Spearman:5 Driven_LM_Spearman:6 Driven_LM_Spearman:7 Driven_LM_Spearman:8 Driven_LM_Spearman:9 Driven_RLM_Spearman:0 Driven_RLM_Spearman:1 Driven_RLM_Spearman:2 Driven_RLM_Spearman:3 Driven_RLM_Spearman:4 Driven_RLM_Spearman:5 Driven_RLM_Spearman:6 Driven_RLM_Spearman:7 Driven_RLM_Spearman:8 Driven_RLM_Spearman:9 Driven_KS_RLM_Spearman:0 Driven_KS_RLM_Spearman:1 Driven_KS_RLM_Spearman:2 Driven_KS_RLM_Spearman:3 Driven_KS_RLM_Spearman:4 Driven_KS_RLM_Spearman:5 Driven_KS_RLM_Spearman:6 Driven_KS_RLM_Spearman:7 Driven_KS_RLM_Spearman:8 Driven_KS_RLM_Spearman:9

Final Plot Comparing the ROC AUC of all Options

par(mfrow=c(1,1))

rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
  est lower upper
Raw 0.9981 0.9959 0.9999
PCA 0.9092 0.873 0.9455
Default 0.9981 0.9961 0.9999
AtThr 0.9982 0.9961 0.9999
RLM_Pearson 0.9977 0.9953 0.9998
LM_Spearman 0.9981 0.9961 0.9998
RLM_Spearman 0.9974 0.995 0.9996
Driven_Default 0.998 0.9959 0.9999
Driven_AtThr 0.998 0.9959 0.9999
Driven_RLM_Pearson 0.9973 0.9947 0.9997
Driven_LM_Spearman 0.9979 0.9957 0.9999
Driven_RLM_Spearman 0.997 0.9942 0.9995
Driven_KS_RLM_Spearman 0.9976 0.9951 0.9998
bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )