IDeA and the UPSTM

UPSTM Decorrelation on the multiple features Dataset

This document describes the use of the FRESA.CAD::IDeA() function that runs the feature correlation analysis (IDeA) algorithm on the mfeat data set:

https://archive.ics.uci.edu/ml/datasets/Multiple+Features

M. van Breukelen, R.P.W. Duin, D.M.J. Tax, and J.E. den Hartog, Handwritten digit recognition by combined classifiers, Kybernetika, vol. 34, no. 4, 1998, 381-386.

This script uses FRESA.CAD and mlbench R packages:

knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")

## Loading required package: Rcpp

## Loading required package: stringr

## Loading required package: miscTools

## Loading required package: Hmisc

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## Loading required package: pROC

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(mlbench)

op <- par(no.readonly = TRUE)

I’ll load the mfeat data set

mfeat <- read.delim("../Data/mfeat.txt", stringsAsFactors=TRUE)
mfeat$ID <- NULL
print(table(mfeat$Class))
#> 
#>   0   1   2   3   4   5   6   7   8   9 
#> 200 200 200 200 200 200 200 200 200 200

Setting some variables for downstream analysis

studyName = "mfeat"
datasetframe <- mfeat
Outcome <- "Class"

trainFraction = 0.50
correlationThreshold = 0.6
featnames <- colnames(datasetframe)[colnames(datasetframe) != Outcome]

Setting the Training and Testing sets


tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")

All
0	1	2	3	4	5	6	7	8	9
200	200	200	200	200	200	200	200	200	200

pander::pander(table(datasetframe_train[,Outcome]),caption="Training")

Training
0	1	2	3	4	5	6	7	8	9
100	100	100	100	100	100	100	100	100	100

pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")

Testing
0	1	2	3	4	5	6	7	8	9
100	100	100	100	100	100	100	100	100	100

IDeA with Default Parameters

The default parameters will compute the transformation matrix with a maximum correlation goal of 0.8 using fast matrix multiplication with Pearson correlation and linear models estimation.

Default Parameters: thr=0.80,method=“fast”,type=“LM”



system.time(decorrelatedDF <- IDeA(datasetframe_train))

user system elapsed 1.25 0.08 1.31 ### Returned Data Frame Specifications

The IDeA function returns a data frame with the following specifications for the column names:

The output basis after transformation will be named after the original names:

The feature name will have the “La_” prefix, if the feature was decorrelated by the algorithm. These features are the discovered latent variables

The Unlatered Features

## The unaltered features set 1: 
## 1) Uncorrelated to any other feature or
## 2) Factors or
## 3) features with not enough unique values.
pander::pander(colnames(decorrelatedDF)[colnames(decorrelatedDF) %in% colnames(datasetframe_train)])

Class, FAC_1, FAC_5, FAC_6, FAC_8, FAC_9, FAC_10, FAC_11, FAC_12, FAC_13, FAC_14, FAC_18, FAC_20, FAC_21, FAC_23, FAC_24, FAC_25, FAC_27, FAC_28, FAC_31, FAC_32, FAC_33, FAC_34, FAC_35, FAC_36, FAC_37, FAC_38, FAC_42, FAC_43, FAC_45, FAC_46, FAC_47, FAC_48, FAC_49, FAC_60, FAC_61, FAC_62, FAC_64, FAC_66, FAC_69, FAC_71, FAC_73, FAC_74, FAC_75, FAC_77, FAC_79, FAC_80, FAC_82, FAC_84, FAC_85, FAC_86, FAC_92, FAC_94, FAC_95, FAC_96, FAC_97, FAC_99, FAC_100, FAC_101, FAC_104, FAC_106, FAC_107, FAC_108, FAC_109, FAC_110, FAC_111, FAC_113, FAC_116, FAC_119, FAC_121, FAC_122, FAC_127, FAC_130, FAC_131, FAC_133, FAC_139, FAC_145, FAC_158, FAC_178, FAC_180, FAC_182, FAC_205, FAC_206, FAC_210, FAC_211, FAC_213, FAC_214, FAC_215, FO_1, FO_2, FO_3, FO_4, FO_5, FO_6, FO_7, FO_8, FO_9, FO_10, FO_11, FO_12, FO_13, FO_14, FO_15, FO_16, FO_17, FO_18, FO_19, FO_20, FO_21, FO_22, FO_23, FO_24, FO_25, FO_26, FO_27, FO_28, FO_29, FO_30, FO_31, FO_32, FO_33, FO_34, FO_35, FO_36, FO_37, FO_38, FO_39, FO_40, FO_41, FO_42, FO_43, FO_44, FO_45, FO_46, FO_47, FO_48, FO_49, FO_50, FO_51, FO_52, FO_53, FO_54, FO_55, FO_56, FO_57, FO_58, FO_59, FO_60, FO_61, FO_62, FO_63, FO_64, FO_65, FO_66, FO_67, FO_68, FO_69, FO_70, FO_71, FO_72, FO_73, FO_74, FO_75, FO_76, KAR_2, KAR_3, KAR_4, KAR_5, KAR_6, KAR_7, KAR_8, KAR_9, KAR_10, KAR_11, KAR_12, KAR_13, KAR_14, KAR_15, KAR_16, KAR_17, KAR_18, KAR_19, KAR_20, KAR_21, KAR_22, KAR_23, KAR_24, KAR_25, KAR_26, KAR_27, KAR_28, KAR_29, KAR_30, KAR_31, KAR_32, KAR_33, KAR_34, KAR_35, KAR_36, KAR_37, KAR_38, KAR_39, KAR_40, KAR_41, KAR_42, KAR_43, KAR_44, KAR_45, KAR_46, KAR_47, KAR_48, KAR_49, KAR_50, KAR_51, KAR_52, KAR_53, KAR_54, KAR_55, KAR_56, KAR_57, KAR_58, KAR_59, KAR_60, KAR_61, KAR_62, KAR_63, KAR_64, MOR_1, MOR_2, MOR_3, MOR_4, ZER_1, ZER_2, ZER_4, ZER_5, ZER_6, ZER_7, ZER_8, ZER_9, ZER_12, ZER_13, ZER_15, ZER_18, ZER_19, ZER_21, ZER_23, ZER_24, ZER_28, ZER_29, ZER_31, ZER_35, ZER_37, ZER_39, ZER_40, ZER_42 and ZER_43

The Latent Basis

## The discovered latent features. i.e. Decorrelated by the IDeA
pander::pander(colnames(decorrelatedDF)[str_detect(colnames(decorrelatedDF),"La_")])

La_FAC_2, La_FAC_3, La_FAC_4, La_FAC_7, La_FAC_15, La_FAC_16, La_FAC_17, La_FAC_19, La_FAC_22, La_FAC_26, La_FAC_29, La_FAC_30, La_FAC_39, La_FAC_40, La_FAC_41, La_FAC_44, La_FAC_50, La_FAC_51, La_FAC_52, La_FAC_53, La_FAC_54, La_FAC_55, La_FAC_56, La_FAC_57, La_FAC_58, La_FAC_59, La_FAC_63, La_FAC_65, La_FAC_67, La_FAC_68, La_FAC_70, La_FAC_72, La_FAC_76, La_FAC_78, La_FAC_81, La_FAC_83, La_FAC_87, La_FAC_88, La_FAC_89, La_FAC_90, La_FAC_91, La_FAC_93, La_FAC_98, La_FAC_102, La_FAC_103, La_FAC_105, La_FAC_112, La_FAC_114, La_FAC_115, La_FAC_117, La_FAC_118, La_FAC_120, La_FAC_123, La_FAC_124, La_FAC_125, La_FAC_126, La_FAC_128, La_FAC_129, La_FAC_132, La_FAC_134, La_FAC_135, La_FAC_136, La_FAC_137, La_FAC_138, La_FAC_140, La_FAC_141, La_FAC_142, La_FAC_143, La_FAC_144, La_FAC_146, La_FAC_147, La_FAC_148, La_FAC_149, La_FAC_150, La_FAC_151, La_FAC_152, La_FAC_153, La_FAC_154, La_FAC_155, La_FAC_156, La_FAC_157, La_FAC_159, La_FAC_160, La_FAC_161, La_FAC_162, La_FAC_163, La_FAC_164, La_FAC_165, La_FAC_166, La_FAC_167, La_FAC_168, La_FAC_169, La_FAC_170, La_FAC_171, La_FAC_172, La_FAC_173, La_FAC_174, La_FAC_175, La_FAC_176, La_FAC_177, La_FAC_179, La_FAC_181, La_FAC_183, La_FAC_184, La_FAC_185, La_FAC_186, La_FAC_187, La_FAC_188, La_FAC_189, La_FAC_190, La_FAC_191, La_FAC_192, La_FAC_193, La_FAC_194, La_FAC_195, La_FAC_196, La_FAC_197, La_FAC_198, La_FAC_199, La_FAC_200, La_FAC_201, La_FAC_202, La_FAC_203, La_FAC_204, La_FAC_207, La_FAC_208, La_FAC_209, La_FAC_212, La_FAC_216, La_KAR_1, La_MOR_5, La_MOR_6, La_ZER_3, La_ZER_10, La_ZER_11, La_ZER_14, La_ZER_16, La_ZER_17, La_ZER_20, La_ZER_22, La_ZER_25, La_ZER_26, La_ZER_27, La_ZER_30, La_ZER_32, La_ZER_33, La_ZER_34, La_ZER_36, La_ZER_38, La_ZER_41, La_ZER_44, La_ZER_45, La_ZER_46 and La_ZER_47

The Discovered Latent Variables

The returned decorrelated object can be analyzed by the getLatentCoefficients() function to return the set of discovered latent variables with their corresponding formula.

latentlist <- getLatentCoefficients(decorrelatedDF)
pander::pander(head(latentlist))

La_FAC_2:

FAC_2 FAC_38

1 0.7623
La_FAC_3:

FAC_3 FAC_111

1 -0.9091
La_FAC_4:

FAC_4 FAC_100

1 -0.8614
La_FAC_7:

FAC_7 FAC_111

1 -0.06728
La_FAC_15:

FAC_15 FAC_75

1 -0.8605
La_FAC_16:

FAC_16 FAC_100

1 -0.6102

FAC_2	FAC_38
1	0.7623

FAC_3	FAC_111
1	-0.9091

FAC_4	FAC_100
1	-0.8614

FAC_7	FAC_111
1	-0.06728

FAC_15	FAC_75
1	-0.8605

FAC_16	FAC_100
1	-0.6102

Attributes of the Returned Dataframe.

Furthermore, the returned data frame will have the following main attributes:

Transformation matrix: “UPSTM”
Feature Analysis:
1. “TotalAdjustments”
2. “fsocre”

UPSTM

The “UPSTM“ attribute stores the spatial transformation matrix. The matrix only includes continuous features that had some correlation greater than the user specified threshold



## The Spatial Transformation Matrix:
UPSTM <- attr(decorrelatedDF,"UPSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(UPSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("UPSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="UPSTM Feature", ylab="Input Feature")


pander::pander(t(colnames(UPSTM)),caption="New names of decorrelated matrix")

New names of decorrelated matrix (continued below)
La_FAC_2	La_FAC_3	La_FAC_4	FAC_5	FAC_6	La_FAC_7	FAC_8	FAC_9

Table continues below
FAC_10	FAC_11	FAC_12	FAC_13	FAC_14	La_FAC_15	La_FAC_16	La_FAC_17

Table continues below
FAC_18	La_FAC_19	FAC_20	La_FAC_22	FAC_23	FAC_24	La_FAC_26

Table continues below
La_FAC_29	La_FAC_30	FAC_32	FAC_33	FAC_34	FAC_35	FAC_36	FAC_38

Table continues below
La_FAC_39	La_FAC_40	La_FAC_41	La_FAC_44	FAC_45	FAC_46	FAC_47

Table continues below
FAC_49	La_FAC_50	La_FAC_51	La_FAC_52	La_FAC_53	La_FAC_54	La_FAC_55

Table continues below
La_FAC_56	La_FAC_57	La_FAC_58	La_FAC_59	FAC_60	FAC_61	La_FAC_63

Table continues below
FAC_64	La_FAC_65	FAC_66	La_FAC_67	La_FAC_68	FAC_69	La_FAC_70

Table continues below
FAC_71	La_FAC_72	FAC_75	La_FAC_76	FAC_77	La_FAC_78	FAC_79

Table continues below
La_FAC_81	FAC_82	La_FAC_83	FAC_84	FAC_86	La_FAC_87	La_FAC_88

Table continues below
La_FAC_89	La_FAC_90	La_FAC_91	FAC_92	La_FAC_93	FAC_94	FAC_96

Table continues below
FAC_97	La_FAC_98	FAC_99	FAC_100	FAC_101	La_FAC_102	La_FAC_103

Table continues below
FAC_104	La_FAC_105	FAC_106	FAC_107	FAC_109	FAC_111	La_FAC_112

Table continues below
FAC_113	La_FAC_114	La_FAC_115	La_FAC_117	La_FAC_118	FAC_119

Table continues below
La_FAC_120	La_FAC_123	La_FAC_124	La_FAC_125	La_FAC_126	FAC_127

Table continues below
La_FAC_128	La_FAC_129	FAC_130	La_FAC_132	La_FAC_134	La_FAC_135

Table continues below
La_FAC_136	La_FAC_137	La_FAC_138	FAC_139	La_FAC_140	La_FAC_141

Table continues below
La_FAC_142	La_FAC_143	La_FAC_144	La_FAC_146	La_FAC_147	La_FAC_148

Table continues below
La_FAC_149	La_FAC_150	La_FAC_151	La_FAC_152	La_FAC_153	La_FAC_154

Table continues below
La_FAC_155	La_FAC_156	La_FAC_157	FAC_158	La_FAC_159	La_FAC_160

Table continues below
La_FAC_161	La_FAC_162	La_FAC_163	La_FAC_164	La_FAC_165	La_FAC_166

Table continues below
La_FAC_167	La_FAC_168	La_FAC_169	La_FAC_170	La_FAC_171	La_FAC_172

Table continues below
La_FAC_173	La_FAC_174	La_FAC_175	La_FAC_176	La_FAC_177	La_FAC_179

Table continues below
FAC_180	La_FAC_181	La_FAC_183	La_FAC_184	La_FAC_185	La_FAC_186

Table continues below
La_FAC_187	La_FAC_188	La_FAC_189	La_FAC_190	La_FAC_191	La_FAC_192

Table continues below
La_FAC_193	La_FAC_194	La_FAC_195	La_FAC_196	La_FAC_197	La_FAC_198

Table continues below
La_FAC_199	La_FAC_200	La_FAC_201	La_FAC_202	La_FAC_203	La_FAC_204

Table continues below
La_FAC_207	La_FAC_208	La_FAC_209	La_FAC_212	La_FAC_216	La_KAR_1

Table continues below
MOR_4	La_MOR_5	La_MOR_6	ZER_2	La_ZER_3	ZER_4	ZER_9	La_ZER_10

Table continues below
La_ZER_11	La_ZER_14	ZER_15	La_ZER_16	La_ZER_17	La_ZER_20	ZER_21

Table continues below
La_ZER_22	La_ZER_25	La_ZER_26	La_ZER_27	La_ZER_30	ZER_31	La_ZER_32

Table continues below
La_ZER_33	La_ZER_34	ZER_35	La_ZER_36	La_ZER_38	ZER_39	ZER_40

La_ZER_41

ZER_42

La_ZER_44

La_ZER_45

La_ZER_46

La_ZER_47

The Feature Index Score

The IDeA analysis of the data features are stored in three attributes: “TotalAdjustments”, and “fscore”.

“TotalAdjustments” returns named vector of with the number of times a variable was decorrelated by the IDeA.
“fscore” : returns a named vector with the total feature score, \(F_j\), of the analyzed features. The score is equal to number of times the features was uses as independent feature minus the “TotalAdjustments”.


#pander::pander(t(attr(decorrelatedDF,"TotalAdjustments")),caption="Total Adjustments")

#pander::pander(t(attr(decorrelatedDF,"fscore")),caption="The Score")

totA <- attr(decorrelatedDF,"TotalAdjustments") 
totA <- totA[order(-totA)];
totA <- totA[totA>0];
barplot(totA,las=2,cex.names = 0.5)


fscore <- attr(decorrelatedDF,"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.5)

Testing Set Decorrelation

To decorrelate any new data set use the predictDecorrelate() function

decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)

Heat Maps of the Correlation Matrices

Here are the heat maps of the correlation matrices before and after decorrelation on the testing set



featnames <- rownames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")



featnames <- colnames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
cormat[is.na(cormat)] <- 0;
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")

IDeA with Options

The following code are examples of running IDeA() function with options:


datasetframeDecor<-list();

# With default parameters
datasetframeDecor[[1]] <- decorrelatedDF
decortype <- list();
decortype[[1]] <- "Default"


# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- IDeA(
  datasetframe_train,
  thr = correlationThreshold
  ))

user system elapsed 2.19 0.07 2.19


# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- IDeA(
  datasetframe_train,
  type="RLM",
  method="pearson"))

user system elapsed 6.92 0.22 7.04



# Change the maximum correlation goal, and change to Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- IDeA(
  datasetframe_train,
  type="LM",
  method="spearman"))

user system elapsed 5.48 0.11 5.50


# Change the maximum correlation goal, and set Spearman correlation with robust liner model
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- IDeA(
  datasetframe_train,
  type="RLM",
  method="spearman"))

user system elapsed 9.25 0.15 9.37



# The following are for outcome driven learning

# Set the target class for association learning
decortype[[6]] <- "Driven_Default"
system.time(datasetframeDecor[[6]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome))

user system elapsed 1.61 0.11 1.74



# Change the maximum correlation goal
decortype[[7]] <- "Driven_AtThr"
system.time(datasetframeDecor[[7]] <- IDeA(
  datasetframe_train,
  thr = correlationThreshold,
  Outcome=Outcome))

user system elapsed 2.56 0.13 2.55


# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[8]] <- "Driven_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

user system elapsed 7.31 0.09 7.25


# Change the maximum correlation goal, and change to Spearman correlation
decortype[[9]] <- "Driven_LM_Spearman"
system.time(datasetframeDecor[[9]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

user system elapsed 5.68 0.13 5.75


# Change the maximum correlation goal, and set to Spearman correlation with robust liner model
decortype[[10]] <- "Driven_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))

user system elapsed 8.37 0.07 8.42



# With user defined basis 
decortype[[11]] <- "Driven_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))

user system elapsed 8.92 0.04 8.89

Machine Learning and UPSTM

Train a simple NB model on the raw data set

mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=TRUE,
                     normalize=FALSE
                   )

Training using the decorrelated data

mNBDecor <- filteredFit(paste(Outcome,"~."),
                   decorrelatedDF,
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

Selected Raw Features


vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=45,
               xlab="Raw Features",
               ylab="Samples"
               )

Selected decorrelated Features


vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- decorrelatedDF;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=35,
               xlab="Decorrelated Features",
               ylab="Samples"
               )

To make predictions we need to transform the testing set. This is done using the FRESA.CAD::predictDecorrelate() function


# Transform the testing set
decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)

Once we have the transformed testing dataset we can make a side by side comparison of predictions


# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")

classNames <- as.character(classNames)
par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("UPSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}

Raw : 0 PCA : 0 UPSTM : 0 Raw : 1 PCA : 1 UPSTM : 1 Raw : 2 PCA : 2 UPSTM : 2 Raw : 3 PCA : 3 UPSTM : 3 Raw : 4 PCA : 4 UPSTM : 4 Raw : 5 PCA : 5 UPSTM : 5 Raw : 6 PCA : 6 UPSTM : 6 Raw : 7 PCA : 7 UPSTM : 7 Raw : 8 PCA : 8 UPSTM : 8 Raw : 9 PCA : 9 UPSTM : 9

meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

Training and Prediction on all Decorrelations Sets

par(mfrow=c(4,4))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.35)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

Default:0 Default:1 Default:2 Default:3 Default:4 Default:5 Default:6 Default:7 Default:8 Default:9 AtThr:0 AtThr:1 AtThr:2 AtThr:3 AtThr:4 AtThr:5 AtThr:6 AtThr:7 AtThr:8 AtThr:9 RLM_Pearson:0 RLM_Pearson:1 RLM_Pearson:2 RLM_Pearson:3 RLM_Pearson:4 RLM_Pearson:5 RLM_Pearson:6 RLM_Pearson:7 RLM_Pearson:8 RLM_Pearson:9 LM_Spearman:0 LM_Spearman:1 LM_Spearman:2 LM_Spearman:3 LM_Spearman:4 LM_Spearman:5 LM_Spearman:6 LM_Spearman:7 LM_Spearman:8 LM_Spearman:9 RLM_Spearman:0 RLM_Spearman:1 RLM_Spearman:2 RLM_Spearman:3 RLM_Spearman:4 RLM_Spearman:5 RLM_Spearman:6 RLM_Spearman:7 RLM_Spearman:8 RLM_Spearman:9 Driven_Default:0 Driven_Default:1 Driven_Default:2 Driven_Default:3 Driven_Default:4 Driven_Default:5 Driven_Default:6 Driven_Default:7 Driven_Default:8 Driven_Default:9 Driven_AtThr:0 Driven_AtThr:1 Driven_AtThr:2 Driven_AtThr:3 Driven_AtThr:4 Driven_AtThr:5 Driven_AtThr:6 Driven_AtThr:7 Driven_AtThr:8 Driven_AtThr:9 Driven_RLM_Pearson:0 Driven_RLM_Pearson:1 Driven_RLM_Pearson:2 Driven_RLM_Pearson:3 Driven_RLM_Pearson:4 Driven_RLM_Pearson:5 Driven_RLM_Pearson:6 Driven_RLM_Pearson:7 Driven_RLM_Pearson:8 Driven_RLM_Pearson:9 Driven_LM_Spearman:0 Driven_LM_Spearman:1 Driven_LM_Spearman:2 Driven_LM_Spearman:3 Driven_LM_Spearman:4 Driven_LM_Spearman:5 Driven_LM_Spearman:6 Driven_LM_Spearman:7 Driven_LM_Spearman:8 Driven_LM_Spearman:9 Driven_RLM_Spearman:0 Driven_RLM_Spearman:1 Driven_RLM_Spearman:2 Driven_RLM_Spearman:3 Driven_RLM_Spearman:4 Driven_RLM_Spearman:5 Driven_RLM_Spearman:6 Driven_RLM_Spearman:7 Driven_RLM_Spearman:8 Driven_RLM_Spearman:9 Driven_KS_RLM_Spearman:0 Driven_KS_RLM_Spearman:1 Driven_KS_RLM_Spearman:2 Driven_KS_RLM_Spearman:3 Driven_KS_RLM_Spearman:4 Driven_KS_RLM_Spearman:5 Driven_KS_RLM_Spearman:6 Driven_KS_RLM_Spearman:7 Driven_KS_RLM_Spearman:8 Driven_KS_RLM_Spearman:9

Final Plot Comparing the ROC AUC of all Options

par(mfrow=c(1,1))

rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)

	est	lower	upper
Raw	0.9981	0.9959	0.9999
PCA	0.9092	0.873	0.9455
Default	0.9981	0.9961	0.9999
AtThr	0.9982	0.9961	0.9999
RLM_Pearson	0.9977	0.9953	0.9998
LM_Spearman	0.9981	0.9961	0.9998
RLM_Spearman	0.9974	0.995	0.9996
Driven_Default	0.998	0.9959	0.9999
Driven_AtThr	0.998	0.9959	0.9999
Driven_RLM_Pearson	0.9973	0.9947	0.9997
Driven_LM_Spearman	0.9979	0.9957	0.9999
Driven_RLM_Spearman	0.997	0.9942	0.9995
Driven_KS_RLM_Spearman	0.9976	0.9951	0.9998

bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )