This document describes the use of the FRESA.CAD::IDeA() function that runs the feature correlation analysis (IDeA) algorithm on the mfeat data set:
https://archive.ics.uci.edu/ml/datasets/Multiple+Features
M. van Breukelen, R.P.W. Duin, D.M.J. Tax, and J.E. den Hartog, Handwritten digit recognition by combined classifiers, Kybernetika, vol. 34, no. 4, 1998, 381-386.
This script uses FRESA.CAD and mlbench R packages:
knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")
library("FRESA.CAD")
## Loading required package: Rcpp
## Loading required package: stringr
## Loading required package: miscTools
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(mlbench)
op <- par(no.readonly = TRUE)
I’ll load the mfeat data set
mfeat <- read.delim("../Data/mfeat.txt", stringsAsFactors=TRUE)
mfeat$ID <- NULL
print(table(mfeat$Class))
#>
#> 0 1 2 3 4 5 6 7 8 9
#> 200 200 200 200 200 200 200 200 200 200
Setting some variables for downstream analysis
studyName = "mfeat"
datasetframe <- mfeat
Outcome <- "Class"
trainFraction = 0.50
correlationThreshold = 0.6
featnames <- colnames(datasetframe)[colnames(datasetframe) != Outcome]
Setting the Training and Testing sets
tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])
allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)
trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
classSample <- allrowClass[allrowClass == theClass]
trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}
datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]
outcomes <- datasetframe_train[,Outcome]
pander::pander(table(datasetframe[,Outcome]),caption="All")
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|
| 200 | 200 | 200 | 200 | 200 | 200 | 200 | 200 | 200 | 200 |
pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|
| 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|
| 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
The default parameters will compute the transformation matrix with a maximum correlation goal of 0.8 using fast matrix multiplication with Pearson correlation and linear models estimation.
Default Parameters: thr=0.80,method=“fast”,type=“LM”
system.time(decorrelatedDF <- IDeA(datasetframe_train))
user system elapsed 1.25 0.08 1.31 ### Returned Data Frame Specifications
The IDeA function returns a data frame with the following specifications for the column names:
The output basis after transformation will be named after the original names:
## The unaltered features set 1:
## 1) Uncorrelated to any other feature or
## 2) Factors or
## 3) features with not enough unique values.
pander::pander(colnames(decorrelatedDF)[colnames(decorrelatedDF) %in% colnames(datasetframe_train)])
Class, FAC_1, FAC_5, FAC_6, FAC_8, FAC_9, FAC_10, FAC_11, FAC_12, FAC_13, FAC_14, FAC_18, FAC_20, FAC_21, FAC_23, FAC_24, FAC_25, FAC_27, FAC_28, FAC_31, FAC_32, FAC_33, FAC_34, FAC_35, FAC_36, FAC_37, FAC_38, FAC_42, FAC_43, FAC_45, FAC_46, FAC_47, FAC_48, FAC_49, FAC_60, FAC_61, FAC_62, FAC_64, FAC_66, FAC_69, FAC_71, FAC_73, FAC_74, FAC_75, FAC_77, FAC_79, FAC_80, FAC_82, FAC_84, FAC_85, FAC_86, FAC_92, FAC_94, FAC_95, FAC_96, FAC_97, FAC_99, FAC_100, FAC_101, FAC_104, FAC_106, FAC_107, FAC_108, FAC_109, FAC_110, FAC_111, FAC_113, FAC_116, FAC_119, FAC_121, FAC_122, FAC_127, FAC_130, FAC_131, FAC_133, FAC_139, FAC_145, FAC_158, FAC_178, FAC_180, FAC_182, FAC_205, FAC_206, FAC_210, FAC_211, FAC_213, FAC_214, FAC_215, FO_1, FO_2, FO_3, FO_4, FO_5, FO_6, FO_7, FO_8, FO_9, FO_10, FO_11, FO_12, FO_13, FO_14, FO_15, FO_16, FO_17, FO_18, FO_19, FO_20, FO_21, FO_22, FO_23, FO_24, FO_25, FO_26, FO_27, FO_28, FO_29, FO_30, FO_31, FO_32, FO_33, FO_34, FO_35, FO_36, FO_37, FO_38, FO_39, FO_40, FO_41, FO_42, FO_43, FO_44, FO_45, FO_46, FO_47, FO_48, FO_49, FO_50, FO_51, FO_52, FO_53, FO_54, FO_55, FO_56, FO_57, FO_58, FO_59, FO_60, FO_61, FO_62, FO_63, FO_64, FO_65, FO_66, FO_67, FO_68, FO_69, FO_70, FO_71, FO_72, FO_73, FO_74, FO_75, FO_76, KAR_2, KAR_3, KAR_4, KAR_5, KAR_6, KAR_7, KAR_8, KAR_9, KAR_10, KAR_11, KAR_12, KAR_13, KAR_14, KAR_15, KAR_16, KAR_17, KAR_18, KAR_19, KAR_20, KAR_21, KAR_22, KAR_23, KAR_24, KAR_25, KAR_26, KAR_27, KAR_28, KAR_29, KAR_30, KAR_31, KAR_32, KAR_33, KAR_34, KAR_35, KAR_36, KAR_37, KAR_38, KAR_39, KAR_40, KAR_41, KAR_42, KAR_43, KAR_44, KAR_45, KAR_46, KAR_47, KAR_48, KAR_49, KAR_50, KAR_51, KAR_52, KAR_53, KAR_54, KAR_55, KAR_56, KAR_57, KAR_58, KAR_59, KAR_60, KAR_61, KAR_62, KAR_63, KAR_64, MOR_1, MOR_2, MOR_3, MOR_4, ZER_1, ZER_2, ZER_4, ZER_5, ZER_6, ZER_7, ZER_8, ZER_9, ZER_12, ZER_13, ZER_15, ZER_18, ZER_19, ZER_21, ZER_23, ZER_24, ZER_28, ZER_29, ZER_31, ZER_35, ZER_37, ZER_39, ZER_40, ZER_42 and ZER_43
## The discovered latent features. i.e. Decorrelated by the IDeA
pander::pander(colnames(decorrelatedDF)[str_detect(colnames(decorrelatedDF),"La_")])
La_FAC_2, La_FAC_3, La_FAC_4, La_FAC_7, La_FAC_15, La_FAC_16, La_FAC_17, La_FAC_19, La_FAC_22, La_FAC_26, La_FAC_29, La_FAC_30, La_FAC_39, La_FAC_40, La_FAC_41, La_FAC_44, La_FAC_50, La_FAC_51, La_FAC_52, La_FAC_53, La_FAC_54, La_FAC_55, La_FAC_56, La_FAC_57, La_FAC_58, La_FAC_59, La_FAC_63, La_FAC_65, La_FAC_67, La_FAC_68, La_FAC_70, La_FAC_72, La_FAC_76, La_FAC_78, La_FAC_81, La_FAC_83, La_FAC_87, La_FAC_88, La_FAC_89, La_FAC_90, La_FAC_91, La_FAC_93, La_FAC_98, La_FAC_102, La_FAC_103, La_FAC_105, La_FAC_112, La_FAC_114, La_FAC_115, La_FAC_117, La_FAC_118, La_FAC_120, La_FAC_123, La_FAC_124, La_FAC_125, La_FAC_126, La_FAC_128, La_FAC_129, La_FAC_132, La_FAC_134, La_FAC_135, La_FAC_136, La_FAC_137, La_FAC_138, La_FAC_140, La_FAC_141, La_FAC_142, La_FAC_143, La_FAC_144, La_FAC_146, La_FAC_147, La_FAC_148, La_FAC_149, La_FAC_150, La_FAC_151, La_FAC_152, La_FAC_153, La_FAC_154, La_FAC_155, La_FAC_156, La_FAC_157, La_FAC_159, La_FAC_160, La_FAC_161, La_FAC_162, La_FAC_163, La_FAC_164, La_FAC_165, La_FAC_166, La_FAC_167, La_FAC_168, La_FAC_169, La_FAC_170, La_FAC_171, La_FAC_172, La_FAC_173, La_FAC_174, La_FAC_175, La_FAC_176, La_FAC_177, La_FAC_179, La_FAC_181, La_FAC_183, La_FAC_184, La_FAC_185, La_FAC_186, La_FAC_187, La_FAC_188, La_FAC_189, La_FAC_190, La_FAC_191, La_FAC_192, La_FAC_193, La_FAC_194, La_FAC_195, La_FAC_196, La_FAC_197, La_FAC_198, La_FAC_199, La_FAC_200, La_FAC_201, La_FAC_202, La_FAC_203, La_FAC_204, La_FAC_207, La_FAC_208, La_FAC_209, La_FAC_212, La_FAC_216, La_KAR_1, La_MOR_5, La_MOR_6, La_ZER_3, La_ZER_10, La_ZER_11, La_ZER_14, La_ZER_16, La_ZER_17, La_ZER_20, La_ZER_22, La_ZER_25, La_ZER_26, La_ZER_27, La_ZER_30, La_ZER_32, La_ZER_33, La_ZER_34, La_ZER_36, La_ZER_38, La_ZER_41, La_ZER_44, La_ZER_45, La_ZER_46 and La_ZER_47
The returned decorrelated object can be analyzed by the getLatentCoefficients() function to return the set of discovered latent variables with their corresponding formula.
latentlist <- getLatentCoefficients(decorrelatedDF)
pander::pander(head(latentlist))
La_FAC_2:
| FAC_2 | FAC_38 |
|---|---|
| 1 | 0.7623 |
La_FAC_3:
| FAC_3 | FAC_111 |
|---|---|
| 1 | -0.9091 |
La_FAC_4:
| FAC_4 | FAC_100 |
|---|---|
| 1 | -0.8614 |
La_FAC_7:
| FAC_7 | FAC_111 |
|---|---|
| 1 | -0.06728 |
La_FAC_15:
| FAC_15 | FAC_75 |
|---|---|
| 1 | -0.8605 |
La_FAC_16:
| FAC_16 | FAC_100 |
|---|---|
| 1 | -0.6102 |
Furthermore, the returned data frame will have the following main attributes:
Transformation matrix: “UPSTM”
Feature Analysis:
“TotalAdjustments”
“fsocre”
The “UPSTM“ attribute stores the spatial transformation matrix. The matrix only includes continuous features that had some correlation greater than the user specified threshold
## The Spatial Transformation Matrix:
UPSTM <- attr(decorrelatedDF,"UPSTM")
## The heatmap of the matrix
gplots::heatmap.2(1*(abs(UPSTM) > 0),
trace = "none",
mar = c(10,10),
col=rev(heat.colors(2)),
main = paste("UPSTM Matrix:",studyName),
cexRow = 0.7,
cexCol = 0.7,
breaks = c(0,0.5,1),
key.title=NA,
key.xlab="|beta| > 0",
xlab="UPSTM Feature", ylab="Input Feature")
pander::pander(t(colnames(UPSTM)),caption="New names of decorrelated matrix")
| La_FAC_2 | La_FAC_3 | La_FAC_4 | FAC_5 | FAC_6 | La_FAC_7 | FAC_8 | FAC_9 |
| FAC_10 | FAC_11 | FAC_12 | FAC_13 | FAC_14 | La_FAC_15 | La_FAC_16 | La_FAC_17 |
| FAC_18 | La_FAC_19 | FAC_20 | La_FAC_22 | FAC_23 | FAC_24 | La_FAC_26 |
| La_FAC_29 | La_FAC_30 | FAC_32 | FAC_33 | FAC_34 | FAC_35 | FAC_36 | FAC_38 |
| La_FAC_39 | La_FAC_40 | La_FAC_41 | La_FAC_44 | FAC_45 | FAC_46 | FAC_47 |
| FAC_49 | La_FAC_50 | La_FAC_51 | La_FAC_52 | La_FAC_53 | La_FAC_54 | La_FAC_55 |
| La_FAC_56 | La_FAC_57 | La_FAC_58 | La_FAC_59 | FAC_60 | FAC_61 | La_FAC_63 |
| FAC_64 | La_FAC_65 | FAC_66 | La_FAC_67 | La_FAC_68 | FAC_69 | La_FAC_70 |
| FAC_71 | La_FAC_72 | FAC_75 | La_FAC_76 | FAC_77 | La_FAC_78 | FAC_79 |
| La_FAC_81 | FAC_82 | La_FAC_83 | FAC_84 | FAC_86 | La_FAC_87 | La_FAC_88 |
| La_FAC_89 | La_FAC_90 | La_FAC_91 | FAC_92 | La_FAC_93 | FAC_94 | FAC_96 |
| FAC_97 | La_FAC_98 | FAC_99 | FAC_100 | FAC_101 | La_FAC_102 | La_FAC_103 |
| FAC_104 | La_FAC_105 | FAC_106 | FAC_107 | FAC_109 | FAC_111 | La_FAC_112 |
| FAC_113 | La_FAC_114 | La_FAC_115 | La_FAC_117 | La_FAC_118 | FAC_119 |
| La_FAC_120 | La_FAC_123 | La_FAC_124 | La_FAC_125 | La_FAC_126 | FAC_127 |
| La_FAC_128 | La_FAC_129 | FAC_130 | La_FAC_132 | La_FAC_134 | La_FAC_135 |
| La_FAC_136 | La_FAC_137 | La_FAC_138 | FAC_139 | La_FAC_140 | La_FAC_141 |
| La_FAC_142 | La_FAC_143 | La_FAC_144 | La_FAC_146 | La_FAC_147 | La_FAC_148 |
| La_FAC_149 | La_FAC_150 | La_FAC_151 | La_FAC_152 | La_FAC_153 | La_FAC_154 |
| La_FAC_155 | La_FAC_156 | La_FAC_157 | FAC_158 | La_FAC_159 | La_FAC_160 |
| La_FAC_161 | La_FAC_162 | La_FAC_163 | La_FAC_164 | La_FAC_165 | La_FAC_166 |
| La_FAC_167 | La_FAC_168 | La_FAC_169 | La_FAC_170 | La_FAC_171 | La_FAC_172 |
| La_FAC_173 | La_FAC_174 | La_FAC_175 | La_FAC_176 | La_FAC_177 | La_FAC_179 |
| FAC_180 | La_FAC_181 | La_FAC_183 | La_FAC_184 | La_FAC_185 | La_FAC_186 |
| La_FAC_187 | La_FAC_188 | La_FAC_189 | La_FAC_190 | La_FAC_191 | La_FAC_192 |
| La_FAC_193 | La_FAC_194 | La_FAC_195 | La_FAC_196 | La_FAC_197 | La_FAC_198 |
| La_FAC_199 | La_FAC_200 | La_FAC_201 | La_FAC_202 | La_FAC_203 | La_FAC_204 |
| La_FAC_207 | La_FAC_208 | La_FAC_209 | La_FAC_212 | La_FAC_216 | La_KAR_1 |
| MOR_4 | La_MOR_5 | La_MOR_6 | ZER_2 | La_ZER_3 | ZER_4 | ZER_9 | La_ZER_10 |
| La_ZER_11 | La_ZER_14 | ZER_15 | La_ZER_16 | La_ZER_17 | La_ZER_20 | ZER_21 |
| La_ZER_22 | La_ZER_25 | La_ZER_26 | La_ZER_27 | La_ZER_30 | ZER_31 | La_ZER_32 |
| La_ZER_33 | La_ZER_34 | ZER_35 | La_ZER_36 | La_ZER_38 | ZER_39 | ZER_40 |
| La_ZER_41 | ZER_42 | La_ZER_44 | La_ZER_45 | La_ZER_46 | La_ZER_47 |
The IDeA analysis of the data features are stored in three attributes: “TotalAdjustments”, and “fscore”.
“TotalAdjustments” returns named vector of with the number of times a variable was decorrelated by the IDeA.
“fscore” : returns a named vector with the total feature score, \(F_j\), of the analyzed features. The score is equal to number of times the features was uses as independent feature minus the “TotalAdjustments”.
#pander::pander(t(attr(decorrelatedDF,"TotalAdjustments")),caption="Total Adjustments")
#pander::pander(t(attr(decorrelatedDF,"fscore")),caption="The Score")
totA <- attr(decorrelatedDF,"TotalAdjustments")
totA <- totA[order(-totA)];
totA <- totA[totA>0];
barplot(totA,las=2,cex.names = 0.5)
fscore <- attr(decorrelatedDF,"fscore")
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.5)
To decorrelate any new data set use the predictDecorrelate() function
decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)
Here are the heat maps of the correlation matrices before and after decorrelation on the testing set
featnames <- rownames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = paste("Raw Correlation:",studyName),
cexRow = 0.75,
cexCol = 0.75,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
featnames <- colnames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
cormat[is.na(cormat)] <- 0;
gplots::heatmap.2(abs(cormat),
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = paste("After decorrelation:",studyName),
cexRow = 0.75,
cexCol = 0.75,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Transformed Feature", ylab="Transformed Feature")
The following code are examples of running IDeA() function with options:
datasetframeDecor<-list();
# With default parameters
datasetframeDecor[[1]] <- decorrelatedDF
decortype <- list();
decortype[[1]] <- "Default"
# Change the maximum correlation goal
decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- IDeA(
datasetframe_train,
thr = correlationThreshold
))
user system elapsed 2.19 0.07 2.19
# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- IDeA(
datasetframe_train,
type="RLM",
method="pearson"))
user system elapsed 6.92 0.22 7.04
# Change the maximum correlation goal, and change to Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- IDeA(
datasetframe_train,
type="LM",
method="spearman"))
user system elapsed 5.48 0.11 5.50
# Change the maximum correlation goal, and set Spearman correlation with robust liner model
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- IDeA(
datasetframe_train,
type="RLM",
method="spearman"))
user system elapsed 9.25 0.15 9.37
# The following are for outcome driven learning
# Set the target class for association learning
decortype[[6]] <- "Driven_Default"
system.time(datasetframeDecor[[6]] <- IDeA(
datasetframe_train,
Outcome=Outcome))
user system elapsed 1.61 0.11 1.74
# Change the maximum correlation goal
decortype[[7]] <- "Driven_AtThr"
system.time(datasetframeDecor[[7]] <- IDeA(
datasetframe_train,
thr = correlationThreshold,
Outcome=Outcome))
user system elapsed 2.56 0.13 2.55
# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[8]] <- "Driven_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- IDeA(
datasetframe_train,
Outcome=Outcome,
type="RLM",
method="pearson"))
user system elapsed 7.31 0.09 7.25
# Change the maximum correlation goal, and change to Spearman correlation
decortype[[9]] <- "Driven_LM_Spearman"
system.time(datasetframeDecor[[9]] <- IDeA(
datasetframe_train,
Outcome=Outcome,
type="LM",
method="spearman"))
user system elapsed 5.68 0.13 5.75
# Change the maximum correlation goal, and set to Spearman correlation with robust liner model
decortype[[10]] <- "Driven_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- IDeA(
datasetframe_train,
Outcome=Outcome,
type="RLM",
method="spearman"))
user system elapsed 8.37 0.07 8.42
# With user defined basis
decortype[[11]] <- "Driven_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
Outcome=Outcome,
pvalue=0.20,
limit=0,
thr=correlationThreshold))
system.time(datasetframeDecor[[11]] <- IDeA(
datasetframe_train,
Outcome=Outcome,
baseFeatures=baseKS,
type="RLM",
method="spearman"))
user system elapsed 8.92 0.04 8.89
Train a simple NB model on the raw data set
mNBRaw <- filteredFit(paste(Outcome,"~."),
datasetframe_train,
fitmethod=NAIVE_BAYES,
filtermethod=univariate_KS,
filtermethod.control=list(pvalue=0.05),
Scale="OrderLogit",
pca=FALSE
)
# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
datasetframe_train,
fitmethod=NAIVE_BAYES,
filtermethod=univariate_KS,
filtermethod.control=list(pvalue=0.05),
Scale="OrderLogit",
pca=TRUE,
normalize=FALSE
)
Training using the decorrelated data
mNBDecor <- filteredFit(paste(Outcome,"~."),
decorrelatedDF,
fitmethod=NAIVE_BAYES,
filtermethod=univariate_KS,
filtermethod.control=list(pvalue=0.05),
Scale="OrderLogit",
pca=FALSE
)
Selected Raw Features
vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
data=dta,
Outcome=Outcome,
hCluster="col",
srtCol=45,
xlab="Raw Features",
ylab="Samples"
)
Selected decorrelated Features
vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- decorrelatedDF;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
data=dta,
Outcome=Outcome,
hCluster="col",
srtCol=35,
xlab="Decorrelated Features",
ylab="Samples"
)
To make predictions we need to transform the testing set. This is done using the FRESA.CAD::predictDecorrelate() function
# Transform the testing set
decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)
Once we have the transformed testing dataset we can make a side by side comparison of predictions
# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")
# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")
# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")
classNames <- as.character(classNames)
par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
paste("Raw :",theClass),cex=0.75)
pander::pander(psRaw$aucs)
psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
paste("PCA :",theClass),cex=0.75)
pander::pander(psPCA$aucs)
psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
paste("UPSTM :",theClass),cex=0.75)
pander::pander(psDecor$aucs)
meanROCAUC <- meanROCAUC + psRaw$aucs;
meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}
Raw : 0 PCA : 0 UPSTM : 0
Raw
: 1 PCA : 1 UPSTM : 1
Raw
: 2 PCA : 2 UPSTM : 2
Raw
: 3 PCA : 3 UPSTM : 3
Raw
: 4 PCA : 4 UPSTM : 4
Raw
: 5 PCA : 5 UPSTM : 5
Raw
: 6 PCA : 6 UPSTM : 6
Raw
: 7 PCA : 7 UPSTM : 7
Raw
: 8 PCA : 8 UPSTM : 8
Raw
: 9 PCA : 9 UPSTM : 9
meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);
par(mfrow=c(4,4))
for (i in c(1:length(datasetframeDecor)))
{
mNBDecor <- filteredFit(paste(Outcome,"~."),
datasetframeDecor[[i]],
fitmethod=NAIVE_BAYES,
filtermethod=univariate_KS,
filtermethod.control=list(pvalue=0.05),
Scale="OrderLogit",
pca=FALSE
)
decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
prDecor <- attr(predict(mNBDecor,decor_test),"prob")
meanROCAUC <- numeric(3);
for (theClass in classNames)
{
classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
paste(decortype[[i]],theClass,sep=":"),cex=0.35)
meanROCAUC <- meanROCAUC + psDecor$aucs;
}
meanROCAUC <- meanROCAUC/length(classNames)
pander::pander(meanROCAUC)
AllRocAUC <- rbind(AllRocAUC,meanROCAUC)
}
Default:0 Default:1 Default:2 Default:3 Default:4 Default:5 Default:6
Default:7 Default:8 Default:9 AtThr:0 AtThr:1 AtThr:2 AtThr:3 AtThr:4
AtThr:5
AtThr:6
AtThr:7 AtThr:8 AtThr:9 RLM_Pearson:0 RLM_Pearson:1 RLM_Pearson:2
RLM_Pearson:3 RLM_Pearson:4 RLM_Pearson:5 RLM_Pearson:6 RLM_Pearson:7
RLM_Pearson:8 RLM_Pearson:9 LM_Spearman:0 LM_Spearman:1
LM_Spearman:2
LM_Spearman:3 LM_Spearman:4 LM_Spearman:5 LM_Spearman:6 LM_Spearman:7
LM_Spearman:8 LM_Spearman:9 RLM_Spearman:0 RLM_Spearman:1 RLM_Spearman:2
RLM_Spearman:3 RLM_Spearman:4 RLM_Spearman:5 RLM_Spearman:6
RLM_Spearman:7
RLM_Spearman:8
RLM_Spearman:9 Driven_Default:0 Driven_Default:1 Driven_Default:2
Driven_Default:3 Driven_Default:4 Driven_Default:5 Driven_Default:6
Driven_Default:7 Driven_Default:8 Driven_Default:9 Driven_AtThr:0
Driven_AtThr:1 Driven_AtThr:2 Driven_AtThr:3
Driven_AtThr:4
Driven_AtThr:5 Driven_AtThr:6 Driven_AtThr:7 Driven_AtThr:8
Driven_AtThr:9 Driven_RLM_Pearson:0 Driven_RLM_Pearson:1
Driven_RLM_Pearson:2 Driven_RLM_Pearson:3 Driven_RLM_Pearson:4
Driven_RLM_Pearson:5 Driven_RLM_Pearson:6 Driven_RLM_Pearson:7
Driven_RLM_Pearson:8 Driven_RLM_Pearson:9
Driven_LM_Spearman:0
Driven_LM_Spearman:1 Driven_LM_Spearman:2 Driven_LM_Spearman:3
Driven_LM_Spearman:4 Driven_LM_Spearman:5 Driven_LM_Spearman:6
Driven_LM_Spearman:7 Driven_LM_Spearman:8 Driven_LM_Spearman:9
Driven_RLM_Spearman:0 Driven_RLM_Spearman:1 Driven_RLM_Spearman:2
Driven_RLM_Spearman:3 Driven_RLM_Spearman:4 Driven_RLM_Spearman:5
Driven_RLM_Spearman:6
Driven_RLM_Spearman:7 Driven_RLM_Spearman:8 Driven_RLM_Spearman:9
Driven_KS_RLM_Spearman:0 Driven_KS_RLM_Spearman:1
Driven_KS_RLM_Spearman:2 Driven_KS_RLM_Spearman:3
Driven_KS_RLM_Spearman:4 Driven_KS_RLM_Spearman:5
Driven_KS_RLM_Spearman:6 Driven_KS_RLM_Spearman:7
Driven_KS_RLM_Spearman:8 Driven_KS_RLM_Spearman:9
par(mfrow=c(1,1))
rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
| est | lower | upper | |
|---|---|---|---|
| Raw | 0.9981 | 0.9959 | 0.9999 |
| PCA | 0.9092 | 0.873 | 0.9455 |
| Default | 0.9981 | 0.9961 | 0.9999 |
| AtThr | 0.9982 | 0.9961 | 0.9999 |
| RLM_Pearson | 0.9977 | 0.9953 | 0.9998 |
| LM_Spearman | 0.9981 | 0.9961 | 0.9998 |
| RLM_Spearman | 0.9974 | 0.995 | 0.9996 |
| Driven_Default | 0.998 | 0.9959 | 0.9999 |
| Driven_AtThr | 0.998 | 0.9959 | 0.9999 |
| Driven_RLM_Pearson | 0.9973 | 0.9947 | 0.9997 |
| Driven_LM_Spearman | 0.9979 | 0.9957 | 0.9999 |
| Driven_RLM_Spearman | 0.997 | 0.9942 | 0.9995 |
| Driven_KS_RLM_Spearman | 0.9976 | 0.9951 | 0.9998 |
bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
metricname = "ROCAUC",
thesets = "ROC AUC",
themethod = rownames(AllRocAUC),
main = "ROC AUC",
offsets = c(0.5,1),
scoreDirection = ">",
ho=0.5,
args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
col = terrain.colors(nrow(AllRocAUC))
)