GDSTM Decorrelation on the Multiple Features Dataset

This document describes the use of the FRESA.CAD::GDSTMDecorrelation() function that runs the feature correlation analysis (FCA) algorithm on the mfeat data set:

https://archive.ics.uci.edu/ml/datasets/Multiple+Features

M. van Breukelen, R.P.W. Duin, D.M.J. Tax, and J.E. den Hartog, Handwritten digit recognition by combined classifiers, Kybernetika, vol. 34, no. 4, 1998, 381-386.

This script uses the FRESA.CAD and mlbench R packages:

knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")
Loading required package: Rcpp
Loading required package: stringr
Loading required package: miscTools
Loading required package: Hmisc
Loading required package: lattice
Loading required package: survival
Loading required package: Formula
Loading required package: ggplot2

Attaching package: ‘Hmisc’

The following objects are masked from ‘package:base’:

    format.pval, units

Loading required package: pROC
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var
library(mlbench)

op <- par(no.readonly = TRUE)

I’ll load the mfeat data set:

mfeat <- read.delim("../Data/mfeat.txt", stringsAsFactors=TRUE)
mfeat$ID <- NULL
print(table(mfeat$Class))

  0   1   2   3   4   5   6   7   8   9 
200 200 200 200 200 200 200 200 200 200 

Setting some variables for downstream analysis

studyName = "mfeat"
datasetframe <- mfeat
Outcome <- "Class"

trainFraction = 0.50
correlationThreshold = 0.6
featnames <- colnames(datasetframe)[colnames(datasetframe) != Outcome]

Setting the Training and Testing sets


tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")
All
0 1 2 3 4 5 6 7 8 9
200 200 200 200 200 200 200 200 200 200
pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
Training
0 1 2 3 4 5 6 7 8 9
100 100 100 100 100 100 100 100 100 100
pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")
Testing
0 1 2 3 4 5 6 7 8 9
100 100 100 100 100 100 100 100 100 100

FCA with default parameters

The default parameters compute the transformation matrix with a maximum correlation goal of 0.8, using fast matrix-multiplication Pearson correlation and linear-model estimation.

Default parameters: thr = 0.80, method = "fast", type = "LM"



datasetframeDecor<-list();
decortype <- list();
system.time(datasetframeDecor[[1]] <- GDSTMDecorrelation(datasetframe_train))

   user  system elapsed 
   1.27    0.11    1.11

decortype[[1]] <- "Default"

Transforming the testing set

To make test predictions we need to transform the testing set. This is done with the FRESA.CAD::predictDecorrelate() function.


# Transform the testing set
decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

Heat maps of the correlation matrices

Here are the heat maps of the correlation matrices before and after decorrelation on the testing set.



featnames <- attr(datasetframeDecor[[1]],"varincluded")
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")



featnames <- colnames(attr(datasetframeDecor[[1]],"GDSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
cormat[is.na(cormat)] <- 0;
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")


The GDSTMDecorrelation function returns a data frame whose columns are named after the original features:

  • The name is unaltered if the feature’s maximum correlation to the other features was lower than the threshold.

  • The name gets the “Ba_” prefix if the feature was correlated but kept as an unaltered basis.

  • The name gets the “De_” prefix if the feature was originally correlated and its correlation to the “Ba_” features has been removed.

Furthermore, the returned data frame has the following attributes:

  1. Transformation matrix: “GDSTM”

  2. Features:

    1. fscore

    2. varincluded

    3. topFeatures

  3. Unaltered basis:

    1. baseFeatures

      1. correlatedToBase

    2. AbaseFeatures
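
As a quick check of the naming convention above, we can count the columns in each group (a minimal sketch; the patterns simply follow the prefixes listed above):

decorFrame <- datasetframeDecor[[1]]
basisNames <- grep("^Ba_", colnames(decorFrame), value = TRUE)
decorNames <- grep("^De_", colnames(decorFrame), value = TRUE)
unaltered  <- setdiff(colnames(decorFrame), c(basisNames, decorNames))
print(c(basis = length(basisNames), decorrelated = length(decorNames), unaltered = length(unaltered)))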

GDSTM

The “GDSTM” attribute stores the spatial transformation matrix. The matrix includes only the continuous features whose correlation with at least one other feature was greater than the threshold.



## The Spatial Transformation Matrix:
GDSTM <- attr(datasetframeDecor[[1]],"GDSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(GDSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("GDSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="GDSTM Feature", ylab="Input Feature")


The Feature Index Score

The results of the FCA analysis of the data features are stored in three attributes: “varincluded”, “topFeatures”, and “fscore”.

  • “varincluded” returns the list of continuous features that were decorrelated

  • “topFeatures” returns the features that at some point were used as independent variables inside the linear models.

  • “fscore” : returns a named vector with the total feature score, \(F_j\), of the analyzed features.

\[ F_j=\sum_{n}\sum_{i\in B^{n}_{j}}|\rho_{i,j}|^2\,(|\rho_{i,j}|>\rho_{th}),~ \forall j \in Ind \]

\[ F_j=F_j-\sum_{n}|\rho_{B^n_j,j}|^2,~ \forall j \in Dep \]

where \(B^n_j\) is the set of features statistically associated with feature j at iteration n, \(\rho_{i,j}\) is the correlation between features i and j, \(\rho_{th}\) is the correlation goal, and \(\{Ind, Dep\}\) are the sets of independent and dependent features, respectively. In other words, the “fscore” indicates the degree of total association of “independent” features to dependent features.



fscore <- attr(datasetframeDecor[[1]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6)

Machine Learning and GDSTM

Train a simple naive Bayes (NB) model on the raw data set:

mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_Wilcoxon,
                     filtermethod.control=list(pvalue=0.05,limit= 50),
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_Wilcoxon,
                     filtermethod.control=list(pvalue=0.05,limit= 50),
                     pca=TRUE
                   )

Now train the same NB model using the decorrelated data:

mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[1]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_Wilcoxon,
                     filtermethod.control=list(pvalue=0.05,limit= 50),
                     pca=FALSE
                   )

The testing set predictions comparison


# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")


par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
classNames <- as.character(classNames)
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("GDSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}

Per-class test ROC AUC, est [lower, upper]:

Class   Raw                       PCA                        GDSTM
0       1      [1, 1]             0.9922 [0.9829, 1]         1      [1, 1]
1       0.9988 [0.9978, 0.9999]   0.9869 [0.9733, 1]         0.9993 [0.9987, 1]
2       0.9998 [0.9994, 1]        0.9926 [0.9864, 0.9987]    0.9997 [0.9993, 1]
3       0.9943 [0.9855, 1]        0.9869 [0.9771, 0.9968]    0.9987 [0.9974, 1]
4       0.9995 [0.999, 1]         0.9771 [0.9531, 1]         0.9997 [0.9993, 1]
5       0.999  [0.9979, 1]        0.9799 [0.9593, 1]         0.9988 [0.9973, 1]
6       0.9999 [0.9998, 1]        0.9844 [0.9731, 0.9957]    0.9999 [0.9997, 1]
7       0.9992 [0.9984, 1]        0.9804 [0.9606, 1]         0.998  [0.9945, 1]
8       1      [1, 1]             0.9914 [0.98, 1]           1      [0.9999, 1]
9       0.9998 [0.9995, 1]        0.9529 [0.922, 0.9838]     0.9994 [0.9986, 1]

meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

Comparing FCA option predictions

The following code runs the function with several of the available options:



# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold
  ))

   user  system elapsed 
   2.10    0.12    2.05

# Robust linear models (RLM) with Pearson correlation
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="pearson"))

   user  system elapsed 
   7.59    0.11    7.68

# Linear models with Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="LM",
  method="spearman"))

   user  system elapsed 
   2.94    0.11    2.87

# Robust linear models with Spearman correlation
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="spearman"))

   user  system elapsed 
   8.11    0.05    8.27

# The following are for supervised basis learning

# Set the target class for association learning
decortype[[6]] <- "Sup_Default"
system.time(datasetframeDecor[[6]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome))

   user  system elapsed 
   1.48    0.08    1.44

# Supervised, with the lower correlation goal
decortype[[7]] <- "Sup_AtThr"
system.time(datasetframeDecor[[7]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  thr = correlationThreshold))

   user  system elapsed 
   1.13    0.06    1.19

# Supervised: robust linear models (RLM) with Pearson correlation
decortype[[8]] <- "Sup_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

   user  system elapsed 
   8.05    0.07    7.96

# Supervised: linear models with Spearman correlation
decortype[[9]] <- "Sup_LM_Spearman"
system.time(datasetframeDecor[[9]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

   user  system elapsed 
   4.03    0.09    3.94

# Supervised: robust linear models with Spearman correlation
decortype[[10]] <- "Sup_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))

   user  system elapsed 
   8.58    0.16    8.67

# With user defined supervised basis 
decortype[[11]] <- "Sup_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))

   user  system elapsed 
  10.21    0.11   10.31

Predict on all sets of transformations

par(mfrow=c(2,5))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_Wilcoxon,
                     filtermethod.control=list(pvalue=0.05,limit= 50),
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.5)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

(Per-class ROC plots for each transformation omitted; the mean test ROC AUC of each transformation is summarized below.)

Mean test ROC AUC, est [lower, upper]:

Default               0.9993 [0.9985, 1]
AtThr                 0.9991 [0.9981, 1]
RLM_Pearson           0.9994 [0.9987, 1]
LM_Spearman           0.9992 [0.9982, 1]
RLM_Spearman          0.9987 [0.9975, 0.9999]
Sup_Default           0.9996 [0.9991, 1]
Sup_AtThr             0.9996 [0.9991, 1]
Sup_RLM_Pearson       0.9989 [0.9975, 1]
Sup_LM_Spearman       0.9991 [0.9981, 0.9999]
Sup_RLM_Spearman      0.9992 [0.998, 1]
Sup_KS_RLM_Spearman   0.9986 [0.9968, 1]

The bar plot of all ROC AUCs


rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
  est lower upper
Raw 0.999 0.9977 1
PCA 0.9825 0.9668 0.9975
Default 0.9993 0.9985 1
AtThr 0.9991 0.9981 1
RLM_Pearson 0.9994 0.9987 1
LM_Spearman 0.9992 0.9982 1
RLM_Spearman 0.9987 0.9975 0.9999
Sup_Default 0.9996 0.9991 1
Sup_AtThr 0.9996 0.9991 1
Sup_RLM_Pearson 0.9989 0.9975 1
Sup_LM_Spearman 0.9991 0.9981 0.9999
Sup_RLM_Spearman 0.9992 0.998 1
Sup_KS_RLM_Spearman 0.9986 0.9968 1
par(mfrow=c(1,1))

bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )

---
title: "FCA and the GDSTM"
output: html_notebook
---

## GDSTM Decorrelation on the Multiple Features Dataset

This document describes the use of the **FRESA.CAD::GDSTMDecorrelation()** function that runs the feature correlation analysis (**FCA**) algorithm on the mfeat data set:

> <https://archive.ics.uci.edu/ml/datasets/Multiple+Features>
>
> M. van Breukelen, R.P.W. Duin, D.M.J. Tax, and J.E. den Hartog, Handwritten digit recognition by combined classifiers, Kybernetika, vol. 34, no. 4, 1998, 381-386.

This script uses the FRESA.CAD and mlbench R packages:

```{r functions,echo = TRUE }
knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")
library(mlbench)

op <- par(no.readonly = TRUE)

```

I'll load the mfeat data set:

```{r}
mfeat <- read.delim("../Data/mfeat.txt", stringsAsFactors=TRUE)
mfeat$ID <- NULL
print(table(mfeat$Class))


```

Setting some variables for downstream analysis

```{r}
studyName = "mfeat"
datasetframe <- mfeat
Outcome <- "Class"

trainFraction = 0.50
correlationThreshold = 0.6
featnames <- colnames(datasetframe)[colnames(datasetframe) != Outcome]

```

Setting the Training and Testing sets

```{r, results = "asis", dpi=600, fig.height= 6.0, fig.width= 8.0}

tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")
pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")


```

## FCA with default parameters

The default parameters compute the transformation matrix with a maximum correlation goal of 0.8, using fast matrix-multiplication Pearson correlation and linear-model estimation.

Default parameters: thr = 0.80, method = "fast", type = "LM"
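
For reference, the same call with the defaults spelled out explicitly would look like this (not evaluated; the parameter names are taken from the defaults listed above):

```{r, eval=FALSE}
# Equivalent to GDSTMDecorrelation(datasetframe_train) with the documented defaults
GDSTMDecorrelation(datasetframe_train,
                   thr = 0.80,       # maximum correlation goal
                   method = "fast",  # fast matrix-multiplication Pearson correlation
                   type = "LM")      # linear-model estimation
```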

```{r, results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


datasetframeDecor<-list();
decortype <- list();
system.time(datasetframeDecor[[1]] <- GDSTMDecorrelation(datasetframe_train))
decortype[[1]] <- "Default"


```

### Transforming the testing set

To make test predictions we need to transform the testing set. This is done with the **FRESA.CAD::predictDecorrelate()** function.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

# Transform the testing set
decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

```

## Heat maps of the correlation matrices

Here are the heat maps of the correlation matrices before and after decorrelation on the testing set.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


featnames <- attr(datasetframeDecor[[1]],"varincluded")
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")


featnames <- colnames(attr(datasetframeDecor[[1]],"GDSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
cormat[is.na(cormat)] <- 0;
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")


```

The **GDSTMDecorrelation** function returns a data frame whose columns are named after the original features:

-   The name is unaltered if the feature's maximum correlation to the other features was lower than the threshold.

-   The name gets the "Ba\_" prefix if the feature was correlated but kept as an unaltered basis.

-   The name gets the "De\_" prefix if the feature was originally correlated and its correlation to the "Ba\_" features has been removed.

Furthermore, the returned data frame will have the following attributes:

1)  Transformation matrix: "*GDSTM*"

2)  Features:

    1.  "*fsocre*"

    2.  "*varincluded*"

    3.  "*topFeatures*"

3)  Unaltered Basis:

    1.  "*baseFeatures*"

        1.  "*correlatedToBase*"

    2.  "*AbaseFeatures*"

### GDSTM

The "*GDSTM***"** attribute stores the spatial transformation matrix. The matrix only includes continuous features that had some correlation greater than the threshold

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


## The Spatial Transformation Matrix:
GDSTM <- attr(datasetframeDecor[[1]],"GDSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(GDSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("GDSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="GDSTM Feature", ylab="Input Feature")



```
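
The same sparsity pattern can be summarized numerically (a quick sketch; the row/column orientation is assumed from the axis labels of the heat map above):

```{r}
# Sketch: numeric summary of the GDSTM sparsity shown above.
# Assumed orientation (from the heat-map labels): rows = input features, columns = GDSTM features.
cat("GDSTM size:", nrow(GDSTM), "x", ncol(GDSTM), "\n")
cat("Fraction of non-zero coefficients:", round(mean(abs(GDSTM) > 0), 3), "\n")
print(summary(colSums(abs(GDSTM) > 0)))  # non-zero inputs per transformed feature
```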

### The Feature Index Score

The results of the **FCA** analysis of the data features are stored in three attributes: "*varincluded*", "*topFeatures*", and "*fscore*".

-   "varincluded" returns the list of continuous features that were decorrelated

-   "topFeatures" returns the features that at some point were used as independent variables inside the linear models.

-   "fscore" : returns a named vector with the total feature score, $F_j$, of the analyzed features.

$$
F_j=\sum_{n}\sum_{i\in B^{n}_{j}}|\rho_{i,j}|^2\,(|\rho_{i,j}|>\rho_{th}),~ \forall j \in Ind
$$

$$
F_j=F_j-\sum_{n}|\rho_{B^n_j,j}|^2,~ \forall j \in Dep
$$

where $B^n_j$ is the set of features statistically associated with feature *j* at iteration *n*, $\rho_{i,j}$ is the correlation between features *i* and *j*, $\rho_{th}$ is the correlation goal, and $\{Ind, Dep\}$ are the sets of independent and dependent features, respectively. In other words, the "*fscore*" indicates the degree of total association of "independent" features to dependent features.
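
As a rough illustration of the independent-feature part of this score, a single-iteration version can be computed directly from the training correlations (a toy sketch only, not the internal FRESA.CAD computation):

```{r}
# Toy sketch: a one-iteration analogue of F_j for the "independent" side,
# i.e. the sum of squared correlations over pairs above the correlation goal.
included <- attr(datasetframeDecor[[1]], "varincluded")
rho <- cor(datasetframe_train[, included], method = "pearson")
diag(rho) <- 0                       # ignore self-correlation
rho_th <- 0.8                        # the default correlation goal
toyScore <- apply(rho, 2, function(r) sum(r^2 * (abs(r) > rho_th)))
head(sort(toyScore, decreasing = TRUE))
```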

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


fscore <- attr(datasetframeDecor[[1]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6)

```

## Machine Learning and GDSTM

Train a simple naive Bayes (NB) model on the raw data set:

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_Wilcoxon,
                     filtermethod.control=list(pvalue=0.05,limit= 50),
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_Wilcoxon,
                     filtermethod.control=list(pvalue=0.05,limit= 50),
                     pca=TRUE
                   )


```

Now train the same NB model using the decorrelated data:

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[1]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_Wilcoxon,
                     filtermethod.control=list(pvalue=0.05,limit= 50),
                     pca=FALSE
                   )




```

### The testing set predictions comparison

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 4.0, fig.width= 12.0}

# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")


par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
classNames <- as.character(classNames)
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("GDSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}
meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

```

## Comparing FCA option predictions

The following code runs the function with several of the available options:

```{r, results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold
  ))

# Robust linear models (RLM) with Pearson correlation
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="pearson"))


# Linear models with Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="LM",
  method="spearman"))

# Robust linear models with Spearman correlation
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="spearman"))


# The following are for supervised basis learning

# Set the target class for association learning
decortype[[6]] <- "Sup_Default"
system.time(datasetframeDecor[[6]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome))


# Supervised, with the lower correlation goal
decortype[[7]] <- "Sup_AtThr"
system.time(datasetframeDecor[[7]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  thr = correlationThreshold))

# Supervised: robust linear models (RLM) with Pearson correlation
decortype[[8]] <- "Sup_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

# Supervised: linear models with Spearman correlation
decortype[[9]] <- "Sup_LM_Spearman"
system.time(datasetframeDecor[[9]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

# Supervised: robust linear models with Spearman correlation
decortype[[10]] <- "Sup_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))


# With user defined supervised basis 
decortype[[11]] <- "Sup_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))


```

## Predict on all sets of transformations

```{r ROC PLOTS, fig.height=3.0, fig.width=12, warning=FALSE, dpi=600, results="asis"}
par(mfrow=c(2,5))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_Wilcoxon,
                     filtermethod.control=list(pvalue=0.05,limit= 50),
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.5)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

```

### The bar plot of all ROC AUCs

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
par(mfrow=c(1,1))

bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )

```
