FCA-Based Decorrelation on Vehicle

This document describes the use of the FRESA.CAD::GDSTMDecorrelation() function that runs the feature correlation analysis (FCA) algorithm.

This demo uses the FRESA.CAD and mlbench R packages:

knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")
library(mlbench)

op <- par(no.readonly = TRUE)

I’ll load the vehicle data set

data("Vehicle", package = "mlbench")
print(table(Vehicle$Class))

 bus opel saab  van 
 218  212  217  199 

Setting some variables for downstream analysis

studyName = "Vehicle"
datasetframe <- Vehicle
Outcome <- "Class"

# The fraction of samples to be used in the training set
trainFraction = 0.5

# The correlation threshold 
correlationThreshold = 0.6

Setting the Training and Testing sets


tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")
All
bus opel saab van
218 212 217 199
pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
Training
bus opel saab van
99 99 99 99
pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")
Testing
bus opel saab van
119 113 118 100

FCA with Default Parameters

The default parameters compute the transformation matrix with a maximum correlation goal of 0.8, using fast matrix multiplication with Pearson correlation and linear-model estimation.

Default Parameters: thr=0.80, method="fast", type="LM"



datasetframeDecor<-list();
decortype <- list();
system.time(datasetframeDecor[[1]] <- GDSTMDecorrelation(datasetframe_train))

user system elapsed 0.06 0.00 0.06

decortype[[1]] <- "Default"

Testing Set Decorrelation

To decorrelate the testing set, use the predictDecorrelate() function.

decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

Heat Maps of the Correlation Matrices

Here are the heat maps of the correlation matrices of the testing set, before and after decorrelation.



featnames <- attr(datasetframeDecor[[1]],"varincluded")
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")



featnames <- colnames(attr(datasetframeDecor[[1]],"GDSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")


FCA with Options

The following code shows examples of running the GDSTMDecorrelation() function with different options:



# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold
  ))

user system elapsed 0.04 0.02 0.05

# Robust linear models (RLM) with Pearson correlation
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="pearson"))

user system elapsed 0.18 0.03 0.21

# Linear models with Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="LM",
  method="spearman"))

user system elapsed 0.05 0.01 0.06

# Robust linear models with Spearman correlation
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="spearman"))

user system elapsed 0.11 0.02 0.12

# The following are for supervised basis learning

# Set the target class for association learning
decortype[[6]] <- "Sup_Default"
system.time(datasetframeDecor[[6]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome))

user system elapsed 0.03 0.00 0.03

# Change the maximum correlation goal
decortype[[7]] <- "Sup_AtThr"
system.time(datasetframeDecor[[7]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold,
  Outcome=Outcome))

user system elapsed 0.03 0.00 0.03

# Robust linear models (RLM) with Pearson correlation
decortype[[8]] <- "Sup_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

user system elapsed 0.13 0.02 0.13

# Linear models with Spearman correlation
decortype[[9]] <- "Sup_LM_Spearman"
system.time(datasetframeDecor[[9]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

user system elapsed 0.05 0.00 0.05

# Robust linear models with Spearman correlation
decortype[[10]] <- "Sup_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))

user system elapsed 0.14 0.02 0.16

# With a user-defined supervised basis
decortype[[11]] <- "Sup_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))

user system elapsed 0.14 0.00 0.14

The GDSTMDecorrelation() function returns a decorrelated data frame. The output features keep their original names, modified as follows:

  • The name is unaltered if the feature's maximum correlation to the other features was lower than the threshold.

  • The name gets the "Ba_" prefix if the feature was correlated but kept unaltered as part of the basis.

  • The name gets the "De_" prefix if the feature was originally correlated and its correlation to the "Ba_" features has been removed.

Furthermore, the returned data frame has the following attributes:

  1. Transformation matrix: "GDSTM"

  2. Features:

    1. "fscore"

    2. "varincluded"

    3. "topFeatures"

  3. Unaltered Basis:

    1. "baseFeatures"

      1. "correlatedToBase"

    2. "AbaseFeatures"

GDSTM

The "GDSTM" attribute stores the spatial transformation matrix. The matrix only includes the continuous features that had at least one pairwise correlation greater than the threshold.



## The Spatial Transformation Matrix:
GDSTM <- attr(datasetframeDecor[[1]],"GDSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(GDSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("GDSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="GDSTM Feature", ylab="Input Feature")


pander::pander(t(colnames(GDSTM)),caption="New names of decorrelated matrix")
New names of decorrelated matrix:
De_Comp De_Circ De_D.Circ Ba_Scat.Ra De_Elong De_Pr.Axis.Rect De_Max.L.Rect De_Sc.Var.Maxis De_Sc.Var.maxis De_Ra.Gyr Ba_Kurt.Maxis De_Holl.Ra

The Coefficients of each Decorrelated Feature

The following code extracts, for each decorrelated feature, the coefficients of its dependence on the original features.


namecol <- colnames(GDSTM);

for (i in 1:ncol(GDSTM))
{
  associ <- abs(GDSTM[,i])>0;
  if (sum(associ) > 1)
  {
    pander::pander(t(GDSTM[associ,i]),caption=namecol[i])
  }
}

----------------
 Comp   Scat.Ra 
------ ---------
  1     -0.2026 
----------------

Table: De_Comp


----------------
 Circ   Scat.Ra 
------ ---------
  1     -0.1595 
----------------

Table: De_Circ


------------------
 D.Circ   Scat.Ra 
-------- ---------
   1      -0.4369 
------------------

Table: De_D.Circ


-----------------
 Scat.Ra   Elong 
--------- -------
 0.2307      1   
-----------------

Table: De_Elong


-------------------------
 Scat.Ra    Pr.Axis.Rect 
---------- --------------
 -0.07737        1       
-------------------------

Table: De_Pr.Axis.Rect


------------------------
 Scat.Ra   Sc.Var.Maxis 
--------- --------------
 -0.8859        1       
------------------------

Table: De_Sc.Var.Maxis


------------------------
 Scat.Ra   Sc.Var.maxis 
--------- --------------
  -5.27         1       
------------------------

Table: De_Sc.Var.maxis


---------------------
 Max.L.Rect   Ra.Gyr 
------------ --------
   -1.989       1    
---------------------

Table: De_Ra.Gyr


----------------------
 Kurt.Maxis   Holl.Ra 
------------ ---------
   -1.071        1    
----------------------

Table: De_Holl.Ra

The Feature Index Score

The FCA analysis of the data features is stored in three attributes: "varincluded", "topFeatures", and "fscore".

  • “varincluded” returns the list of continuous features that were decorrelated

  • “topFeatures” returns the features that at some point were used as independent variables inside the linear models.

  • "fscore" returns a named vector with the total feature score, \(F_j\), of the analyzed features.

\[ F_j=\sum_{n}\sum_{i\in B^{n}_{j}}|\rho_{i,j}|^2\left(|\rho_{i,j}|>\rho_{th}\right),~ \forall j \in Ind \]

\[ F_j=F_j-\sum_{n}|\rho_{B^n_j,j}|^2,~ \forall j \in Dep \]

where \(B^n_j\) is the set of features statistically associated with feature j at iteration n, \(\rho_{i,j}\) is the correlation between features i and j, \(\rho_{th}\) is the correlation goal, and \(\{Ind, Dep\}\) are the sets of independent and dependent features, respectively. In other words, the "fscore" indicates the total degree of association of "independent" features with the dependent features.


pander::pander(t(attr(datasetframeDecor[[1]],"varincluded")),caption="Correlated Features")
Correlated Features:
Comp Circ D.Circ Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Kurt.Maxis Holl.Ra
pander::pander(t(attr(datasetframeDecor[[1]],"topFeatures")),caption="Independent Features")
Independent Features
Scat.Ra Kurt.Maxis Max.L.Rect
fscore <- attr(datasetframeDecor[[1]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6)

For unsupervised basis learning, the FCA algorithm returns the following attribute: "AbaseFeatures".


pander::pander(t(attr(datasetframeDecor[[1]],"AbaseFeatures")),caption="Set of unaltered features")
Set of unaltered features
Scat.Ra Kurt.Maxis Pr.Axis.Ra Max.L.Ra Skew.Maxis

The total set of unaltered features is:


atbase <- attr(datasetframeDecor[[1]],"AbaseFeatures")
featnames <- colnames(datasetframe_train)
included <- attr(datasetframeDecor[[1]],"varincluded")

notinvarincluded <-  featnames[!(featnames %in% included)]
pander::pander(t(c(atbase,notinvarincluded)),caption="Set of unaltered features")
Set of unaltered features:
Scat.Ra Kurt.Maxis Pr.Axis.Ra Max.L.Ra Skew.Maxis Rad.Ra Pr.Axis.Ra Max.L.Ra Skew.Maxis Skew.maxis Kurt.maxis Class

For supervised basis learning, the FCA algorithm returns the following attributes:

  • "baseFeatures" and "correlatedToBase"

"baseFeatures" is the set of features that the FRESA.CAD::univariate_correlation() univariate filter selected as associated with the outcome.

# With Supervised Basis

pander::pander(t(attr(datasetframeDecor[[6]],"baseFeatures")),caption="Set of unaltered features")
Set of unaltered features
Holl.Ra D.Circ Comp Max.L.Ra
if (length(attr(datasetframeDecor[[6]],"correlatedToBase"))>0)
{
pander::pander(t(attr(datasetframeDecor[[6]],"correlatedToBase")),caption="Set of features associated with base")
}
Set of features associated with base
Kurt.Maxis
atbaseSup <- attr(datasetframeDecor[[6]],"baseFeatures")
included <- attr(datasetframeDecor[[6]],"varincluded")

notinvarincludedSup <-  featnames[!(featnames %in% included)]
pander::pander(t(c(atbaseSup,notinvarincludedSup)),caption="Set of unaltered features: Supervised")
Set of unaltered features (Supervised):
Holl.Ra D.Circ Comp Max.L.Ra Rad.Ra Pr.Axis.Ra Max.L.Ra Skew.Maxis Skew.maxis Kurt.maxis Class
pander::pander(t(c(atbase,notinvarincluded)),caption="Set of unaltered features: Unsupervised")
Set of unaltered features (Unsupervised):
Scat.Ra Kurt.Maxis Pr.Axis.Ra Max.L.Ra Skew.Maxis Rad.Ra Pr.Axis.Ra Max.L.Ra Skew.Maxis Skew.maxis Kurt.maxis Class
fscore <- attr(datasetframeDecor[[6]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6,main="Feature Score: Supervised")

Machine Learning and GDSTM

Train a simple naive Bayes (NB) model on the raw data set

mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=TRUE,
                     normalize=FALSE
                   )

Training using the decorrelated data

mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[1]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

Selected Raw Features


vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=45,
               xlab="Raw Features",
               ylab="Samples"
               )

Selected Decorrelated Features


vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- datasetframeDecor[[1]];
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=35,
               xlab="Decorrelated Features",
               ylab="Samples"
               )

To make predictions, we need to transform the testing set. This is done using the FRESA.CAD::predictDecorrelate() function.


# Transform the testing set
decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

Once we have the transformed testing set, we can make a side-by-side comparison of the predictions.


# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")


par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("GDSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}

ROC AUC on the testing set (est, lower, upper), per model and class:

Model   Class   est      lower    upper
Raw     van     0.9666   0.9523   0.9809
PCA     van     0.9706   0.9563   0.9849
GDSTM   van     0.981    0.9715   0.9905
Raw     saab    0.8147   0.7731   0.8562
PCA     saab    0.8417   0.8029   0.8804
GDSTM   saab    0.8481   0.8126   0.8836
Raw     bus     0.9827   0.9734   0.9919
PCA     bus     0.9928   0.9882   0.9975
GDSTM   bus     0.9942   0.9897   0.9987
Raw     opel    0.7927   0.7475   0.8379
PCA     opel    0.8552   0.8157   0.8946
GDSTM   opel    0.8339   0.7954   0.8724

meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

Training and Prediction on all Decorrelation Sets

par(mfrow=c(2,2))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.75)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

Mean test ROC AUC over the four classes (est, lower, upper), per decorrelation option:

Option                est      lower    upper
Default               0.9143   0.8923   0.9363
AtThr                 0.9178   0.8933   0.9417
RLM_Pearson           0.9121   0.8899   0.9343
LM_Spearman           0.9136   0.8919   0.9352
RLM_Spearman          0.912    0.8901   0.9339
Sup_Default           0.9115   0.8889   0.934
Sup_AtThr             0.9178   0.8933   0.9417
Sup_RLM_Pearson       0.9104   0.8879   0.9329
Sup_LM_Spearman       0.914    0.8918   0.9363
Sup_RLM_Spearman      0.9147   0.8933   0.936
Sup_KS_RLM_Spearman   0.9163   0.8949   0.9377

Final Plot Comparing the ROC AUC of all Options

par(mfrow=c(1,1))

rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
  est lower upper
Raw 0.8892 0.8616 0.9167
PCA 0.9151 0.8908 0.9394
Default 0.9143 0.8923 0.9363
AtThr 0.9178 0.8933 0.9417
RLM_Pearson 0.9121 0.8899 0.9343
LM_Spearman 0.9136 0.8919 0.9352
RLM_Spearman 0.912 0.8901 0.9339
Sup_Default 0.9115 0.8889 0.934
Sup_AtThr 0.9178 0.8933 0.9417
Sup_RLM_Pearson 0.9104 0.8879 0.9329
Sup_LM_Spearman 0.914 0.8918 0.9363
Sup_RLM_Spearman 0.9147 0.8933 0.936
Sup_KS_RLM_Spearman 0.9163 0.8949 0.9377
bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )

---
title: "FCA and the GDSTM"
output: html_notebook
---

## FCA-Based Decorrelation on Vehicle

This document describes the use of the **FRESA.CAD::GDSTMDecorrelation()** function that runs the feature correlation analysis (**FCA**) algorithm.

This demo uses the FRESA.CAD and mlbench R packages:

```{r functions,echo = TRUE }
knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")
library(mlbench)

op <- par(no.readonly = TRUE)

```

I'll load the vehicle data set

```{r}
data("Vehicle", package = "mlbench")
print(table(Vehicle$Class))
```

Setting some variables for downstream analysis

```{r}
studyName = "Vehicle"
datasetframe <- Vehicle
Outcome <- "Class"

# The fraction of samples to be used in the training set
trainFraction = 0.5

# The correlation threshold 
correlationThreshold = 0.6

```

Setting the Training and Testing sets

```{r, results = "asis", dpi=600, fig.height= 6.0, fig.width= 8.0}

tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")
pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")


```

## FCA with Default Parameters

The default parameters compute the transformation matrix with a maximum correlation goal of 0.8, using fast matrix multiplication with Pearson correlation and linear-model estimation.

Default Parameters: thr=0.80, method="fast", type="LM"
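
For reference, here is a sketch of the same call with the defaults written out explicitly; it is not evaluated, and it assumes only that `thr`, `method`, and `type` are the same argument names used in the options section further down.

```{r eval=FALSE}
# Sketch only: the defaults of GDSTMDecorrelation() spelled out explicitly,
# matching the parameter list above (thr = 0.80, method = "fast", type = "LM")
GDSTMDecorrelation(datasetframe_train,
                   thr = 0.80,
                   method = "fast",
                   type = "LM")
```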

```{r, results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


datasetframeDecor<-list();
decortype <- list();
system.time(datasetframeDecor[[1]] <- GDSTMDecorrelation(datasetframe_train))
decortype[[1]] <- "Default"


```

### Testing Set Decorrelation

To decorrelate the testing set, use the **predictDecorrelate()** function.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

```

## Heat Maps of the Correlation Matrices

Here are the heat maps of the correlation matrices of the testing set, before and after decorrelation.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


featnames <- attr(datasetframeDecor[[1]],"varincluded")
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")


featnames <- colnames(attr(datasetframeDecor[[1]],"GDSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")


```

## FCA with Options

The following code shows examples of running the **GDSTMDecorrelation()** function with different options:

```{r, results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold
  ))

# Robust linear models (RLM) with Pearson correlation
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="pearson"))


# Linear models with Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="LM",
  method="spearman"))

# Robust linear models with Spearman correlation
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="spearman"))


# The following are for supervised basis learning

# Set the target class for association learning
decortype[[6]] <- "Sup_Default"
system.time(datasetframeDecor[[6]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome))


# Change the maximum correlation goal
decortype[[7]] <- "Sup_AtThr"
system.time(datasetframeDecor[[7]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold,
  Outcome=Outcome))

# Robust linear models (RLM) with Pearson correlation
decortype[[8]] <- "Sup_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

# Linear models with Spearman correlation
decortype[[9]] <- "Sup_LM_Spearman"
system.time(datasetframeDecor[[9]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

# Robust linear models with Spearman correlation
decortype[[10]] <- "Sup_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))


# With a user-defined supervised basis
decortype[[11]] <- "Sup_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))


```

The **GDSTMDecorrelation()** function returns a decorrelated data frame. The output features keep their original names, modified as follows (a quick check is sketched after this list):

-   The name is unaltered if the feature's maximum correlation to the other features was lower than the threshold.

-   The name gets the "Ba\_" prefix if the feature was correlated but kept unaltered as part of the basis.

-   The name gets the "De\_" prefix if the feature was originally correlated and its correlation to the "Ba\_" features has been removed.
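
As a quick sanity check, the prefixes can be counted directly on the column names of the decorrelated training frame. This is a minimal sketch using only base R; the prefix strings are taken from the list above.

```{r}
# Count how many output columns are unaltered, basis ("Ba_") or decorrelated ("De_")
decorNames <- colnames(datasetframeDecor[[1]])
table(ifelse(grepl("^De_", decorNames), "De_ (decorrelated)",
      ifelse(grepl("^Ba_", decorNames), "Ba_ (basis)", "unaltered")))
```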

Furthermore, the returned data frame will have the following attributes (a quick way to list them is sketched after this outline):

1)  Transformation matrix: "*GDSTM*"

2)  Features:

    1.  "*fscore*"

    2.  "*varincluded*"

    3.  "*topFeatures*"

3)  Unaltered Basis:

    1.  "*baseFeatures*"

        1.  "*correlatedToBase*"

    2.  "*AbaseFeatures*"
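
A quick way to verify which of these attributes are attached to the returned frame is to list its non-standard attributes; a minimal sketch with base R:

```{r}
# List the attributes attached to the decorrelated frame, dropping the
# standard data.frame bookkeeping attributes
setdiff(names(attributes(datasetframeDecor[[1]])),
        c("names", "row.names", "class"))
```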

### GDSTM

The "*GDSTM*" attribute stores the spatial transformation matrix. The matrix only includes the continuous features that had at least one pairwise correlation greater than the threshold.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


## The Spatial Transformation Matrix:
GDSTM <- attr(datasetframeDecor[[1]],"GDSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(GDSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("GDSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="GDSTM Feature", ylab="Input Feature")

pander::pander(t(colnames(GDSTM)),caption="New names of decorrelated matrix")


```

### The Coefficients of each Decorrelated Feature

The following code extracts, for each decorrelated feature, the coefficients of its dependence on the original features.

```{r}

namecol <- colnames(GDSTM);

for (i in 1:ncol(GDSTM))
{
  associ <- abs(GDSTM[,i])>0;
  if (sum(associ) > 1)
  {
    pander::pander(t(GDSTM[associ,i]),caption=namecol[i])
  }
}


```

### The Feature Index Score

The **FCA** analysis of the data features is stored in three attributes: "*varincluded*", "*topFeatures*", and "*fscore*".

-   "varincluded" returns the list of continuous features that were decorrelated

-   "topFeatures" returns the features that at some point were used as independent variables inside the linear models.

-   "*fscore*" returns a named vector with the total feature score, $F_j$, of the analyzed features.

$$
F_j=\sum_{n}\sum_{i\in B^{n}_{j}}|\rho_{i,j}|^2\left(|\rho_{i,j}|>\rho_{th}\right),~ \forall j \in Ind
$$

$$
F_j=F_j-\sum_{n}|\rho_{B^n_j,j}|^2,~ \forall j \in Dep
$$

where $B^n_j$ is the set of features statistically associated with feature *j* at iteration *n*, $\rho_{i,j}$ is the correlation between features *i* and *j*, $\rho_{th}$ is the correlation goal, and $\{Ind, Dep\}$ are the sets of independent and dependent features, respectively. In other words, the "*fscore*" indicates the total degree of association of "independent" features with the dependent features.
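
To make the score concrete, here is a toy, single-iteration sketch of the positive term of $F_j$ for one independent feature. It is illustrative only (the real bookkeeping over iterations $n$ and over the dependent features happens inside **GDSTMDecorrelation()**), and it assumes that "Scat.Ra" plays the role of an independent feature, as reported in the *topFeatures* output below.

```{r}
# Single-iteration sketch of the fscore accumulation for one independent feature
rho <- cor(datasetframe_train[, attr(datasetframeDecor[[1]], "varincluded")],
           method = "pearson")
rho_th <- 0.8                     # default correlation goal
j <- "Scat.Ra"                    # assumed independent (basis) feature
assoc <- abs(rho[, j]) > rho_th   # B_j: features associated with j above the goal
assoc[j] <- FALSE                 # do not count the feature against itself
sum(rho[assoc, j]^2)              # positive contribution to F_j
```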

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

pander::pander(t(attr(datasetframeDecor[[1]],"varincluded")),caption="Correlated Features")

pander::pander(t(attr(datasetframeDecor[[1]],"topFeatures")),caption="Independent Features")

fscore <- attr(datasetframeDecor[[1]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6)

```

For unsupervised basis learning, the FCA algorithm returns the following attribute: "*AbaseFeatures*".

```{r results = "asis", dpi=600, fig.height= 6.0, fig.width= 8.0}

pander::pander(t(attr(datasetframeDecor[[1]],"AbaseFeatures")),caption="Set of unaltered features")


```

The total set of unaltered features is:

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

atbase <- attr(datasetframeDecor[[1]],"AbaseFeatures")
featnames <- colnames(datasetframe_train)
included <- attr(datasetframeDecor[[1]],"varincluded")

notinvarincluded <-  featnames[!(featnames %in% included)]
pander::pander(t(c(atbase,notinvarincluded)),caption="Set of unaltered features")

```

For supervised basis learning, the FCA algorithm returns the following attributes:

-   "*baseFeatures*" and "*correlatedToBase*"

"*baseFeatures*" is the set of features that the **FRESA.CAD::univariate_correlation()** univariate filter selected as associated with the outcome.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
# With Supervised Basis

pander::pander(t(attr(datasetframeDecor[[6]],"baseFeatures")),caption="Set of unaltered features")

if (length(attr(datasetframeDecor[[6]],"correlatedToBase"))>0)
{
pander::pander(t(attr(datasetframeDecor[[6]],"correlatedToBase")),caption="Set of features associated with base")
}

atbaseSup <- attr(datasetframeDecor[[6]],"baseFeatures")
included <- attr(datasetframeDecor[[6]],"varincluded")

notinvarincludedSup <-  featnames[!(featnames %in% included)]
pander::pander(t(c(atbaseSup,notinvarincludedSup)),caption="Set of unaltered features: Supervised")

pander::pander(t(c(atbase,notinvarincluded)),caption="Set of unaltered features: Unsupervised")


fscore <- attr(datasetframeDecor[[6]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6,main="Feature Score: Supervised")

```

## Machine Learning and GDSTM

Train a simple naive Bayes (NB) model on the raw data set

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=TRUE,
                     normalize=FALSE
                   )

```

Training using the decorrelated data

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[1]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )




```

Selected Raw Features

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=45,
               xlab="Raw Features",
               ylab="Samples"
               )

```

Selected Decorrelated Features

```{r results = "asis", dpi=600, fig.height= 6.0, fig.width= 8.0}

vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- datasetframeDecor[[1]];
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=35,
               xlab="Decorrelated Features",
               ylab="Samples"
               )

```

To make predictions, we need to transform the testing set. This is done using the **FRESA.CAD::predictDecorrelate()** function.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

# Transform the testing set
decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

```

Once we have the transformed testing set, we can make a side-by-side comparison of the predictions.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 4.0, fig.width= 12.0}

# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")


par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("GDSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}
meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

```

## Training and Prediction on all Decorrelation Sets

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
par(mfrow=c(2,2))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.75)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

```

## Final Plot Comparing the ROC AUC of all Options

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
par(mfrow=c(1,1))

rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )

```
