FCA-Based Decorrelation on Vehicle

This document describes the use of the FRESA.CAD::GDSTMDecorrelation() function that runs the feature correlation analysis (FCA) algorithm.

This demo uses the FRESA.CAD and mlbench R packages:

knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")
library(mlbench)

op <- par(no.readonly = TRUE)

I’ll load the Vehicle data set

data("Vehicle", package = "mlbench")
print(table(Vehicle$Class))

 bus opel saab  van 
 218  212  217  199 

Setting some variables for downstream analysis

studyName = "Vehicle"
datasetframe <- Vehicle
Outcome <- "Class"

# The fraction of samples to be used in the training set
trainFraction = 0.5

# The correlation threshold 
correlationThreshold = 0.6

Setting the Training and Testing sets


tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")
 bus   opel   saab   van
----- ------ ------ -----
 218   212    217    199

Table: All

pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
 bus   opel   saab   van
----- ------ ------ -----
  99    99     99     99

Table: Training

pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")
 bus   opel   saab   van
----- ------ ------ -----
 119   113    118    100

Table: Testing

FCA with Default Parameters

The default parameters will compute the transformation matrix with a maximum correlation goal of 0.8, using fast matrix multiplication with Pearson correlation and linear-model estimation.

Default parameters: thr = 0.80, method = “fast”, type = “LM”



datasetframeDecor<-list();
decortype <- list();
system.time(datasetframeDecor[[1]] <- GDSTMDecorrelation(datasetframe_train))

user system elapsed 0.05 0.02 0.06

decortype[[1]] <- "Default"

Testing Set Decorrelation

To decorrelate the testing set, use the predictDecorrelate() function.

decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

Heat Maps of the Correlation Matrices

Here are the heat maps of the correlation matrices before and after decorrelation on the testing set



featnames <- attr(datasetframeDecor[[1]],"varincluded")
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")



featnames <- colnames(attr(datasetframeDecor[[1]],"GDSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")


FCA with Options

The following code shows examples of running the GDSTMDecorrelation() function with different options:



# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold
  ))

user system elapsed 0.11 0.06 0.05

# Use robust linear models (RLM) with Pearson correlation
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="pearson"))

user system elapsed 0.12 0.03 0.16

# Use linear models (LM) with Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="LM",
  method="spearman"))

user system elapsed 0.03 0.00 0.03

# Use robust linear models with Spearman correlation
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="spearman"))

user system elapsed 0.13 0.00 0.13

# The following are for supervised basis learning

# Set the target class for association learning
decortype[[6]] <- "Sup_Default"
system.time(datasetframeDecor[[6]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome))

user system elapsed 0.02 0.00 0.02

# Change the maximum correlation goal
decortype[[7]] <- "Sup_AtThr"
system.time(datasetframeDecor[[7]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold,
  Outcome=Outcome))

user system elapsed 0.04 0.03 0.03

# Supervised basis with robust linear models and Pearson correlation
decortype[[8]] <- "Sup_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

user system elapsed 0.11 0.00 0.11

# Supervised basis with linear models and Spearman correlation
decortype[[9]] <- "Sup_LM_Spearman"
system.time(datasetframeDecor[[9]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

user system elapsed 0.05 0.00 0.05

# Supervised basis with robust linear models and Spearman correlation
decortype[[10]] <- "Sup_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))

user system elapsed 0.13 0.02 0.13

# With a user-defined supervised basis 
decortype[[11]] <- "Sup_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))

user system elapsed 0.14 0.00 0.14

The GDSTMDecorrelation() function returns a decorrelated data frame. The output features are named after the original features as follows:

  • The name is unaltered if the feature’s maximum correlation to the other features was lower than the threshold.

  • The name gets the “Ba_” prefix if the feature was correlated but kept as an unaltered basis feature.

  • The name gets the “De_” prefix if the feature was originally correlated and its correlation to the “Ba_” features has been removed.

Furthermore, the returned data frame will have the following attributes:

  1. Transformation matrix: “GDSTM”

  2. Features:

    1. fscore

    2. varincluded

    3. topFeatures

  3. Unaltered Basis:

    1. baseFeatures

      1. correlatedToBase

    2. AbaseFeatures

GDSTM

The “GDSTM” attribute stores the spatial transformation matrix. The matrix only includes the continuous features whose correlation to some other feature exceeded the threshold.



## The Spatial Transformation Matrix:
GDSTM <- attr(datasetframeDecor[[1]],"GDSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(GDSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("GDSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="GDSTM Feature", ylab="Input Feature")


pander::pander(t(colnames(GDSTM)),caption="New names of decorrelated matrix")
New names of the decorrelated matrix: De_Circ, De_D.Circ, Ba_Scat.Ra, De_Elong, De_Pr.Axis.Rect, De_Max.L.Rect, De_Sc.Var.Maxis, De_Sc.Var.maxis, De_Ra.Gyr, De_Skew.Maxis, De_Kurt.Maxis, Ba_Holl.Ra

The Coefficients of Each Decorrelated Feature

The following code extracts the coefficients that relate each decorrelated feature to the original features.


namecol <- colnames(GDSTM);

for (i in 1:ncol(GDSTM))
{
  associ <- abs(GDSTM[,i])>0;
  if (sum(associ) > 1)
  {
    pander::pander(t(GDSTM[associ,i]),caption=namecol[i])
  }
}

----------------
 Circ   Scat.Ra 
------ ---------
  1     -0.1593 
----------------

Table: De_Circ


------------------
 D.Circ   Scat.Ra 
-------- ---------
   1      -0.4402 
------------------

Table: De_D.Circ


-----------------
 Scat.Ra   Elong 
--------- -------
 0.2311      1   
-----------------

Table: De_Elong


-------------------------
 Scat.Ra    Pr.Axis.Rect 
---------- --------------
 -0.07719        1       
-------------------------

Table: De_Pr.Axis.Rect


------------------------
 Scat.Ra   Sc.Var.Maxis 
--------- --------------
 -0.8929        1       
------------------------

Table: De_Sc.Var.Maxis


------------------------
 Scat.Ra   Sc.Var.maxis 
--------- --------------
 -5.258         1       
------------------------

Table: De_Sc.Var.maxis


---------------------
 Max.L.Rect   Ra.Gyr 
------------ --------
   -1.933       1    
---------------------

Table: De_Ra.Gyr


----------------------
 Skew.Maxis   Holl.Ra 
------------ ---------
     1        0.8278  
----------------------

Table: De_Skew.Maxis


----------------------
 Kurt.Maxis   Holl.Ra 
------------ ---------
     1        -0.7503 
----------------------

Table: De_Kurt.Maxis

The Feature Index Score

The results of the FCA analysis of the data features are stored in three attributes: “varincluded”, “topFeatures”, and “fscore”.

  • “varincluded” returns the list of continuous features that were decorrelated

  • “topFeatures” returns the features that at some point were used as independent variables inside the linear models.

  • “fscore” : returns a named vector with the total feature score, \(F_j\), of the analyzed features.

\[ F_j=\sum_{n}\sum_{i\in B^{n}_{j}}|\rho_{i,j}|^2\,(|\rho_{i,j}|>\rho_{th}),~ \forall j \in Ind \]

\[ F_j=F_j-\sum_{n}|\rho_{B^n_j,j}|^2,~ \forall j \in Dep \]

where \(B^n_j\) is the set of features statistically associated with feature j at iteration n, \(\rho_{i,j}\) is the correlation between features i and j, \(\rho_{th}\) is the correlation goal, and \(\{Ind, Dep\}\) are the sets of independent and dependent features, respectively. In other words, the “fscore” indicates the degree of total association of “independent” features to dependent features.


pander::pander(t(attr(datasetframeDecor[[1]],"varincluded")),caption="Correlated Features")
Correlated Features: Circ, D.Circ, Scat.Ra, Elong, Pr.Axis.Rect, Max.L.Rect, Sc.Var.Maxis, Sc.Var.maxis, Ra.Gyr, Skew.Maxis, Kurt.Maxis, Holl.Ra
pander::pander(t(attr(datasetframeDecor[[1]],"topFeatures")),caption="Independent Features")
Independent Features: Scat.Ra, Holl.Ra, Max.L.Rect
fscore <- attr(datasetframeDecor[[1]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6)

For unsupervised basis learning, the FCA algorithm returns the following attribute: “AbaseFeatures”.


pander::pander(t(attr(datasetframeDecor[[1]],"AbaseFeatures")),caption="Set of unaltered features")
Set of unaltered features: Scat.Ra, Holl.Ra, Comp

The total set of unaltered features is:


atbase <- attr(datasetframeDecor[[1]],"AbaseFeatures")
featnames <- colnames(datasetframe_train)
included <- attr(datasetframeDecor[[1]],"varincluded")

notinvarincluded <-  featnames[!(featnames %in% included)]
pander::pander(t(c(atbase,notinvarincluded)),caption="Set of unaltered features")
Set of unaltered features: Scat.Ra, Holl.Ra, Comp, Comp, Rad.Ra, Pr.Axis.Ra, Max.L.Ra, Skew.maxis, Kurt.maxis, Class

For supervised basis learning, the FCA algorithm returns the following attributes:

  • “baseFeatures” and “correlatedToBase”

The “baseFeatures” attribute is the set of features that the FRESA.CAD::univariate_correlation() univariate filter selected as associated with the outcome.

# With Supervised Basis

pander::pander(t(attr(datasetframeDecor[[6]],"baseFeatures")),caption="Set of unaltered features")
Set of unaltered features: Skew.Maxis, D.Circ, Comp
if (length(attr(datasetframeDecor[[6]],"correlatedToBase"))>0)
{
pander::pander(t(attr(datasetframeDecor[[6]],"correlatedToBase")),caption="Set of features associated with base")
}

atbaseSup <- attr(datasetframeDecor[[6]],"baseFeatures")
included <- attr(datasetframeDecor[[6]],"varincluded")

notinvarincludedSup <-  featnames[!(featnames %in% included)]
pander::pander(t(c(atbaseSup,notinvarincludedSup)),caption="Set of unaltered features: Supervised")
Set of unaltered features (supervised): Skew.Maxis, D.Circ, Comp, Comp, Rad.Ra, Pr.Axis.Ra, Max.L.Ra, Skew.maxis, Kurt.maxis, Class
pander::pander(t(c(atbase,notinvarincluded)),caption="Set of unaltered features: Unsupervised")
Set of unaltered features (unsupervised): Scat.Ra, Holl.Ra, Comp, Comp, Rad.Ra, Pr.Axis.Ra, Max.L.Ra, Skew.maxis, Kurt.maxis, Class
fscore <- attr(datasetframeDecor[[6]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6,main="Feature Score: Supervised")

Machine Learning and GDSTM

Train a simple naive Bayes (NB) model on the raw data set

mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=TRUE,
                     normalize=FALSE
                   )

Training using the decorrelated data

mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[1]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

Selected Raw Features


vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=45,
               xlab="Raw Features",
               ylab="Samples"
               )

Selected Decorrelated Features


vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- datasetframeDecor[[1]];
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=35,
               xlab="Decorrelated Features",
               ylab="Samples"
               )

To make predictions, we need to transform the testing set. This is done using the FRESA.CAD::predictDecorrelate() function.


# Transform the testing set
decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

Once we have the transformed testing dataset, we can make a side-by-side comparison of predictions.


# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")


par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("GDSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}

Per-class ROC AUC (est, lower, upper) for the three models:

 Model    Class    est      lower    upper
-------- -------- -------- -------- --------
 Raw      van      0.9101   0.8771   0.9431
 PCA      van      0.9267   0.8859   0.9676
 GDSTM    van      0.9661   0.9521   0.9801
 Raw      saab     0.75     0.6991   0.8009
 PCA      saab     0.8697   0.8358   0.9036
 GDSTM    saab     0.8415   0.8042   0.8789
 Raw      bus      0.964    0.9494   0.9786
 PCA      bus      0.9688   0.9484   0.9892
 GDSTM    bus      0.9925   0.9866   0.9984
 Raw      opel     0.7492   0.699    0.7994
 PCA      opel     0.8555   0.8163   0.8946
 GDSTM    opel     0.8438   0.8034   0.8841

meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

Training and Prediction on All Decorrelation Sets

par(mfrow=c(2,2))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.75)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

Mean ROC AUC over the four classes for each decorrelation option (the per-class ROC curves are plotted in the figure panels):

 Option                 est      lower    upper
---------------------- -------- -------- --------
 Default                0.911    0.8866   0.9354
 AtThr                  0.9154   0.893    0.9377
 RLM_Pearson            0.9047   0.8775   0.9318
 LM_Spearman            0.9078   0.8823   0.9332
 RLM_Spearman           0.9015   0.8723   0.9308
 Sup_Default            0.911    0.8866   0.9354
 Sup_AtThr              0.9201   0.8966   0.9437
 Sup_RLM_Pearson        0.9047   0.8775   0.9318
 Sup_LM_Spearman        0.8995   0.8694   0.9297
 Sup_RLM_Spearman       0.9054   0.8768   0.9339
 Sup_KS_RLM_Spearman    0.9053   0.8767   0.9339

Final Plot Comparing the ROC AUC of all Options

par(mfrow=c(1,1))

rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
                        est      lower    upper
---------------------- -------- -------- --------
 Raw                    0.8433   0.8062   0.8805
 PCA                    0.9052   0.8716   0.9388
 Default                0.911    0.8866   0.9354
 AtThr                  0.9154   0.893    0.9377
 RLM_Pearson            0.9047   0.8775   0.9318
 LM_Spearman            0.9078   0.8823   0.9332
 RLM_Spearman           0.9015   0.8723   0.9308
 Sup_Default            0.911    0.8866   0.9354
 Sup_AtThr              0.9201   0.8966   0.9437
 Sup_RLM_Pearson        0.9047   0.8775   0.9318
 Sup_LM_Spearman        0.8995   0.8694   0.9297
 Sup_RLM_Spearman       0.9054   0.8768   0.9339
 Sup_KS_RLM_Spearman    0.9053   0.8767   0.9339
bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )

---
title: "FCA and the GDSTM"
output: html_notebook
---

## FCA-Based Decorrelation on Vehicle

This document describes the use of the **FRESA.CAD::GDSTMDecorrelation()** function that runs the feature correlation analysis (**FCA**) algorithm.

This demo uses the FRESA.CAD and mlbench R packages:

```{r functions,echo = TRUE }
knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")
library(mlbench)

op <- par(no.readonly = TRUE)

```

I'll load the Vehicle data set

```{r}
data("Vehicle", package = "mlbench")
print(table(Vehicle$Class))
```

Setting some variables for downstream analysis

```{r}
studyName = "Vehicle"
datasetframe <- Vehicle
Outcome <- "Class"

# The fraction of samples to be used in the training set
trainFraction = 0.5

# The correlation threshold 
correlationThreshold = 0.6

```

Setting the Training and Testing sets

```{r, results = "asis", dpi=600, fig.height= 6.0, fig.width= 8.0}

tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")
pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")


```

## FCA with Default Parameters

The default parameters will compute the transformation matrix with a maximum correlation goal of 0.8, using fast matrix multiplication with Pearson correlation and linear-model estimation.

Default parameters: thr = 0.80, method = "fast", type = "LM"
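
For reference, and assuming the defaults above map one-to-one to the `thr`, `method`, and `type` arguments, the default call in the next chunk is equivalent to spelling those parameters out explicitly. This is a minimal sketch and is not evaluated here.

```{r, eval=FALSE}
# A minimal sketch (not evaluated): the default call with its documented
# parameters made explicit
GDSTMDecorrelation(datasetframe_train,
                   thr = 0.8,        # maximum correlation goal
                   method = "fast",  # fast matrix-multiplication correlation
                   type = "LM")      # linear-model estimation
```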

```{r, results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


datasetframeDecor<-list();
decortype <- list();
system.time(datasetframeDecor[[1]] <- GDSTMDecorrelation(datasetframe_train))
decortype[[1]] <- "Default"


```

### Testing Set Decorrelation

To decorrelate the testing set, use the **predictDecorrelate()** function.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

```

## Heat Maps of the Correlation Matrices

Here are the heat maps of the correlation matrices before and after decorrelation on the testing set

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


featnames <- attr(datasetframeDecor[[1]],"varincluded")
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")


featnames <- colnames(attr(datasetframeDecor[[1]],"GDSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")


```

## FCA with Options

The following code shows examples of running the **GDSTMDecorrelation()** function with different options:

```{r, results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold
  ))

# Use robust linear models (RLM) with Pearson correlation
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="pearson"))


# Use linear models (LM) with Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="LM",
  method="spearman"))

# Use robust linear models with Spearman correlation
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- GDSTMDecorrelation(
  datasetframe_train,
  type="RLM",
  method="spearman"))


# The following are for supervised basis learning

# Set the target class for association learning
decortype[[6]] <- "Sup_Default"
system.time(datasetframeDecor[[6]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome))


# Change the maximum correlation goal
decortype[[7]] <- "Sup_AtThr"
system.time(datasetframeDecor[[7]] <- GDSTMDecorrelation(
  datasetframe_train,
  thr = correlationThreshold,
  Outcome=Outcome))

# Supervised basis with robust linear models and Pearson correlation
decortype[[8]] <- "Sup_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

# Supervised basis with linear models and Spearman correlation
decortype[[9]] <- "Sup_LM_Spearman"
system.time(datasetframeDecor[[9]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

# Supervised basis with robust linear models and Spearman correlation
decortype[[10]] <- "Sup_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))


# With a user-defined supervised basis 
decortype[[11]] <- "Sup_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- GDSTMDecorrelation(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))


```

The **GDSTMDecorrelation()** function returns a decorrelated data frame. The output features are named after the original features as follows (a quick way to inspect the resulting names is sketched after this list):

-   The name is unaltered if the feature's maximum correlation to the other features was lower than the threshold.

-   The name gets the "Ba\_" prefix if the feature was correlated but kept as an unaltered basis feature.

-   The name gets the "De\_" prefix if the feature was originally correlated and its correlation to the "Ba\_" features has been removed.
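
As a quick check of this naming convention, the returned column names can be grouped by prefix with base R. This minimal sketch relies only on the default decorrelated frame computed above.

```{r}
# Group the returned column names by their prefix
decorNames <- colnames(datasetframeDecor[[1]])

decorNames[grepl("^De_", decorNames)]         # decorrelated features
decorNames[grepl("^Ba_", decorNames)]         # unaltered basis features
decorNames[!grepl("^(De_|Ba_)", decorNames)]  # features left untouched
```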

Furthermore, the returned data frame will have the following attributes (see the short sketch after this list for a quick way to query them):

1)  Transformation matrix: "*GDSTM*"

2)  Features:

    1.  "*fsocre*"

    2.  "*varincluded*"

    3.  "*topFeatures*"

3)  Unaltered Basis:

    1.  "*baseFeatures*"

        1.  "*correlatedToBase*"

    2.  "*AbaseFeatures*"

### GDSTM

The "*GDSTM***"** attribute stores the spatial transformation matrix. The matrix only includes continuous features that had some correlation greater than the threshold

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}


## The Spatial Transformation Matrix:
GDSTM <- attr(datasetframeDecor[[1]],"GDSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(GDSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("GDSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="GDSTM Feature", ylab="Input Feature")

pander::pander(t(colnames(GDSTM)),caption="New names of decorrelated matrix")


```

### The Coefficients of Each Decorrelated Feature

The following code extracts the coefficients that relate each decorrelated feature to the original features.

```{r}

namecol <- colnames(GDSTM);

for (i in 1:ncol(GDSTM))
{
  associ <- abs(GDSTM[,i])>0;
  if (sum(associ) > 1)
  {
    pander::pander(t(GDSTM[associ,i]),caption=namecol[i])
  }
}


```

### The Feature Index Score

The results of the **FCA** analysis of the data features are stored in three attributes: "*varincluded*", "*topFeatures*", and "*fscore*".

-   "varincluded" returns the list of continuous features that were decorrelated

-   "topFeatures" returns the features that at some point were used as independent variables inside the linear models.

-   "fscore" : returns a named vector with the total feature score, $F_j$, of the analyzed features.

$$
F_j=\sum_{n}\sum_{i\in B^{n}_{j}}|\rho_{i,j}|^2\,(|\rho_{i,j}|>\rho_{th}),~ \forall j \in Ind
$$

$$
F_j=F_j-\sum_{n}|\rho_{B^n_j,j}|^2,~ \forall j \in Dep
$$

where $B^n_j$ is the set of features statistically associated with feature *j* at iteration *n*, $\rho_{i,j}$ is the correlation between features *i* and *j*, $\rho_{th}$ is the correlation goal, and $\{Ind, Dep\}$ are the sets of independent and dependent features, respectively. In other words, the "*fscore*" indicates the degree of total association of "independent" features to dependent features.
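
To make the first equation concrete, the sketch below computes a single-pass version of the score directly from the training correlation matrix: for every feature it sums the squared absolute correlations that exceed the goal. This is only an illustration of the scoring idea at one iteration, not the iterative bookkeeping that **GDSTMDecorrelation()** performs internally.

```{r}
# Single-pass illustration of the score on the numeric training columns
numericCols <- sapply(datasetframe_train, is.numeric)
rho <- abs(cor(datasetframe_train[, numericCols], method = "pearson"))
diag(rho) <- 0                                   # ignore self-correlation
singlePassScore <- rowSums(rho^2 * (rho > correlationThreshold))
head(sort(singlePassScore, decreasing = TRUE))
```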

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

pander::pander(t(attr(datasetframeDecor[[1]],"varincluded")),caption="Correlated Features")

pander::pander(t(attr(datasetframeDecor[[1]],"topFeatures")),caption="Independent Features")

fscore <- attr(datasetframeDecor[[1]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6)

```

For unsupervised basis learning, the FCA algorithm returns the following attribute: "*AbaseFeatures*".

```{r results = "asis", dpi=600, fig.height= 6.0, fig.width= 8.0}

pander::pander(t(attr(datasetframeDecor[[1]],"AbaseFeatures")),caption="Set of unaltered features")


```

The total set of unaltered features is:

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

atbase <- attr(datasetframeDecor[[1]],"AbaseFeatures")
featnames <- colnames(datasetframe_train)
included <- attr(datasetframeDecor[[1]],"varincluded")

notinvarincluded <-  featnames[!(featnames %in% included)]
pander::pander(t(c(atbase,notinvarincluded)),caption="Set of unaltered features")

```

For supervised basis learning, the FCA algorithm returns the following attributes:

-   "*baseFeatures*" and "*correlatedToBase*"

The "*baseFeatures*" is the set of features that the **FRESA.CAD::univariate_correlation()** univariate filter function used to get the features associated with the outcome.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
# With Supervised Basis

pander::pander(t(attr(datasetframeDecor[[6]],"baseFeatures")),caption="Set of unaltered features")

if (length(attr(datasetframeDecor[[6]],"correlatedToBase"))>0)
{
pander::pander(t(attr(datasetframeDecor[[6]],"correlatedToBase")),caption="Set of features associated with base")
}

atbaseSup <- attr(datasetframeDecor[[6]],"baseFeatures")
included <- attr(datasetframeDecor[[6]],"varincluded")

notinvarincludedSup <-  featnames[!(featnames %in% included)]
pander::pander(t(c(atbaseSup,notinvarincludedSup)),caption="Set of unaltered features: Supervised")

pander::pander(t(c(atbase,notinvarincluded)),caption="Set of unaltered features: Unsupervised")


fscore <- attr(datasetframeDecor[[6]],"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6,main="Feature Score: Supervised")

```

## Machine Learning and GDSTM

Train a simple naive Bayes (NB) model on the raw data set

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=TRUE,
                     normalize=FALSE
                   )

```

Training using the decorrelated data

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[1]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )




```

Selected Raw Features

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=45,
               xlab="Raw Features",
               ylab="Samples"
               )

```

Selected Decorrelated Features

```{r results = "asis", dpi=600, fig.height= 6.0, fig.width= 8.0}

vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- datasetframeDecor[[1]];
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=35,
               xlab="Decorrelated Features",
               ylab="Samples"
               )

```

To make predictions, we need to transform the testing set. This is done using the **FRESA.CAD::predictDecorrelate()** function.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}

# Transform the testing set
decor_test <- predictDecorrelate(datasetframeDecor[[1]],datasetframe_test)

```

Once we have the transformed testing dataset, we can make a side-by-side comparison of predictions.

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 4.0, fig.width= 12.0}

# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")


par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("GDSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}
meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

```

## Training and Prediction on All Decorrelation Sets

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
par(mfrow=c(2,2))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.75)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

```

## Final Plot Comparing the ROC AUC of all Options

```{r results = "asis", warning = FALSE, dpi=600, fig.height= 6.0, fig.width= 8.0}
par(mfrow=c(1,1))

rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )

```
