IDeA and the UPSTM

IDeA-Based Decorrelation on Vehicle

This document describes the use of the FRESA.CAD::IDeA() function that runs the feature Decorrelation analysis (IDeA) algorithm.

This demo uses FRESA.CAD and mlbench R packages:

knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")

library("FRESA.CAD")

## Loading required package: Rcpp

## Loading required package: stringr

## Loading required package: miscTools

## Loading required package: Hmisc

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## Loading required package: pROC

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(mlbench)

op <- par(no.readonly = TRUE)

I’ll load the vehicle data set

data("Vehicle", package = "mlbench")
print(table(Vehicle$Class))
#> 
#>  bus opel saab  van 
#>  218  212  217  199

Setting some variables for downstream analysis

studyName = "Vehicle"
datasetframe <- Vehicle
Outcome <- "Class"

# The fractions of samples to be use in the training set
trainFraction = 0.5

# The correlation threshold 
correlationThreshold = 0.6

Setting the Training and Testing sets


tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])

allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)

trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
  classSample <- allrowClass[allrowClass == theClass]
  trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}


datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]

outcomes <- datasetframe_train[,Outcome]

pander::pander(table(datasetframe[,Outcome]),caption="All")

All
bus	opel	saab	van
218	212	217	199

pander::pander(table(datasetframe_train[,Outcome]),caption="Training")

Training
bus	opel	saab	van
99	99	99	99

pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")

Testing
bus	opel	saab	van
119	113	118	100

IDeA with Default Parameters

The default parameters will compute the transformation matrix with a maximum correlation goal of 0.8 using fast matrix multiplication with Pearson correlation and linear models estimation.

Default Parameters: thr=0.80,method=“fast”,type=“LM”



system.time(decorrelatedDF <- IDeA(datasetframe_train))

user system elapsed 0.04 0.02 0.06 ### Returned Data Frame Specifications

The UPSTMDecorrelation function returns a data frame with the following specifications for the column names:

The output basis after transformation will be named after the original names:

The feature name will have the “La_” prefix, if the feature was decorrelated by the algorithm. These features are the discovered latent variables

The Unlatered Features

## The unaltered features: 
## 1) Factors or
## 2) features with not enough unique values or
## 3) Uncorrelated to any other feature or
## 4) the features were keep as basis pivot
pander::pander(colnames(decorrelatedDF)[colnames(decorrelatedDF) %in% colnames(datasetframe_train)])

Rad.Ra, Pr.Axis.Ra, Max.L.Ra, Scat.Ra, Skew.Maxis, Skew.maxis, Kurt.maxis, Kurt.Maxis and Class

The Latent Basis

## The discovered latent features. i.e. Decorrelated by the IDeA
pander::pander(colnames(decorrelatedDF)[str_detect(colnames(decorrelatedDF),"La_")])

La_Comp, La_Circ, La_D.Circ, La_Elong, La_Pr.Axis.Rect, La_Max.L.Rect, La_Sc.Var.Maxis, La_Sc.Var.maxis, La_Ra.Gyr and La_Holl.Ra

The Discovered Latent Variables

The returned decorrelated object can be analyzed by the getLatentCoefficients() function to return the set of discovered latent variables with their corresponding formula.

latentlist <- getLatentCoefficients(decorrelatedDF)
pander::pander(latentlist)

La_Comp:

Comp Scat.Ra

1 -0.2095
La_Circ:

Circ Scat.Ra

1 -0.1542
La_D.Circ:

D.Circ Scat.Ra

1 -0.4312
La_Elong:

Scat.Ra Elong

0.2323 1
La_Pr.Axis.Rect:

Scat.Ra Pr.Axis.Rect

-0.07678 1
La_Max.L.Rect:

Circ Max.L.Rect

-2.28 1
La_Sc.Var.Maxis:

Scat.Ra Sc.Var.Maxis

-0.9082 1
La_Sc.Var.maxis:

Scat.Ra Sc.Var.maxis

-5.282 1
La_Ra.Gyr:

Circ Ra.Gyr

-4.93 1
La_Holl.Ra:

Kurt.Maxis Holl.Ra

-1.074 1

Comp	Scat.Ra
1	-0.2095

Circ	Scat.Ra
1	-0.1542

D.Circ	Scat.Ra
1	-0.4312

Scat.Ra	Elong
0.2323	1

Scat.Ra	Pr.Axis.Rect
-0.07678	1

Circ	Max.L.Rect
-2.28	1

Scat.Ra	Sc.Var.Maxis
-0.9082	1

Scat.Ra	Sc.Var.maxis
-5.282	1

Circ	Ra.Gyr
-4.93	1

Kurt.Maxis	Holl.Ra
-1.074	1

Attributes of the Returned Dataframe.

Furthermore, the returned data frame will have the following main attributes:

Transformation matrix: “UPSTM”
Feature Analysis:
1. “TotalAdjustments”
2. “fsocre”

UPSTM

The “UPSTM“ attribute stores the spatial transformation matrix. The matrix only includes continuous features that had some correlation greater than the user specified threshold



## The Spatial Transformation Matrix:
UPSTM <- attr(decorrelatedDF,"UPSTM")

## The heatmap of the matrix
gplots::heatmap.2(1*(abs(UPSTM) > 0),
                  trace = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(2)),
                  main = paste("UPSTM Matrix:",studyName),
                  cexRow = 0.7,
                  cexCol = 0.7,
                  breaks = c(0,0.5,1),
                  key.title=NA,
                  key.xlab="|beta| > 0",
                  xlab="UPSTM Feature", ylab="Input Feature")


pander::pander(t(colnames(UPSTM)),caption="New names of decorrelated matrix")

New names of decorrelated matrix (continued below)
La_Comp	La_Circ	La_D.Circ	Scat.Ra	La_Elong	La_Pr.Axis.Rect

Table continues below
La_Max.L.Rect	La_Sc.Var.Maxis	La_Sc.Var.maxis	La_Ra.Gyr	Kurt.Maxis

La_Holl.Ra

The Feature Index Score

The IDeA analysis of the data features are stored in three attributes: “TotalAdjustments”, and “fscore”.

“TotalAdjustments” returns named vector of with the number of times a variable was decorrelated by the IDeA.
“fscore” : returns a named vector with the total feature score, \(F_j\), of the analyzed features. The score is equal to number of times the features was uses as independent feature minus the “TotalAdjustments”.


pander::pander(t(attr(decorrelatedDF,"TotalAdjustments")),caption="Total Adjustments")

Total Adjustments (continued below)
Comp	Circ	D.Circ	Rad.Ra	Pr.Axis.Ra	Max.L.Ra	Scat.Ra	Elong
1	1	1	0	0	0	0	1

Table continues below
Pr.Axis.Rect	Max.L.Rect	Sc.Var.Maxis	Sc.Var.maxis	Ra.Gyr	Skew.Maxis
1	1	1	1	1	0

Skew.maxis	Kurt.maxis	Kurt.Maxis	Holl.Ra	Class
0	0	0	1	0

pander::pander(t(attr(decorrelatedDF,"LatentVariables")),caption="Latent Varibale Parents")

Latent Varibale Parents (continued below)
Comp	Circ	D.Circ	Elong	Pr.Axis.Rect	Max.L.Rect	Sc.Var.Maxis

Sc.Var.maxis

Ra.Gyr

Holl.Ra

colnames(decorrelatedDF)

[1] “La_Comp” “La_Circ” “La_D.Circ” “Rad.Ra”
[5] “Pr.Axis.Ra” “Max.L.Ra” “Scat.Ra” “La_Elong”
[9] “La_Pr.Axis.Rect” “La_Max.L.Rect” “La_Sc.Var.Maxis” “La_Sc.Var.maxis” [13] “La_Ra.Gyr” “Skew.Maxis” “Skew.maxis” “Kurt.maxis”
[17] “Kurt.Maxis” “La_Holl.Ra” “Class”


pander::pander(t(attr(decorrelatedDF,"fscore")),caption="The Score")

The Score (continued below)
La_Comp	La_Circ	La_D.Circ	Scat.Ra	La_Elong	La_Pr.Axis.Rect
-1	1	-1	7	-1	-1

Table continues below
La_Max.L.Rect	La_Sc.Var.Maxis	La_Sc.Var.maxis	La_Ra.Gyr	Kurt.Maxis
-1	-1	-1	-1	1

La_Holl.Ra
-1


fscore <- attr(decorrelatedDF,"fscore") 
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6)

Testing Set Decorrelation

To decorrelate any new data set use the predictDecorrelate() function

decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)

Heat Maps of the Correlation Matrices

Here are the heat maps of the correlation matrices before and after decorrelation on the testing set



featnames <- rownames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("Raw Correlation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")



featnames <- colnames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = paste("After decorrelation:",studyName),
                  cexRow = 0.75,
                  cexCol = 0.75,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Transformed Feature", ylab="Transformed Feature")

IDeA with Options

The following code are examples of running IDeA() function with options:


datasetframeDecor<-list();

# With default parameters
datasetframeDecor[[1]] <- decorrelatedDF
decortype <- list();
decortype[[1]] <- "Default"


# Change the maximum correlation goal

decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- IDeA(
  datasetframe_train,
  thr = correlationThreshold
  ))

user system elapsed 0.03 0.00 0.03


# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- IDeA(
  datasetframe_train,
  type="RLM",
  method="pearson"))

user system elapsed 0.20 0.03 0.23



# Change the maximum correlation goal, and change to Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- IDeA(
  datasetframe_train,
  type="LM",
  method="spearman"))

user system elapsed 0.08 0.00 0.08


# Change the maximum correlation goal, and set Spearman correlation with robust liner model
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- IDeA(
  datasetframe_train,
  type="RLM",
  method="spearman"))

user system elapsed 0.17 0.00 0.17



# The following are for outcome driven learning

# Set the target class for association learning
decortype[[6]] <- "Driven_Default"
system.time(datasetframeDecor[[6]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome))

user system elapsed 0.17 0.00 0.17



# Change the maximum correlation goal
decortype[[7]] <- "Driven_AtThr"
system.time(datasetframeDecor[[7]] <- IDeA(
  datasetframe_train,
  thr = correlationThreshold,
  Outcome=Outcome))

user system elapsed 0.22 0.01 0.17


# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[8]] <- "Driven_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="pearson"))

user system elapsed 0.27 0.02 0.26


# Change the maximum correlation goal, and change to Spearman correlation
decortype[[9]] <- "Driven_LM_Spearman"
system.time(datasetframeDecor[[9]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="LM",
  method="spearman"))

user system elapsed 0.20 0.02 0.22


# Change the maximum correlation goal, and set to Spearman correlation with robust liner model
decortype[[10]] <- "Driven_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  type="RLM",
  method="spearman"))

user system elapsed 0.33 0.00 0.33



# With user defined basis 
decortype[[11]] <- "Driven_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
                        Outcome=Outcome,
                        pvalue=0.20,
                        limit=0,
                        thr=correlationThreshold))

system.time(datasetframeDecor[[11]] <- IDeA(
  datasetframe_train,
  Outcome=Outcome,
  baseFeatures=baseKS,
  type="RLM",
  method="spearman"))

user system elapsed 0.31 0.01 0.33

Machine Learning and UPSTM

Train a simple NB model on the raw data set

mNBRaw <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
                   datasetframe_train,
                   fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=TRUE,
                     normalize=FALSE
                   )

Training using the decorrelated data

mNBDecor <- filteredFit(paste(Outcome,"~."),
                   decorrelatedDF,
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

Selected Raw Features


vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=45,
               xlab="Raw Features",
               ylab="Samples"
               )

Selected decorrelated Features


vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- decorrelatedDF;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
               data=dta,
               Outcome=Outcome,
               hCluster="col",
               srtCol=35,
               xlab="Decorrelated Features",
               ylab="Samples"
               )

To make predictions we need to transform the testing set. This is done using the FRESA.CAD::predictDecorrelate() function


# Transform the testing set
decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)

Once we have the transformed testing dataset we can make a side by side comparison of predictions


# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")

# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")

# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")


par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
  classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
  psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
                                paste("Raw :",theClass),cex=0.75)
  pander::pander(psRaw$aucs)
  psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
                                paste("PCA :",theClass),cex=0.75)
  pander::pander(psPCA$aucs)
  psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                paste("UPSTM :",theClass),cex=0.75)
  pander::pander(psDecor$aucs)
  meanROCAUC <- meanROCAUC + psRaw$aucs;
  meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}

Raw : van PCA : van UPSTM : van Raw : saab PCA : saab UPSTM : saab Raw : bus PCA : bus UPSTM : bus Raw : opel PCA : opel UPSTM : opel

meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);

Training and Prediction on all Decorrelations Sets

par(mfrow=c(4,4))

for (i in c(1:length(datasetframeDecor)))
{
  mNBDecor <- filteredFit(paste(Outcome,"~."),
                   datasetframeDecor[[i]],
                    fitmethod=NAIVE_BAYES,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(pvalue=0.05),
                     Scale="OrderLogit",
                     pca=FALSE
                   )

  decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
  prDecor <- attr(predict(mNBDecor,decor_test),"prob")
  meanROCAUC <- numeric(3);
  for (theClass in classNames)
  {
    classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
    psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
                                  paste(decortype[[i]],theClass,sep=":"),cex=0.5)
    meanROCAUC <- meanROCAUC + psDecor$aucs;
  }
  meanROCAUC <- meanROCAUC/length(classNames)
  pander::pander(meanROCAUC)
  AllRocAUC <- rbind(AllRocAUC,meanROCAUC)

}

Default:van Default:saab Default:bus Default:opel AtThr:van AtThr:saab AtThr:bus AtThr:opel RLM_Pearson:van RLM_Pearson:saab RLM_Pearson:bus RLM_Pearson:opel LM_Spearman:van LM_Spearman:saab LM_Spearman:bus LM_Spearman:opel RLM_Spearman:van RLM_Spearman:saab RLM_Spearman:bus RLM_Spearman:opel Driven_Default:van Driven_Default:saab Driven_Default:bus Driven_Default:opel Driven_AtThr:van Driven_AtThr:saab Driven_AtThr:bus Driven_AtThr:opel Driven_RLM_Pearson:van Driven_RLM_Pearson:saab Driven_RLM_Pearson:bus Driven_RLM_Pearson:opel Driven_LM_Spearman:van Driven_LM_Spearman:saab Driven_LM_Spearman:bus Driven_LM_Spearman:opel Driven_RLM_Spearman:van Driven_RLM_Spearman:saab Driven_RLM_Spearman:bus Driven_RLM_Spearman:opel Driven_KS_RLM_Spearman:van Driven_KS_RLM_Spearman:saab Driven_KS_RLM_Spearman:bus Driven_KS_RLM_Spearman:opel

Final Plot Comparing the ROC AUC of all Options

par(mfrow=c(1,1))

rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)

	est	lower	upper
Raw	0.8504	0.8163	0.8846
PCA	0.8526	0.8162	0.889
Default	0.8852	0.8563	0.9141
AtThr	0.9088	0.8843	0.9331
RLM_Pearson	0.8877	0.8597	0.9158
LM_Spearman	0.8725	0.8443	0.9007
RLM_Spearman	0.8587	0.8272	0.8902
Driven_Default	0.8895	0.8641	0.9149
Driven_AtThr	0.8654	0.8318	0.899
Driven_RLM_Pearson	0.8667	0.8356	0.8977
Driven_LM_Spearman	0.8795	0.8528	0.9063
Driven_RLM_Spearman	0.861	0.8299	0.8921
Driven_KS_RLM_Spearman	0.859	0.8267	0.8912

bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
                          metricname = "ROCAUC",
                          thesets = "ROC AUC",
                          themethod = rownames(AllRocAUC),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
                          col = terrain.colors(nrow(AllRocAUC))
                          )