This document describes the use of the FRESA.CAD::IDeA() function that runs the feature Decorrelation analysis (IDeA) algorithm.
This demo uses FRESA.CAD and mlbench R packages:
knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE,comment = "#>")
library("FRESA.CAD")
## Loading required package: Rcpp
## Loading required package: stringr
## Loading required package: miscTools
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(mlbench)
op <- par(no.readonly = TRUE)
I’ll load the vehicle data set
data("Vehicle", package = "mlbench")
print(table(Vehicle$Class))
#>
#> bus opel saab van
#> 218 212 217 199
Setting some variables for downstream analysis
studyName = "Vehicle"
datasetframe <- Vehicle
Outcome <- "Class"
# The fractions of samples to be use in the training set
trainFraction = 0.5
# The correlation threshold
correlationThreshold = 0.6
Setting the Training and Testing sets
tb <- table(datasetframe[,Outcome])
classNames <- unique(datasetframe[,Outcome])
allrowClass <- datasetframe[,Outcome]
names(allrowClass) <- rownames(datasetframe)
trainsize <- trainFraction*min(tb);
trainSamples <- NULL;
for (theClass in classNames)
{
classSample <- allrowClass[allrowClass == theClass]
trainSamples <- c(trainSamples,names(classSample[sample(length(classSample),trainsize)]))
}
datasetframe_train <- datasetframe[trainSamples,]
testSamples <- !(rownames(datasetframe) %in% trainSamples)
datasetframe_test <- datasetframe[testSamples,]
outcomes <- datasetframe_train[,Outcome]
pander::pander(table(datasetframe[,Outcome]),caption="All")
| bus | opel | saab | van |
|---|---|---|---|
| 218 | 212 | 217 | 199 |
pander::pander(table(datasetframe_train[,Outcome]),caption="Training")
| bus | opel | saab | van |
|---|---|---|---|
| 99 | 99 | 99 | 99 |
pander::pander(table(datasetframe_test[,Outcome]),caption="Testing")
| bus | opel | saab | van |
|---|---|---|---|
| 119 | 113 | 118 | 100 |
The default parameters will compute the transformation matrix with a maximum correlation goal of 0.8 using fast matrix multiplication with Pearson correlation and linear models estimation.
Default Parameters: thr=0.80,method=“fast”,type=“LM”
system.time(decorrelatedDF <- IDeA(datasetframe_train))
user system elapsed 0.04 0.02 0.06 ### Returned Data Frame Specifications
The UPSTMDecorrelation function returns a data frame with the following specifications for the column names:
The output basis after transformation will be named after the original names:
## The unaltered features:
## 1) Factors or
## 2) features with not enough unique values or
## 3) Uncorrelated to any other feature or
## 4) the features were keep as basis pivot
pander::pander(colnames(decorrelatedDF)[colnames(decorrelatedDF) %in% colnames(datasetframe_train)])
Rad.Ra, Pr.Axis.Ra, Max.L.Ra, Scat.Ra, Skew.Maxis, Skew.maxis, Kurt.maxis, Kurt.Maxis and Class
## The discovered latent features. i.e. Decorrelated by the IDeA
pander::pander(colnames(decorrelatedDF)[str_detect(colnames(decorrelatedDF),"La_")])
La_Comp, La_Circ, La_D.Circ, La_Elong, La_Pr.Axis.Rect, La_Max.L.Rect, La_Sc.Var.Maxis, La_Sc.Var.maxis, La_Ra.Gyr and La_Holl.Ra
The returned decorrelated object can be analyzed by the getLatentCoefficients() function to return the set of discovered latent variables with their corresponding formula.
latentlist <- getLatentCoefficients(decorrelatedDF)
pander::pander(latentlist)
La_Comp:
| Comp | Scat.Ra |
|---|---|
| 1 | -0.2095 |
La_Circ:
| Circ | Scat.Ra |
|---|---|
| 1 | -0.1542 |
La_D.Circ:
| D.Circ | Scat.Ra |
|---|---|
| 1 | -0.4312 |
La_Elong:
| Scat.Ra | Elong |
|---|---|
| 0.2323 | 1 |
La_Pr.Axis.Rect:
| Scat.Ra | Pr.Axis.Rect |
|---|---|
| -0.07678 | 1 |
La_Max.L.Rect:
| Circ | Max.L.Rect |
|---|---|
| -2.28 | 1 |
La_Sc.Var.Maxis:
| Scat.Ra | Sc.Var.Maxis |
|---|---|
| -0.9082 | 1 |
La_Sc.Var.maxis:
| Scat.Ra | Sc.Var.maxis |
|---|---|
| -5.282 | 1 |
La_Ra.Gyr:
| Circ | Ra.Gyr |
|---|---|
| -4.93 | 1 |
La_Holl.Ra:
| Kurt.Maxis | Holl.Ra |
|---|---|
| -1.074 | 1 |
Furthermore, the returned data frame will have the following main attributes:
Transformation matrix: “UPSTM”
Feature Analysis:
“TotalAdjustments”
“fsocre”
The “UPSTM“ attribute stores the spatial transformation matrix. The matrix only includes continuous features that had some correlation greater than the user specified threshold
## The Spatial Transformation Matrix:
UPSTM <- attr(decorrelatedDF,"UPSTM")
## The heatmap of the matrix
gplots::heatmap.2(1*(abs(UPSTM) > 0),
trace = "none",
mar = c(10,10),
col=rev(heat.colors(2)),
main = paste("UPSTM Matrix:",studyName),
cexRow = 0.7,
cexCol = 0.7,
breaks = c(0,0.5,1),
key.title=NA,
key.xlab="|beta| > 0",
xlab="UPSTM Feature", ylab="Input Feature")
pander::pander(t(colnames(UPSTM)),caption="New names of decorrelated matrix")
| La_Comp | La_Circ | La_D.Circ | Scat.Ra | La_Elong | La_Pr.Axis.Rect |
| La_Max.L.Rect | La_Sc.Var.Maxis | La_Sc.Var.maxis | La_Ra.Gyr | Kurt.Maxis |
| La_Holl.Ra |
The IDeA analysis of the data features are stored in three attributes: “TotalAdjustments”, and “fscore”.
“TotalAdjustments” returns named vector of with the number of times a variable was decorrelated by the IDeA.
“fscore” : returns a named vector with the total feature score, \(F_j\), of the analyzed features. The score is equal to number of times the features was uses as independent feature minus the “TotalAdjustments”.
pander::pander(t(attr(decorrelatedDF,"TotalAdjustments")),caption="Total Adjustments")
| Comp | Circ | D.Circ | Rad.Ra | Pr.Axis.Ra | Max.L.Ra | Scat.Ra | Elong |
|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
| Pr.Axis.Rect | Max.L.Rect | Sc.Var.Maxis | Sc.Var.maxis | Ra.Gyr | Skew.Maxis |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 1 | 1 | 0 |
| Skew.maxis | Kurt.maxis | Kurt.Maxis | Holl.Ra | Class |
|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 |
pander::pander(t(attr(decorrelatedDF,"LatentVariables")),caption="Latent Varibale Parents")
| Comp | Circ | D.Circ | Elong | Pr.Axis.Rect | Max.L.Rect | Sc.Var.Maxis |
| Sc.Var.maxis | Ra.Gyr | Holl.Ra |
colnames(decorrelatedDF)
[1] “La_Comp” “La_Circ” “La_D.Circ” “Rad.Ra”
[5] “Pr.Axis.Ra” “Max.L.Ra” “Scat.Ra” “La_Elong”
[9] “La_Pr.Axis.Rect” “La_Max.L.Rect” “La_Sc.Var.Maxis”
“La_Sc.Var.maxis” [13] “La_Ra.Gyr” “Skew.Maxis” “Skew.maxis”
“Kurt.maxis”
[17] “Kurt.Maxis” “La_Holl.Ra” “Class”
pander::pander(t(attr(decorrelatedDF,"fscore")),caption="The Score")
| La_Comp | La_Circ | La_D.Circ | Scat.Ra | La_Elong | La_Pr.Axis.Rect |
|---|---|---|---|---|---|
| -1 | 1 | -1 | 7 | -1 | -1 |
| La_Max.L.Rect | La_Sc.Var.Maxis | La_Sc.Var.maxis | La_Ra.Gyr | Kurt.Maxis |
|---|---|---|---|---|
| -1 | -1 | -1 | -1 | 1 |
| La_Holl.Ra |
|---|
| -1 |
fscore <- attr(decorrelatedDF,"fscore")
fscore <- fscore[order(-fscore)];
barplot(fscore,las=2,cex.names = 0.6)
To decorrelate any new data set use the predictDecorrelate() function
decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)
Here are the heat maps of the correlation matrices before and after decorrelation on the testing set
featnames <- rownames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(datasetframe_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = paste("Raw Correlation:",studyName),
cexRow = 0.75,
cexCol = 0.75,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
featnames <- colnames(attr(decorrelatedDF,"UPSTM"))
cormat <- cor(decor_test[,featnames],method="pearson")
gplots::heatmap.2(abs(cormat),
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = paste("After decorrelation:",studyName),
cexRow = 0.75,
cexCol = 0.75,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Transformed Feature", ylab="Transformed Feature")
The following code are examples of running IDeA() function with options:
datasetframeDecor<-list();
# With default parameters
datasetframeDecor[[1]] <- decorrelatedDF
decortype <- list();
decortype[[1]] <- "Default"
# Change the maximum correlation goal
decortype[[2]] <- "AtThr"
system.time(datasetframeDecor[[2]] <- IDeA(
datasetframe_train,
thr = correlationThreshold
))
user system elapsed 0.03 0.00 0.03
# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[3]] <- "RLM_Pearson"
system.time(datasetframeDecor[[3]] <- IDeA(
datasetframe_train,
type="RLM",
method="pearson"))
user system elapsed 0.20 0.03 0.23
# Change the maximum correlation goal, and change to Spearman correlation
decortype[[4]] <- "LM_Spearman"
system.time(datasetframeDecor[[4]] <- IDeA(
datasetframe_train,
type="LM",
method="spearman"))
user system elapsed 0.08 0.00 0.08
# Change the maximum correlation goal, and set Spearman correlation with robust liner model
decortype[[5]] <- "RLM_Spearman"
system.time(datasetframeDecor[[5]] <- IDeA(
datasetframe_train,
type="RLM",
method="spearman"))
user system elapsed 0.17 0.00 0.17
# The following are for outcome driven learning
# Set the target class for association learning
decortype[[6]] <- "Driven_Default"
system.time(datasetframeDecor[[6]] <- IDeA(
datasetframe_train,
Outcome=Outcome))
user system elapsed 0.17 0.00 0.17
# Change the maximum correlation goal
decortype[[7]] <- "Driven_AtThr"
system.time(datasetframeDecor[[7]] <- IDeA(
datasetframe_train,
thr = correlationThreshold,
Outcome=Outcome))
user system elapsed 0.22 0.01 0.17
# Change the maximum correlation goal, and set to Robust Liner Models
decortype[[8]] <- "Driven_RLM_Pearson"
system.time(datasetframeDecor[[8]] <- IDeA(
datasetframe_train,
Outcome=Outcome,
type="RLM",
method="pearson"))
user system elapsed 0.27 0.02 0.26
# Change the maximum correlation goal, and change to Spearman correlation
decortype[[9]] <- "Driven_LM_Spearman"
system.time(datasetframeDecor[[9]] <- IDeA(
datasetframe_train,
Outcome=Outcome,
type="LM",
method="spearman"))
user system elapsed 0.20 0.02 0.22
# Change the maximum correlation goal, and set to Spearman correlation with robust liner model
decortype[[10]] <- "Driven_RLM_Spearman"
system.time(datasetframeDecor[[10]] <- IDeA(
datasetframe_train,
Outcome=Outcome,
type="RLM",
method="spearman"))
user system elapsed 0.33 0.00 0.33
# With user defined basis
decortype[[11]] <- "Driven_KS_RLM_Spearman"
baseKS <- names(univariate_KS(datasetframe_train,
Outcome=Outcome,
pvalue=0.20,
limit=0,
thr=correlationThreshold))
system.time(datasetframeDecor[[11]] <- IDeA(
datasetframe_train,
Outcome=Outcome,
baseFeatures=baseKS,
type="RLM",
method="spearman"))
user system elapsed 0.31 0.01 0.33
Train a simple NB model on the raw data set
mNBRaw <- filteredFit(paste(Outcome,"~."),
datasetframe_train,
fitmethod=NAIVE_BAYES,
filtermethod=univariate_KS,
filtermethod.control=list(pvalue=0.05),
Scale="OrderLogit",
pca=FALSE
)
# With PCA
mNBPCA <- filteredFit(paste(Outcome,"~."),
datasetframe_train,
fitmethod=NAIVE_BAYES,
filtermethod=univariate_KS,
filtermethod.control=list(pvalue=0.05),
Scale="OrderLogit",
pca=TRUE,
normalize=FALSE
)
Training using the decorrelated data
mNBDecor <- filteredFit(paste(Outcome,"~."),
decorrelatedDF,
fitmethod=NAIVE_BAYES,
filtermethod=univariate_KS,
filtermethod.control=list(pvalue=0.05),
Scale="OrderLogit",
pca=FALSE
)
Selected Raw Features
vnames <- as.data.frame(cbind(mNBRaw$selectedfeatures,mNBRaw$selectedfeatures))
dta <- datasetframe_train;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
data=dta,
Outcome=Outcome,
hCluster="col",
srtCol=45,
xlab="Raw Features",
ylab="Samples"
)
Selected decorrelated Features
vnames <- as.data.frame(cbind(mNBDecor$selectedfeatures,mNBDecor$selectedfeatures))
dta <- decorrelatedDF;
dta <- FRESAScale(dta,method="OrderLogit")$scaledData
dta$Class <- as.numeric(dta$Class)
hm <- heatMaps(variableList=vnames,
data=dta,
Outcome=Outcome,
hCluster="col",
srtCol=35,
xlab="Decorrelated Features",
ylab="Samples"
)
To make predictions we need to transform the testing set. This is done using the FRESA.CAD::predictDecorrelate() function
# Transform the testing set
decor_test <- predictDecorrelate(decorrelatedDF,datasetframe_test)
Once we have the transformed testing dataset we can make a side by side comparison of predictions
# Predict the raw testing set
prRAW <- attr(predict(mNBRaw,datasetframe_test),"prob")
# Predict with PCA
prPCA <- attr(predict(mNBPCA,datasetframe_test),"prob")
# Predict the transformed dataset
prDecor <- attr(predict(mNBDecor,decor_test),"prob")
par(mfrow=c(1,3))
meanROCAUC <- numeric(3);
meanPCAROCAUC <- numeric(3);
for (theClass in classNames)
{
classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
psRaw <- predictionStats_binary(cbind(classoutcomes,prRAW[,theClass]),
paste("Raw :",theClass),cex=0.75)
pander::pander(psRaw$aucs)
psPCA <- predictionStats_binary(cbind(classoutcomes,prPCA[,theClass]),
paste("PCA :",theClass),cex=0.75)
pander::pander(psPCA$aucs)
psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
paste("UPSTM :",theClass),cex=0.75)
pander::pander(psDecor$aucs)
meanROCAUC <- meanROCAUC + psRaw$aucs;
meanPCAROCAUC <- meanPCAROCAUC + psPCA$aucs;
}
Raw : van PCA : van UPSTM : van
Raw
: saab PCA : saab UPSTM : saab
Raw
: bus PCA : bus UPSTM : bus
Raw
: opel PCA : opel UPSTM : opel
meanROCAUC <- meanROCAUC/length(classNames)
AllRocAUC <- meanROCAUC;
meanPCAROCAUC <- meanPCAROCAUC/length(classNames)
AllRocAUC <- rbind(AllRocAUC,meanPCAROCAUC);
par(mfrow=c(4,4))
for (i in c(1:length(datasetframeDecor)))
{
mNBDecor <- filteredFit(paste(Outcome,"~."),
datasetframeDecor[[i]],
fitmethod=NAIVE_BAYES,
filtermethod=univariate_KS,
filtermethod.control=list(pvalue=0.05),
Scale="OrderLogit",
pca=FALSE
)
decor_test <- predictDecorrelate(datasetframeDecor[[i]],datasetframe_test)
prDecor <- attr(predict(mNBDecor,decor_test),"prob")
meanROCAUC <- numeric(3);
for (theClass in classNames)
{
classoutcomes <- 1*(datasetframe_test[,Outcome] == theClass)
psDecor <- predictionStats_binary(cbind(classoutcomes,prDecor[,theClass]),
paste(decortype[[i]],theClass,sep=":"),cex=0.5)
meanROCAUC <- meanROCAUC + psDecor$aucs;
}
meanROCAUC <- meanROCAUC/length(classNames)
pander::pander(meanROCAUC)
AllRocAUC <- rbind(AllRocAUC,meanROCAUC)
}
Default:van Default:saab Default:bus Default:opel AtThr:van
AtThr:saab AtThr:bus AtThr:opel RLM_Pearson:van RLM_Pearson:saab
RLM_Pearson:bus RLM_Pearson:opel LM_Spearman:van LM_Spearman:saab
LM_Spearman:bus LM_Spearman:opel
RLM_Spearman:van
RLM_Spearman:saab RLM_Spearman:bus RLM_Spearman:opel Driven_Default:van
Driven_Default:saab Driven_Default:bus Driven_Default:opel
Driven_AtThr:van Driven_AtThr:saab Driven_AtThr:bus Driven_AtThr:opel
Driven_RLM_Pearson:van Driven_RLM_Pearson:saab Driven_RLM_Pearson:bus
Driven_RLM_Pearson:opel
Driven_LM_Spearman:van
Driven_LM_Spearman:saab Driven_LM_Spearman:bus Driven_LM_Spearman:opel
Driven_RLM_Spearman:van Driven_RLM_Spearman:saab Driven_RLM_Spearman:bus
Driven_RLM_Spearman:opel Driven_KS_RLM_Spearman:van
Driven_KS_RLM_Spearman:saab Driven_KS_RLM_Spearman:bus
Driven_KS_RLM_Spearman:opel
par(mfrow=c(1,1))
rownames(AllRocAUC) <- c("Raw","PCA",unlist(decortype))
pander::pander(AllRocAUC)
| est | lower | upper | |
|---|---|---|---|
| Raw | 0.8504 | 0.8163 | 0.8846 |
| PCA | 0.8526 | 0.8162 | 0.889 |
| Default | 0.8852 | 0.8563 | 0.9141 |
| AtThr | 0.9088 | 0.8843 | 0.9331 |
| RLM_Pearson | 0.8877 | 0.8597 | 0.9158 |
| LM_Spearman | 0.8725 | 0.8443 | 0.9007 |
| RLM_Spearman | 0.8587 | 0.8272 | 0.8902 |
| Driven_Default | 0.8895 | 0.8641 | 0.9149 |
| Driven_AtThr | 0.8654 | 0.8318 | 0.899 |
| Driven_RLM_Pearson | 0.8667 | 0.8356 | 0.8977 |
| Driven_LM_Spearman | 0.8795 | 0.8528 | 0.9063 |
| Driven_RLM_Spearman | 0.861 | 0.8299 | 0.8921 |
| Driven_KS_RLM_Spearman | 0.859 | 0.8267 | 0.8912 |
bpROCAUC <- barPlotCiError(as.matrix(AllRocAUC),
metricname = "ROCAUC",
thesets = "ROC AUC",
themethod = rownames(AllRocAUC),
main = "ROC AUC",
offsets = c(0.5,1),
scoreDirection = ">",
ho=0.5,
args.legend = list(bg = "white",x="bottomright",inset=c(0.0,0),cex=0.75),
col = terrain.colors(nrow(AllRocAUC))
)