library(mlbench)
library(FRESA.CAD)
library(umap)
library(e1071)
library(gbm)
library(caret)
library(gplots)
library(ggplot2)
library(stringr)  # for str_remove_all() used in the network analysis below
data("Vehicle", package = "mlbench")
Distributions of selected features across the four vehicle classes
p <- ggplot(Vehicle, aes(x = Class, y = Circ, fill = Class)) +
geom_violin(width = 0.8, alpha = 0.6) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) + # Overlay boxplot
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
labs(title = "Vehicle", x = "Class", y = "Circ")
print(p)
p <- ggplot(Vehicle, aes(x = Class, y = Elong, fill = Class)) +
geom_violin(width = 0.8, alpha = 0.6) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) + # Overlay boxplot
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
labs(title = "Vehicle", x = "Class", y = "Elong")
print(p)
Pearson correlation between the raw features
VehicleMatrix <- as.matrix(Vehicle[,colnames(Vehicle)!="Class"])
rc <- rainbow(nlevels(Vehicle$Class))
rc <- rc[as.integer(Vehicle$Class)]
cormat <- cor(VehicleMatrix)
gplots::heatmap.2(cormat,
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "Vehicle Correlation (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
ILAA is a statistical method that discovers variable associations and creates a new dataset with controlled correlation (here thresholded at 0.50)
ILAAData <- ILAA(as.data.frame(VehicleMatrix),thr=0.50)
LatentFormulas <- getLatentCoefficients(ILAAData)
charFormulas <- attr(LatentFormulas,"LatentCharFormulas")
varRatio <- attr(ILAAData,"VarRatio")
fullFrame <- as.data.frame(cbind(Formula=charFormulas,R2=round(1.0-varRatio[names(charFormulas)],3)))
pander::pander(fullFrame)
| Latent variable | Formula | R2 |
|---|---|---|
| La_Comp | + Comp - (0.201)Scat.Ra | 0.661 |
| La_Circ | + Circ - (0.160)Scat.Ra | 0.74 |
| La_D.Circ | + D.Circ - (0.430)Scat.Ra - (0.480)Holl.Ra | 0.875 |
| La_Rad.Ra | + Rad.Ra + (0.333)Scat.Ra - (1.198)Sc.Var.Maxis - (1.713)Holl.Ra | 0.809 |
| La_Pr.Axis.Ra | - (0.304)Rad.Ra + Pr.Axis.Ra + (0.072)Scat.Ra + (0.172)Sc.Var.Maxis - (0.587)Skew.Maxis - (0.217)Holl.Ra | 0.883 |
| La_Max.L.Ra | + Max.L.Ra + (0.061)Scat.Ra - (0.067)Sc.Var.Maxis - (0.625)Skew.Maxis - (0.507)Holl.Ra | 0.491 |
| La_Elong | + (0.652)Scat.Ra + Elong - (0.080)Sc.Var.maxis | 0.971 |
| La_Pr.Axis.Rect | - (0.077)Scat.Ra + Pr.Axis.Rect | 0.984 |
| La_Max.L.Rect | - (2.272)Circ + Max.L.Rect | 0.933 |
| La_Sc.Var.Maxis | - (0.899)Scat.Ra + Sc.Var.Maxis | 0.906 |
| La_Sc.Var.maxis | - (5.295)Scat.Ra + Sc.Var.maxis | 0.993 |
| La_Ra.Gyr | - (4.938)Circ + Ra.Gyr | 0.876 |
| La_Skew.Maxis | + (0.263)Scat.Ra - (0.293)Sc.Var.Maxis + Skew.Maxis + (0.811)Holl.Ra | 0.791 |
| La_Kurt.Maxis | + (0.791)Scat.Ra + (1.214)Elong - (0.097)Sc.Var.maxis + Kurt.Maxis - (0.741)Holl.Ra | 0.867 |
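The latent formulas come from the UPLTM attribute of ILAAData, which is reused for the network analysis below. As a sanity check, a latent feature can be rebuilt by hand; this is a sketch assuming the transform is applied as X %*% UPLTM with rows named by the original features:
UPLTM <- attr(ILAAData,"UPLTM")
# Rebuild the latent features and compare one column against the ILAA output
manual <- VehicleMatrix[,rownames(UPLTM)] %*% UPLTM
all.equal(as.numeric(manual[,"La_Comp"]), ILAAData[["La_Comp"]])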
gplots::heatmap.2(as.matrix(ILAAData),
trace = "none",
scale = "col",
mar = c(10,10),
col=rev(heat.colors(5)),
RowSideColors=rc,
main = "Vehicle ILAA",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="Z-Value",
xlab="Feature", ylab="Sample")
cormat <- cor(ILAAData)
gplots::heatmap.2(abs(cormat),
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "Vehicle ILAA Correlation (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
## For network analysis
library(igraph)
transform <- attr(ILAAData,"UPLTM") != 0
colnames(transform) <- str_remove_all(colnames(transform),"La_")
# The weights are proportional to the observed correlation
transform <- abs(transform*cor(VehicleMatrix[,rownames(transform)]))
VertexSize <- attr(ILAAData,"fscore")
names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
# Normalization
VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize))
gr <- graph_from_adjacency_matrix(transform,mode = "directed",
diag = FALSE,weighted=TRUE)
gr$layout <- layout_with_fr
# Any igraph clustering method works here; cluster_optimal maximizes
# modularity exactly (requires igraph compiled with GLPK support)
fc <- cluster_optimal(gr)
plot(fc, gr,
edge.width=2*E(gr)$weight,
edge.arrow.size=0.5,
edge.arrow.width=0.5,
vertex.size=VertexSize,
vertex.label.cex=0.85,
vertex.label.dist=2,
main="Feature Association")
pcaobj <- prcomp(VehicleMatrix, center = TRUE, scale. = TRUE, tol = 0.0)
pcapred <- predict(pcaobj, VehicleMatrix)
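Before projecting, it helps to check how much variance the leading components capture; a minimal sketch using summary():
# Proportion of variance explained by the first five components
summary(pcaobj)$importance[2,1:5]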
gplots::heatmap.2(as.matrix(pcapred),
trace = "none",
scale = "col",
mar = c(10,10),
col=rev(heat.colors(5)),
RowSideColors=rc,
main = "Vehicle PCA",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="Z-Value",
xlab="Feature", ylab="Sample")
cormat <- cor(pcapred)
gplots::heatmap.2(cormat,
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "PCA (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
label <- as.character(Vehicle$Class)
# Extract the first two principal components
pca_df <- data.frame(
P1 = pcapred[,1],
P2 = pcapred[,2],
Label = label # Add grouping variable (vehicle class)
)
ggplot(pca_df, aes(x = P1, y = P2, color = Label)) +
geom_point(size = 3, alpha = 0.8) + # Adjust point size/transparency
labs(
title = "PCA Projection of Numeric Vehicle",
x = "PC_1",
y = "PC_2",
color = "Car"
) +
theme_minimal() + # Clean background
scale_color_brewer(palette = "Set1") # Use a nice color palette
UMAP is an alternative method for dimensionality reduction
VehicleScaled <- scale(VehicleMatrix)
umap_result <- umap(VehicleScaled, n_neighbors = 15, min_dist = 0.1,n_components=3)
label <- as.character(Vehicle$Class)
# Extract UMAP coordinates
umap_df <- data.frame(
UMAP1 = umap_result$layout[, 1],
UMAP2 = umap_result$layout[, 2],
UMAP3 = umap_result$layout[, 3],
Label = label # Add grouping variable (vehicle class)
)
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Label)) +
geom_point(size = 3, alpha = 0.8) + # Adjust point size/transparency
labs(
title = "UMAP Projection of Numeric Vehicle",
x = "UMAP Dimension 1",
y = "UMAP Dimension 2",
color = "Car"
) +
theme_minimal() + # Clean background
scale_color_brewer(palette = "Set1") # Use a nice color palette
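Three UMAP components were requested but only the first two are plotted. The third can be inspected the same way; a sketch reusing umap_df:
ggplot(umap_df, aes(x = UMAP1, y = UMAP3, color = Label)) +
geom_point(size = 3, alpha = 0.8) +
labs(title = "UMAP Projection (components 1 and 3)",
x = "UMAP Dimension 1", y = "UMAP Dimension 3", color = "Car") +
theme_minimal() +
scale_color_brewer(palette = "Set1")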
Create train and test sets (70/30 random split)
set.seed(123)
trainSample <- sample(nrow(Vehicle),0.7*nrow(Vehicle))
TrainSet <- Vehicle[trainSample,]
TestSet <- Vehicle[-trainSample,]
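The purely random split does not preserve class proportions; a class-stratified 70/30 split with caret (already loaded) is a common alternative. A minimal sketch, where idx is a hypothetical index variable:
# Stratified split: samples each class proportionally
idx <- caret::createDataPartition(Vehicle$Class, p = 0.7, list = FALSE)
# TrainSet <- Vehicle[idx,]; TestSet <- Vehicle[-idx,]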
Raw KNN (all features, no scaling)
KNNML <- KNN_method(Class~.,TrainSet,scaleMethod="None")
testPred <- predict(KNNML,TestSet)
rawtestPred <- testPred
accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)
0.6181
pander::pander(table(testPred,TestSet$Class))
| Predicted / Actual | bus | opel | saab | van |
|---|---|---|---|---|
| bus | 46 | 10 | 5 | 8 |
| opel | 9 | 21 | 20 | 0 |
| saab | 6 | 22 | 37 | 0 |
| van | 0 | 9 | 8 | 53 |
prob <- attr(testPred,"prob") # probability of the winning class
for (class in unique(TestSet$Class))
{
# One-vs-rest score: prob when "class" was predicted, 1 - prob otherwise
predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}
# Metrics of the last class in the loop. The 95% CIs are: accci = accuracy,
# senci = sensitivity, speci = specificity, aucci = ROC AUC,
# berci = balanced error rate, preci = precision, F1ci = F1 score
pander::pander(pstat$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7008 | 0.6457 | 0.7559 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.5672 | 0.4444 | 0.6842 |
speci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7449 | 0.6813 | 0.803 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.6551 | 0.5878 | 0.7216 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3449 | 0.2784 | 0.4122 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4167 | 0.3132 | 0.5206 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4789 | 0.377 | 0.5714 |
KNN with feature selection and ILAA decorrelation
KNNML <- filteredFit(Class~.,TrainSet,
filtermethod=univariate_KS,
filtermethod.control=list(limit=0,pvalue=0.001),
Scale="LRankInv",
Transf="ILAA",
fitmethod=KNN_method,
scale="None"
)
## <Decor><FS
## ><Scale><Fit>
pander::pander(KNNML$usedFeatures)
Class, La_D.Circ, Max.L.Ra, Holl.Ra, La_Kurt.Maxis, Skew.Maxis, Scat.Ra, La_Comp, La_Sc.Var.Maxis, La_Pr.Axis.Rect, Rad.Ra, La_Max.L.Rect, Pr.Axis.Ra, La_Ra.Gyr, La_Elong, Kurt.maxis, La_Circ, Skew.maxis and La_Sc.Var.maxis
testPred <- predict(KNNML,TestSet)
prob <- attr(testPred,"prob")
testPredFilter <- testPred
accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)
0.7874
pander::pander(table(testPred,TestSet$Class))
| Predicted / Actual | bus | opel | saab | van |
|---|---|---|---|---|
| bus | 61 | 3 | 3 | 2 |
| opel | 0 | 35 | 17 | 0 |
| saab | 0 | 22 | 47 | 2 |
| van | 0 | 2 | 3 | 57 |
for (class in unique(TestSet$Class))
{
predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}
pander::pander(pstat$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7402 | 0.6849 | 0.7914 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4821 | 0.3585 | 0.6078 |
speci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.824 | 0.7677 | 0.8756 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.6525 | 0.5864 | 0.7229 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3475 | 0.2771 | 0.4136 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4667 | 0.3472 | 0.5938 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4741 | 0.3652 | 0.5785 |
For binary correctness indicators, the two models can be compared with a paired contingency table:
| | M_1 correct | M_1 wrong |
|---|---|---|
| M_2 correct | \(n_{00}\) | \(n_{01}\) |
| M_2 wrong | \(n_{10}\) | \(n_{11}\) |
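Only the discordant counts carry information about which model is better; with continuity correction, McNemar's statistic is
\[
\chi^2 = \frac{(|n_{01}-n_{10}|-1)^2}{n_{01}+n_{10}},
\]
which is referred to a \(\chi^2\) distribution with one degree of freedom.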
correct_1_knn <- 1*(rawtestPred == TestSet$Class)
sum(correct_1_knn)/length(correct_1_knn)
## [1] 0.6181102
correct_2_knn <- 1*(testPredFilter == TestSet$Class)
sum(correct_2_knn)/length(correct_2_knn)
## [1] 0.7874016
# Not strictly correct: prop.test assumes two independent samples, which loses power
susmatrix <- rbind(c(sum(correct_1_knn),sum(correct_1_knn==0)),c(sum(correct_2_knn),sum(correct_2_knn==0)))
pander::pander(susmatrix)
| | correct | wrong |
|---|---|---|
| M_1 | 157 | 97 |
| M_2 | 200 | 54 |
pander::pander(prop.test(susmatrix))
| Test statistic | df | P value | Alternative hypothesis | prop 1 | prop 2 |
|---|---|---|---|---|---|
| 16.62 | 1 | 4.559e-05 * * * | two.sided | 0.6181 | 0.7874 |
# The correct approach is a paired test (McNemar), which gains power
agreement <- table(correct_1_knn,correct_2_knn)
pander::pander(agreement)
| correct_1 / correct_2 | 0 | 1 |
|---|---|---|
| 0 | 33 | 64 |
| 1 | 21 | 136 |
ntst <- mcnemar.test(agreement, correct = TRUE)
pander::pander(ntst)
| Test statistic | df | P value |
|---|---|---|
| 20.75 | 1 | 5.225e-06 * * * |
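As a quick arithmetic check, the statistic can be recomputed from the discordant cells of the agreement table:
# Discordant cells: 64 (M2 correct, M1 wrong) and 21 (M1 correct, M2 wrong)
(abs(64 - 21) - 1)^2 / (64 + 21) # = 42^2/85 = 20.75, matching mcnemar.test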
bootstrap_ci <- function(x, B = 10000) {
# Returns the bootstrap distribution of the mean;
# quantiles of the result give the confidence interval
boot_means <- replicate(B, mean(sample(x, replace = TRUE)))
return(boot_means)
}
acc1 <- bootstrap_ci(correct_1_knn)
ci_regr <- quantile(acc1, c(0.025, 0.975))
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f]\n",
mean(correct_1_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.6181 95% CI = [0.5591, 0.6772]
acc2 <- bootstrap_ci(correct_2_knn)
ci_regr <- quantile(acc2, c(0.025, 0.975))
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f]\n",
mean(correct_2_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.7874 95% CI = [0.7362, 0.8386]
boxplot(cbind(M1=acc1,M2=acc2),notch = TRUE,main="Accuracy Distribution")
ggplot(data.frame(acc=c(acc1,acc2),Model=as.factor(c(rep(1,length(acc1)),rep(2,length(acc2))))),
aes(x=acc,group=Model,fill = Model)) +
geom_density(color="#e9ecef", alpha=0.8) +
labs(title = "Model Accuracy", x = "Accuracy")
deltaCorrect <- correct_2_knn - correct_1_knn
sum(deltaCorrect)
## [1] 43
bootmeans <- bootstrap_ci(deltaCorrect)
ci_regr <- quantile(bootmeans, c(0.025, 0.975))
pNozero <- 2*sum(bootmeans<=0)/length(bootmeans) # two-tailed bootstrap p-value
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f], p=%.4f\n",
mean(deltaCorrect), ci_regr[1], ci_regr[2],pNozero))
## Mean Δ = 0.1693 95% CI = [0.1024, 0.2362], p=0.0000
ggplot(data.frame(bootmeans), aes(x=bootmeans)) +
geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8) +
labs(title = "Paired Success", x = "Correct_2 - Correct_1")