Demos for the Mathematics for Computer Science

The libraries

library(mlbench)
library(FRESA.CAD)
library(umap)
library("e1071")
library(gbm)
library(caret)
library(gplots)
library(ggplot2)

Vehicle data set

data("Vehicle", package = "mlbench")
data(mtcars)

Distributions

Some distributions

  p <- ggplot(Vehicle, aes(x = Class, y = Circ, fill = Class)) +
    geom_violin(width = 0.8, alpha = 0.6) +
    geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA)  +  # Overlay boxplot
    stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
    labs(title = "Vehicle", x = "Class", y = "Circ")
  print(p)

    p <- ggplot(Vehicle, aes(x = Class, y = Elong, fill = Class)) +
    geom_violin(width = 0.8, alpha = 0.6) +
    geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA)  +  # Overlay boxplot
    stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
    labs(title = "Vehicle", x = "Class", y = "Elong")
  print(p)

Check Correlation

Pearson

VehicleMatrix <- as.matrix(Vehicle[,colnames(Vehicle)!="Class"])
rc <- rainbow(unique(Vehicle$Class))
rc <- rc[as.integer(Vehicle$Class)]

cormat <- cor(VehicleMatrix)
gplots::heatmap.2(cormat,
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = "Vehicle Correlation (Pearson)",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  key.title=NA,
                  key.xlab="r",
                  xlab="Feature", ylab="Feature")

ILAA

ILAA is a statistical method that discovers variable associations and creates a new dataset with controled correlation

ILAAData <- ILAA(as.data.frame(VehicleMatrix),thr=0.50)

LatentFormulas <- getLatentCoefficients(ILAAData)
charFormulas <- attr(LatentFormulas,"LatentCharFormulas")
varRatio <- attr(ILAAData,"VarRatio")
fullFrame <- as.data.frame(cbind(Formula=charFormulas,R2=round(1.0-varRatio[names(charFormulas)],3)))
pander::pander(fullFrame)
  Formula R2
La_Comp + Comp - (0.201)Scat.Ra 0.661
La_Circ + Circ - (0.160)Scat.Ra 0.74
La_D.Circ + D.Circ - (0.430)Scat.Ra - (0.480)Holl.Ra 0.875
La_Rad.Ra + Rad.Ra + (0.333)Scat.Ra - (1.198)Sc.Var.Maxis - (1.713)Holl.Ra 0.809
La_Pr.Axis.Ra - (0.304)Rad.Ra + Pr.Axis.Ra + (0.072)Scat.Ra + (0.172)Sc.Var.Maxis - (0.587)Skew.Maxis - (0.217)Holl.Ra 0.883
La_Max.L.Ra + Max.L.Ra + (0.061)Scat.Ra - (0.067)Sc.Var.Maxis - (0.625)Skew.Maxis - (0.507)Holl.Ra 0.491
La_Elong + (0.652)Scat.Ra + Elong - (0.080)Sc.Var.maxis 0.971
La_Pr.Axis.Rect - (0.077)Scat.Ra + Pr.Axis.Rect 0.984
La_Max.L.Rect - (2.272)Circ + Max.L.Rect 0.933
La_Sc.Var.Maxis - (0.899)Scat.Ra + Sc.Var.Maxis 0.906
La_Sc.Var.maxis - (5.295)Scat.Ra + Sc.Var.maxis 0.993
La_Ra.Gyr - (4.938)Circ + Ra.Gyr 0.876
La_Skew.Maxis + (0.263)Scat.Ra - (0.293)Sc.Var.Maxis + Skew.Maxis + (0.811)Holl.Ra 0.791
La_Kurt.Maxis + (0.791)Scat.Ra + (1.214)Elong - (0.097)Sc.Var.maxis + Kurt.Maxis - (0.741)Holl.Ra 0.867
gplots::heatmap.2(as.matrix(ILAAData),
                  trace = "none",
                  scale = "col",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  RowSideColors=rc,
                  main = "Vehicle ILAA",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  key.title=NA,
                  key.xlab="Z-Value",
                  xlab="Feature", ylab="Sample")

cormat <- cor(ILAAData)
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = "Vehicle ILAA Correlation (Pearson)",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  key.title=NA,
                  key.xlab="r",
                  xlab="Feature", ylab="Feature")

Some association graphs

## For network analysis
library(igraph)

transform <- attr(ILAAData,"UPLTM") != 0
colnames(transform) <- str_remove_all(colnames(transform),"La_")
# The weights are proportional to the observed correlation
transform <- abs(transform*cor(VehicleMatrix[,rownames(transform)])) 

VertexSize <- attr(ILAAData,"fscore") 

names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
# Normalization
VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) 


gr <- graph_from_adjacency_matrix(transform,mode = "directed",
                                  diag = FALSE,weighted=TRUE)
gr$layout <- layout_with_fr

# The user can use any cluster method. Here we use the optimal clustering.
fc <- cluster_optimal(gr)
plot(fc, gr,
     edge.width=2*E(gr)$weight,
     edge.arrow.size=0.5,
     edge.arrow.width=0.5,
     vertex.size=VertexSize,
     vertex.label.cex=0.85,
     vertex.label.dist=2,
     main="Feature Association")

PCA Transform

pcaobj <- prcomp(VehicleMatrix,center = TRUE, scale.= TRUE,tol=0.0);
pcapred <- predict(pcaobj,VehicleMatrix);



gplots::heatmap.2(as.matrix(pcapred),
                  trace = "none",
                  scale = "col",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  RowSideColors=rc,
                  main = "Vehicle PCA",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  key.title=NA,
                  key.xlab="Z-Value",
                  xlab="Feature", ylab="Sample")

cormat <- cor(pcapred)
gplots::heatmap.2(cormat,
                  trace = "none",
                  scale = "none",
                  mar = c(10,10),
                  col=rev(heat.colors(5)),
                  main = "PCA (Pearson)",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  key.title=NA,
                  key.xlab="r",
                  xlab="Feature", ylab="Feature")

label <- as.character(Vehicle$Class)
# Extract UMAP coordinates
pca_df <- data.frame(
  P1 = pcapred[,1],
  P2 = pcapred[,2],
  Label = label  # Add grouping variable (e.g., species)
)

ggplot(pca_df, aes(x = P1, y = P2, color = Label)) +
  geom_point(size = 3, alpha = 0.8) +  # Adjust point size/transparency
  labs(
    title = "PCA Projection of Numeric Vehicle",
    x = "PC_1",
    y = "PC_2",
    color = "Car"
  ) +
  theme_minimal() +  # Clean background
  scale_color_brewer(palette = "Set1")  # Use a nice color palette

UMAP

UMAP is alternative method for dimension reduction

VehicleScaled <- scale(VehicleMatrix)


umap_result <- umap(VehicleScaled, n_neighbors = 15, min_dist = 0.1,n_components=3)


label <- as.character(Vehicle$Class)
# Extract UMAP coordinates
umap_df <- data.frame(
  UMAP1 = umap_result$layout[, 1],
  UMAP2 = umap_result$layout[, 2],
  UMAP3 = umap_result$layout[, 3],
  Label = label  # Add grouping variable (e.g., species)
)

ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Label)) +
  geom_point(size = 3, alpha = 0.8) +  # Adjust point size/transparency
  labs(
    title = "UMAP Projection of Numeric Vehicle",
    x = "UMAP Dimension 1",
    y = "UMAP Dimension 2",
    color = "Car"
  ) +
  theme_minimal() +  # Clean background
  scale_color_brewer(palette = "Set1")  # Use a nice color palette

Two Classifiers and Method Comparison

Create Train and Test sets

set.seed(123)
trainSample <- sample(nrow(Vehicle),0.7*nrow(Vehicle))
TrainSet <- Vehicle[trainSample,]
TestSet <- Vehicle[-trainSample,]

Raw KNN

KNNML <- KNN_method(Class~.,TrainSet,scaleMethod="None")

testPred <- predict(KNNML,TestSet)
rawtestPred <- testPred

accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)

0.6181

pander::pander(table(testPred,TestSet$Class))
  bus opel saab van
bus 46 10 5 8
opel 9 21 20 0
saab 6 22 37 0
van 0 9 8 53
prob = attr(testPred,"prob");

for (class in unique(TestSet$Class))
{
  predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
  pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}

pander::pander(pstat$ClassMetrics)

KNN with FS

KNNML <- filteredFit(Class~.,TrainSet,
                     filtermethod=univariate_KS,
                     filtermethod.control=list(limit=0,pvalue=0.001),
                     Scale="LRankInv",
                     Transf="ILAA",
                     fitmethod=KNN_method,
                     scale="None"
)
## <Decor><FS
## ><Scale><Fit>
pander::pander(KNNML$usedFeatures)

Class, La_D.Circ, Max.L.Ra, Holl.Ra, La_Kurt.Maxis, Skew.Maxis, Scat.Ra, La_Comp, La_Sc.Var.Maxis, La_Pr.Axis.Rect, Rad.Ra, La_Max.L.Rect, Pr.Axis.Ra, La_Ra.Gyr, La_Elong, Kurt.maxis, La_Circ, Skew.maxis and La_Sc.Var.maxis

testPred <- predict(KNNML,TestSet)

prob = attr(testPred,"prob");

testPredFilter <- testPred

accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)

pander::pander(accuracy)

0.7874

pander::pander(table(testPred,TestSet$Class))
  bus opel saab van
bus 61 3 3 2
opel 0 35 17 0
saab 0 22 47 2
van 0 2 3 57
for (class in unique(TestSet$Class))
{
  predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
  pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}

pander::pander(pstat$ClassMetrics)

Testing if KNN with FS is superior

For binary predictions:

M_1 correct M_1 wrong
M_2 correct \(n_{00}\) \(n_{01}\)
M_2 wrong \(n_{10}\) \(n_{11}\)
correct_1_knn <- 1*(rawtestPred == TestSet$Class)
sum(correct_1_knn)/length(correct_1_knn)
## [1] 0.6181102
correct_2_knn <- 1*(testPredFilter == TestSet$Class)
sum(correct_2_knn)/length(correct_1_knn)
## [1] 0.7874016
## Not totally correct. It Assumes independent samples (Loss of power)
susmatrix <- rbind(c(sum(correct_1_knn),sum(correct_1_knn==0)),c(sum(correct_2_knn),sum(correct_2_knn==0)))
pander::pander(susmatrix)
157 97
200 54
pander::pander(prop.test(susmatrix))
2-sample test for equality of proportions with continuity correction: susmatrix
Test statistic df P value Alternative hypothesis prop 1 prop 2
16.62 1 4.559e-05 * * * two.sided 0.6181 0.7874
## The correct is use paired samples, we gain power

agreement <- table(correct_1_knn,correct_2_knn)
pander::pander(agreement)
  0 1
0 33 64
1 21 136
ntst <- mcnemar.test(agreement, correct = TRUE)
pander::pander(ntst)
McNemar’s Chi-squared test with continuity correction: agreement
Test statistic df P value
20.75 1 5.225e-06 * * *

Confidence interval using bootstrap

bootstrap_ci <- function(x, B = 10000) {
  boot_means <- replicate(B, mean(sample(x, replace = TRUE)))
  return(boot_means)
}

Accuracy 95%CI

acc1 <- bootstrap_ci(correct_1_knn)
ci_regr <- quantile(acc1, c(0.025, 0.975))

cat(sprintf("Mean Δ = %.4f   95%% CI = [%.4f, %.4f]\n",
            mean(correct_1_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.6181   95% CI = [0.5591, 0.6772]
acc2 <- bootstrap_ci(correct_2_knn)
ci_regr <- quantile(acc2, c(0.025, 0.975))

cat(sprintf("Mean Δ = %.4f   95%% CI = [%.4f, %.4f]\n",
            mean(correct_2_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.7874   95% CI = [0.7362, 0.8386]
boxplot(cbind(M1=acc1,M2=acc2),notch = TRUE,main="Accuracy Distribution")

ggplot(data.frame(acc=c(acc1,acc2),Model=as.factor(c(rep(1,length(acc1)),rep(2,length(acc2))))), 
       aes(x=acc,group=Model,fill = Model)) +
    geom_density(color="#e9ecef", alpha=0.8)  +
    labs(title = "Model Accuracy", x = "Accuracy")

Checking Superiority

deltaCorrect <- correct_2_knn - correct_1_knn
sum(deltaCorrect)
## [1] 43
bootmeans <- bootstrap_ci(deltaCorrect)
ci_regr <- quantile(bootmeans, c(0.025, 0.975))
pNozero <- 2*sum(bootmeans<=0)/length(bootmeans) #Two tailed


cat(sprintf("Mean Δ = %.4f   95%% CI = [%.4f, %.4f], p=%.4f\n",
            mean(deltaCorrect), ci_regr[1], ci_regr[2],pNozero))
## Mean Δ = 0.1693   95% CI = [0.1024, 0.2362], p=0.0000
ggplot(data.frame(bootmeans), aes(x=bootmeans)) +
    geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8)  +
    labs(title = "Paired Success", x = "Correct_2 - Correct_1")