library(mlbench)
library(FRESA.CAD)
library(umap)
library("e1071")
library(gbm)
library(caret)
library(gplots)
library(ggplot2)
data("Vehicle", package = "mlbench")
data(mtcars)
Some feature distributions by class
p <- ggplot(Vehicle, aes(x = Class, y = Circ, fill = Class)) +
geom_violin(width = 0.8, alpha = 0.6) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) + # Overlay boxplot
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
labs(title = "Vehicle", x = "Class", y = "Circ")
print(p)
p <- ggplot(Vehicle, aes(x = Class, y = Elong, fill = Class)) +
geom_violin(width = 0.8, alpha = 0.6) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) + # Overlay boxplot
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
labs(title = "Vehicle", x = "Class", y = "Elong")
print(p)
Pearson correlation
VehicleMatrix <- as.matrix(Vehicle[,colnames(Vehicle)!="Class"])
rc <- rainbow(length(unique(Vehicle$Class)))
rc <- rc[as.integer(Vehicle$Class)]
cormat <- cor(VehicleMatrix)
gplots::heatmap.2(cormat,
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "Vehicle Correlation (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
ILAAData <- ILAA(as.data.frame(VehicleMatrix),thr=0.50)
LatentFormulas <- getLatentCoefficients(ILAAData)
charFormulas <- attr(LatentFormulas,"LatentCharFormulas")
varRatio <- attr(ILAAData,"VarRatio")
fullFrame <- as.data.frame(cbind(Formula=charFormulas,R2=round(1.0-varRatio[names(charFormulas)],3)))
pander::pander(fullFrame)
| | Formula | R2 |
|---|---|---|
| La_Comp | + Comp - (0.201)Scat.Ra | 0.661 |
| La_Circ | + Circ - (0.160)Scat.Ra | 0.74 |
| La_D.Circ | + D.Circ - (0.430)Scat.Ra - (0.480)Holl.Ra | 0.875 |
| La_Rad.Ra | + Rad.Ra + (0.333)Scat.Ra - (1.198)Sc.Var.Maxis - (1.713)Holl.Ra | 0.809 |
| La_Pr.Axis.Ra | - (0.304)Rad.Ra + Pr.Axis.Ra + (0.072)Scat.Ra + (0.172)Sc.Var.Maxis - (0.587)Skew.Maxis - (0.217)Holl.Ra | 0.883 |
| La_Max.L.Ra | + Max.L.Ra + (0.061)Scat.Ra - (0.067)Sc.Var.Maxis - (0.625)Skew.Maxis - (0.507)Holl.Ra | 0.491 |
| La_Elong | + (0.652)Scat.Ra + Elong - (0.080)Sc.Var.maxis | 0.971 |
| La_Pr.Axis.Rect | - (0.077)Scat.Ra + Pr.Axis.Rect | 0.984 |
| La_Max.L.Rect | - (2.272)Circ + Max.L.Rect | 0.933 |
| La_Sc.Var.Maxis | - (0.899)Scat.Ra + Sc.Var.Maxis | 0.906 |
| La_Sc.Var.maxis | - (5.295)Scat.Ra + Sc.Var.maxis | 0.993 |
| La_Ra.Gyr | - (4.938)Circ + Ra.Gyr | 0.876 |
| La_Skew.Maxis | + (0.263)Scat.Ra - (0.293)Sc.Var.Maxis + Skew.Maxis + (0.811)Holl.Ra | 0.791 |
| La_Kurt.Maxis | + (0.791)Scat.Ra + (1.214)Elong - (0.097)Sc.Var.maxis + Kurt.Maxis - (0.741)Holl.Ra | 0.867 |
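A quick sanity check is to rebuild one latent variable from its coefficient vector and compare it with the corresponding ILAA column. This is a minimal sketch, assuming getLatentCoefficients() returns coefficient vectors named by the original features, with list names matching the La_ columns:
cf <- LatentFormulas[[1]] # coefficients of the first latent variable
lat <- names(LatentFormulas)[1] # assumed to match a La_ column of ILAAData
rebuilt <- as.matrix(Vehicle[, names(cf)]) %*% cf
summary(as.numeric(rebuilt) - ILAAData[, lat]) # near zero if the assumptions hold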
gplots::heatmap.2(as.matrix(ILAAData),
trace = "none",
scale = "col",
mar = c(10,10),
col=rev(heat.colors(5)),
RowSideColors=rc,
main = "Vehicle ILAA",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="Z-Value",
xlab="Feature", ylab="Sample")
cormat <- cor(ILAAData)
gplots::heatmap.2(abs(cormat),
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "Vehicle ILAA Correlation (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
PCA Transform
pcaobj <- prcomp(VehicleMatrix,center = TRUE, scale.= TRUE,tol=0.0);
pcapred <- predict(pcaobj,VehicleMatrix);
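The proportion of variance captured by each component follows directly from the component standard deviations; a short check:
varExp <- pcaobj$sdev^2 / sum(pcaobj$sdev^2) # variance explained per component
round(cumsum(varExp), 3) # cumulative proportion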
gplots::heatmap.2(as.matrix(pcapred),
trace = "none",
scale = "col",
mar = c(10,10),
col=rev(heat.colors(5)),
RowSideColors=rc,
main = "Vehicle PCA",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="Z-Value",
xlab="Feature", ylab="Sample")
cormat <- cor(pcapred)
gplots::heatmap.2(cormat,
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "PCA (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
label <- as.character(Vehicle$Class)
# Extract the first two principal-component scores
pca_df <- data.frame(
P1 = pcapred[,1],
P2 = pcapred[,2],
Label = label # Add grouping variable (vehicle class)
)
ggplot(pca_df, aes(x = P1, y = P2, color = Label)) +
geom_point(size = 3, alpha = 0.8) + # Adjust point size/transparency
labs(
title = "PCA Projection of Numeric Origen",
x = "PC_1",
y = "PC_2",
color = "Car"
) +
theme_minimal() + # Clean background
scale_color_brewer(palette = "Set1") # Use a nice color palette
VehicleScaled <- scale(VehicleMatrix)
umap_result <- umap(VehicleScaled, n_neighbors = 15, min_dist = 0.1,n_components=3)
label <- as.character(Vehicle$Class)
# Extract UMAP coordinates
umap_df <- data.frame(
UMAP1 = umap_result$layout[, 1],
UMAP2 = umap_result$layout[, 2],
UMAP3 = umap_result$layout[, 3],
Label = label # Add grouping variable (vehicle class)
)
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Label)) +
geom_point(size = 3, alpha = 0.8) + # Adjust point size/transparency
labs(
title = "UMAP Projection of Numeric Origen",
x = "UMAP Dimension 1",
y = "UMAP Dimension 2",
color = "Car"
) +
theme_minimal() + # Clean background
scale_color_brewer(palette = "Set1") # Use a nice color palette
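UMAP is stochastic, so repeated runs give slightly different layouts. The CRAN umap package accepts a configuration object; a sketch that fixes the random state for reproducibility (umap_repro is an illustrative name):
custom.config <- umap.defaults # package-provided default settings
custom.config$random_state <- 123 # fix the seed of the embedding
custom.config$n_neighbors <- 15
custom.config$min_dist <- 0.1
custom.config$n_components <- 3
umap_repro <- umap(VehicleScaled, config = custom.config)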
Create Train and Test sets
set.seed(123)
trainSample <- sample(nrow(Vehicle),0.7*nrow(Vehicle))
TrainSet <- Vehicle[trainSample,]
TestSet <- Vehicle[-trainSample,]
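The simple random split above does not preserve the class proportions. A class-stratified alternative with caret (already loaded) is sketched below; TrainSet2/TestSet2 are illustrative names, and the rest of the notebook keeps the simple split:
set.seed(123)
inTrain <- caret::createDataPartition(Vehicle$Class, p = 0.7, list = FALSE)
TrainSet2 <- Vehicle[inTrain, ] # stratified 70% training set
TestSet2 <- Vehicle[-inTrain, ] # stratified 30% test set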
KNN
KNNML <- KNN_method(Class~.,TrainSet,scaleMethod="None")
testPred <- predict(KNNML,TestSet)
rawtestPred <- testPred
accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)
0.6181
pander::pander(table(testPred,TestSet$Class))
| | bus | opel | saab | van |
|---|---|---|---|---|
| bus | 46 | 10 | 5 | 8 |
| opel | 9 | 21 | 20 | 0 |
| saab | 6 | 22 | 37 | 0 |
| van | 0 | 9 | 8 | 53 |
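As a cross-check, caret's confusionMatrix() reports the same table together with per-class sensitivity and specificity; the factor() call is only a guard in case the KNN predictions are returned as characters:
caret::confusionMatrix(factor(testPred, levels = levels(TestSet$Class)),
TestSet$Class)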
prob <- attr(testPred,"prob") # KNN vote share for the predicted label
# One-vs-rest evaluation: turn the vote share into a score for each class and
# compute binary prediction statistics; pstat keeps the last class evaluated
for (class in unique(TestSet$Class))
{
predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}
pander::pander(pstat$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7008 | 0.6457 | 0.7559 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.5672 | 0.4444 | 0.6842 |
speci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7449 | 0.6813 | 0.803 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.6551 | 0.5878 | 0.7216 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3449 | 0.2784 | 0.4122 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4167 | 0.3132 | 0.5206 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4789 | 0.377 | 0.5714 |
KNN with filters
KNNML <- filteredFit(Class~.,TrainSet,
filtermethod=univariate_KS,
filtermethod.control=list(limit=5,pvalue=0.001),
# Scale="LRankInv",
# Transf="ILAA",
fitmethod=KNN_method,
# scale="None"
)
## <FS
## ><Fit>
pander::pander(KNNML$usedFeatures)
Class, Max.L.Ra, Holl.Ra, Skew.Maxis, D.Circ and Sc.Var.maxis
testPred <- predict(KNNML,TestSet)
prob <- attr(testPred,"prob") # KNN vote share for the predicted label
testPredFilter <- testPred
accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)
0.6614
pander::pander(table(testPred,TestSet$Class))
| | bus | opel | saab | van |
|---|---|---|---|---|
| bus | 57 | 6 | 5 | 3 |
| opel | 2 | 21 | 19 | 2 |
| saab | 0 | 28 | 37 | 3 |
| van | 2 | 7 | 9 | 53 |
for (class in unique(TestSet$Class))
{
predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}
pander::pander(pstat$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.6732 | 0.6142 | 0.7283 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4179 | 0.3 | 0.5455 |
speci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7562 | 0.6927 | 0.8112 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.5871 | 0.5189 | 0.6547 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4129 | 0.3453 | 0.4811 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3571 | 0.2499 | 0.4697 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3836 | 0.2797 | 0.4882 |
To compare the two classifiers on the same test set, pair their per-sample outcomes in a 2x2 table:
| | Model 1 correct | Model 1 wrong |
|---|---|---|
| Model 2 correct | \(n_{00}\) | \(n_{01}\) |
| Model 2 wrong | \(n_{10}\) | \(n_{11}\) |
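With the continuity correction used below, McNemar's statistic depends only on the discordant counts:
\[
\chi^2 = \frac{\left(|n_{01}-n_{10}|-1\right)^2}{n_{01}+n_{10}}, \qquad df = 1.
\]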
correct_1_knn <- 1*(rawtestPred == TestSet$Class)
sum(correct_1_knn)/length(correct_1_knn)
## [1] 0.6181102
correct_2_knn <- 1*(testPredFilter == TestSet$Class)
sum(correct_2_knn)/length(correct_1_knn)
## [1] 0.6614173
## Not strictly correct: prop.test treats the two accuracies as coming from independent samples (loss of power)
susmatrix <- rbind(c(sum(correct_1_knn),sum(correct_1_knn==0)),c(sum(correct_2_knn),sum(correct_2_knn==0)))
pander::pander(susmatrix)
| | Correct | Wrong |
|---|---|---|
| Model 1 | 157 | 97 |
| Model 2 | 168 | 86 |
pander::pander(prop.test(susmatrix))
| Test statistic | df | P value | Alternative hypothesis | prop 1 | prop 2 |
|---|---|---|---|---|---|
| 0.8541 | 1 | 0.3554 | two.sided | 0.6181 | 0.6614 |
## The correct approach is to use paired samples: we gain power
agreement <- table(correct_1_knn,correct_2_knn)
pander::pander(agreement)
| | 0 | 1 |
|---|---|---|
| 0 | 47 | 50 |
| 1 | 39 | 118 |
ntst <- mcnemar.test(agreement, correct = TRUE)
pander::pander(ntst)
| Test statistic | df | P value |
|---|---|---|
| 1.124 | 1 | 0.2891 |
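Under the null, the discordant pairs are equally likely to favor either model, so a binomial test on them gives an exact McNemar p-value. A minimal sketch, assuming the agreement table keeps the 0/1 dimnames produced by table():
n01 <- agreement["0", "1"] # model 1 wrong, model 2 correct (discordant)
n10 <- agreement["1", "0"] # model 1 correct, model 2 wrong (discordant)
binom.test(n01, n01 + n10, p = 0.5) # exact McNemar test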
# Percentile bootstrap of the mean: returns B resampled means; the 2.5% and
# 97.5% quantiles are taken downstream as the confidence interval
bootstrap_ci <- function(x, B = 10000) {
boot_means <- replicate(B, mean(sample(x, replace = TRUE)))
return(boot_means)
}
acc1 <- bootstrap_ci(correct_1_knn)
ci_regr <- quantile(acc1, c(0.025, 0.975))
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f]\n",
mean(correct_1_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.6181 95% CI = [0.5591, 0.6772]
acc2 <- bootstrap_ci(correct_2_knn)
ci_regr <- quantile(acc2, c(0.025, 0.975))
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f]\n",
mean(correct_2_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.6614 95% CI = [0.6024, 0.7205]
boxplot(cbind(M1=acc1,M2=acc2),notch = TRUE,main="Accuracy Distribution")
ggplot(data.frame(acc=c(acc1,acc2),Model=as.factor(c(rep(1,length(acc1)),rep(2,length(acc2))))),
aes(x=acc,group=Model,fill = Model)) +
geom_density(color="#e9ecef", alpha=0.8) +
labs(title = "Model Accuracy", x = "Accuracy")
deltaCorrect <- correct_2_knn - correct_1_knn
sum(deltaCorrect)
## [1] 11
bootmeans <- bootstrap_ci(deltaCorrect)
ci_regr <- quantile(bootmeans, c(0.025, 0.975))
pNozero <- 2*sum(bootmeans<=0)/length(bootmeans) # two-tailed bootstrap p-value (observed Δ is positive)
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f], p=%.4f\n",
mean(deltaCorrect), ci_regr[1], ci_regr[2],pNozero))
## Mean Δ = 0.0433 95% CI = [-0.0276, 0.1142], p=0.2590
ggplot(data.frame(bootmeans), aes(x=bootmeans)) +
geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8) +
labs(title = "Paired Success", x = "Correct_2 - Correct_1")