library(mlbench)
library(FRESA.CAD)
library(umap)
library("e1071")
library(gbm)
library(caret)
library(gplots)
library(ggplot2)
data("Vehicle", package = "mlbench")
data(mtcars)
Some feature distributions by class
p <- ggplot(Vehicle, aes(x = Class, y = Circ, fill = Class)) +
geom_violin(width = 0.8, alpha = 0.6) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) + # Overlay boxplot
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
labs(title = "Vehicle", x = "Class", y = "Circ")
print(p)
p <- ggplot(Vehicle, aes(x = Class, y = Elong, fill = Class)) +
geom_violin(width = 0.8, alpha = 0.6) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) + # Overlay boxplot
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
labs(title = "Vehicle", x = "Class", y = "Elong")
print(p)
Pearson correlation
VehicleMatrix <- as.matrix(Vehicle[,colnames(Vehicle)!="Class"])
rc <- rainbow(length(unique(Vehicle$Class)))
rc <- rc[as.integer(Vehicle$Class)]
cormat <- cor(VehicleMatrix)
gplots::heatmap.2(cormat,
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "Vehicle Correlation (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
ILAAData <- ILAA(as.data.frame(VehicleMatrix),thr=0.50)
LatentFormulas <- getLatentCoefficients(ILAAData)
charFormulas <- attr(LatentFormulas,"LatentCharFormulas")
varRatio <- attr(ILAAData,"VarRatio")
fullFrame <- as.data.frame(cbind(Formula=charFormulas,R2=round(1.0-varRatio[names(charFormulas)],3)))
pander::pander(fullFrame)
| | Formula | R2 |
|---|---|---|
| La_Comp | + Comp - (0.201)Scat.Ra | 0.661 |
| La_Circ | + Circ - (0.160)Scat.Ra | 0.74 |
| La_D.Circ | + D.Circ - (0.430)Scat.Ra - (0.480)Holl.Ra | 0.875 |
| La_Rad.Ra | + Rad.Ra + (0.333)Scat.Ra - (1.198)Sc.Var.Maxis - (1.713)Holl.Ra | 0.809 |
| La_Pr.Axis.Ra | - (0.304)Rad.Ra + Pr.Axis.Ra + (0.072)Scat.Ra + (0.172)Sc.Var.Maxis - (0.587)Skew.Maxis - (0.217)Holl.Ra | 0.883 |
| La_Max.L.Ra | + Max.L.Ra + (0.061)Scat.Ra - (0.067)Sc.Var.Maxis - (0.625)Skew.Maxis - (0.507)Holl.Ra | 0.491 |
| La_Elong | + (0.652)Scat.Ra + Elong - (0.080)Sc.Var.maxis | 0.971 |
| La_Pr.Axis.Rect | - (0.077)Scat.Ra + Pr.Axis.Rect | 0.984 |
| La_Max.L.Rect | - (2.272)Circ + Max.L.Rect | 0.933 |
| La_Sc.Var.Maxis | - (0.899)Scat.Ra + Sc.Var.Maxis | 0.906 |
| La_Sc.Var.maxis | - (5.295)Scat.Ra + Sc.Var.maxis | 0.993 |
| La_Ra.Gyr | - (4.938)Circ + Ra.Gyr | 0.876 |
| La_Skew.Maxis | + (0.263)Scat.Ra - (0.293)Sc.Var.Maxis + Skew.Maxis + (0.811)Holl.Ra | 0.791 |
| La_Kurt.Maxis | + (0.791)Scat.Ra + (1.214)Elong - (0.097)Sc.Var.maxis + Kurt.Maxis - (0.741)Holl.Ra | 0.867 |
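A quick sanity check is to rebuild one latent variable from its coefficient vector and compare it with the corresponding ILAA column. This is a minimal sketch, assuming getLatentCoefficients() returns coefficient vectors named by the original features, with list names matching the La_ columns:
cf <- LatentFormulas[[1]] # coefficients of the first latent variable
lat <- names(LatentFormulas)[1] # assumed to match a La_ column of ILAAData
rebuilt <- as.matrix(Vehicle[, names(cf)]) %*% cf
summary(as.numeric(rebuilt) - ILAAData[, lat]) # near zero if the assumptions hold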
gplots::heatmap.2(as.matrix(ILAAData),
trace = "none",
scale = "col",
mar = c(10,10),
col=rev(heat.colors(5)),
RowSideColors=rc,
main = "Vehicle ILAA",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="Z-Value",
xlab="Feature", ylab="Sample")
cormat <- cor(ILAAData)
gplots::heatmap.2(abs(cormat),
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "Vehicle ILAA Correlation (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
PCA Transform
pcaobj <- prcomp(VehicleMatrix,center = TRUE, scale.= TRUE,tol=0.0);
pcapred <- predict(pcaobj,VehicleMatrix);
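The proportion of variance captured by each component follows directly from the component standard deviations; a short check:
varExp <- pcaobj$sdev^2 / sum(pcaobj$sdev^2) # variance explained per component
round(cumsum(varExp), 3) # cumulative proportion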
gplots::heatmap.2(as.matrix(pcapred),
trace = "none",
scale = "col",
mar = c(10,10),
col=rev(heat.colors(5)),
RowSideColors=rc,
main = "Vehicle PCA",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="Z-Value",
xlab="Feature", ylab="Sample")
cormat <- cor(pcapred)
gplots::heatmap.2(cormat,
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "PCA (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
label <- as.character(Vehicle$Class)
# Extract the first two principal-component scores
pca_df <- data.frame(
P1 = pcapred[,1],
P2 = pcapred[,2],
Label = label # Add grouping variable (vehicle class)
)
ggplot(pca_df, aes(x = P1, y = P2, color = Label)) +
geom_point(size = 3, alpha = 0.8) + # Adjust point size/transparency
labs(
title = "PCA Projection of Numeric Origen",
x = "PC_1",
y = "PC_2",
color = "Car"
) +
theme_minimal() + # Clean background
scale_color_brewer(palette = "Set1") # Use a nice color palette
VehicleScaled <- scale(VehicleMatrix)
umap_result <- umap(VehicleScaled, n_neighbors = 15, min_dist = 0.1,n_components=3)
label <- as.character(Vehicle$Class)
# Extract UMAP coordinates
umap_df <- data.frame(
UMAP1 = umap_result$layout[, 1],
UMAP2 = umap_result$layout[, 2],
UMAP3 = umap_result$layout[, 3],
Label = label # Add grouping variable (vehicle class)
)
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Label)) +
geom_point(size = 3, alpha = 0.8) + # Adjust point size/transparency
labs(
title = "UMAP Projection of Numeric Origen",
x = "UMAP Dimension 1",
y = "UMAP Dimension 2",
color = "Car"
) +
theme_minimal() + # Clean background
scale_color_brewer(palette = "Set1") # Use a nice color palette
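UMAP is stochastic, so repeated runs give slightly different layouts. The CRAN umap package accepts a configuration object; a sketch that fixes the random state for reproducibility (umap_repro is an illustrative name):
custom.config <- umap.defaults # package-provided default settings
custom.config$random_state <- 123 # fix the seed of the embedding
custom.config$n_neighbors <- 15
custom.config$min_dist <- 0.1
custom.config$n_components <- 3
umap_repro <- umap(VehicleScaled, config = custom.config)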
Create Train and Test sets
set.seed(123)
trainSample <- sample(nrow(Vehicle),0.7*nrow(Vehicle))
TrainSet <- Vehicle[trainSample,]
TestSet <- Vehicle[-trainSample,]
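The simple random split above does not preserve the class proportions. A class-stratified alternative with caret (already loaded) is sketched below; TrainSet2/TestSet2 are illustrative names, and the rest of the notebook keeps the simple split:
set.seed(123)
inTrain <- caret::createDataPartition(Vehicle$Class, p = 0.7, list = FALSE)
TrainSet2 <- Vehicle[inTrain, ] # stratified 70% training set
TestSet2 <- Vehicle[-inTrain, ] # stratified 30% test set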
KNN
KNNML <- KNN_method(Class~.,TrainSet,scaleMethod="None")
testPred <- predict(KNNML,TestSet)
rawtestPred <- testPred
accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)
0.6181
pander::pander(table(testPred,TestSet$Class))
| | bus | opel | saab | van |
|---|---|---|---|---|
| bus | 46 | 10 | 5 | 8 |
| opel | 9 | 21 | 20 | 0 |
| saab | 6 | 22 | 37 | 0 |
| van | 0 | 9 | 8 | 53 |
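As a cross-check, caret's confusionMatrix() reports the same table together with per-class sensitivity and specificity; the factor() call is only a guard in case the KNN predictions are returned as characters:
caret::confusionMatrix(factor(testPred, levels = levels(TestSet$Class)),
TestSet$Class)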
prob <- attr(testPred,"prob") # KNN vote share for the predicted label
# One-vs-rest evaluation: turn the vote share into a score for each class and
# compute binary prediction statistics; pstat keeps the last class evaluated
for (class in unique(TestSet$Class))
{
predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}
pander::pander(pstat$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7008 | 0.6457 | 0.7559 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.5672 | 0.4444 | 0.6842 |
speci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7449 | 0.6813 | 0.803 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.6551 | 0.5878 | 0.7216 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3449 | 0.2784 | 0.4122 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4167 | 0.3132 | 0.5206 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4789 | 0.377 | 0.5714 |
KNN with filters
KNNML <- filteredFit(Class~.,TrainSet,
filtermethod=univariate_KS,
filtermethod.control=list(limit=5,pvalue=0.001),
# Scale="LRankInv",
# Transf="ILAA",
fitmethod=KNN_method,
# scale="None"
)
## <FS
## ><Fit>
pander::pander(KNNML$usedFeatures)
Class, Max.L.Ra, Holl.Ra, Skew.Maxis, D.Circ and Sc.Var.maxis
testPred <- predict(KNNML,TestSet)
prob <- attr(testPred,"prob") # KNN vote share for the predicted label
testPredFilter <- testPred
accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)
0.6614
pander::pander(table(testPred,TestSet$Class))
| | bus | opel | saab | van |
|---|---|---|---|---|
| bus | 57 | 6 | 5 | 3 |
| opel | 2 | 21 | 19 | 2 |
| saab | 0 | 28 | 37 | 3 |
| van | 2 | 7 | 9 | 53 |
for (class in unique(TestSet$Class))
{
predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}
pander::pander(pstat$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.6732 | 0.6142 | 0.7283 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4179 | 0.3 | 0.5455 |
speci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7562 | 0.6927 | 0.8112 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.5871 | 0.5189 | 0.6547 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4129 | 0.3453 | 0.4811 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3571 | 0.2499 | 0.4697 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3836 | 0.2797 | 0.4882 |
To compare the two classifiers on the same test set, pair their per-sample outcomes in a 2x2 table:
| | Model 1 correct | Model 1 wrong |
|---|---|---|
| Model 2 correct | \(n_{00}\) | \(n_{01}\) |
| Model 2 wrong | \(n_{10}\) | \(n_{11}\) |
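With the continuity correction used below, McNemar's statistic depends only on the discordant counts:
\[
\chi^2 = \frac{\left(|n_{01}-n_{10}|-1\right)^2}{n_{01}+n_{10}}, \qquad df = 1.
\]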
correct_1_knn <- 1*(rawtestPred == TestSet$Class)
sum(correct_1_knn)/length(correct_1_knn)
## [1] 0.6181102
correct_2_knn <- 1*(testPredFilter == TestSet$Class)
sum(correct_2_knn)/length(correct_1_knn)
## [1] 0.6614173
## Not strictly correct: prop.test treats the two accuracies as coming from independent samples (loss of power)
susmatrix <- rbind(c(sum(correct_1_knn),sum(correct_1_knn==0)),c(sum(correct_2_knn),sum(correct_2_knn==0)))
pander::pander(susmatrix)
| | Correct | Wrong |
|---|---|---|
| Model 1 | 157 | 97 |
| Model 2 | 168 | 86 |
pander::pander(prop.test(susmatrix))
| Test statistic | df | P value | Alternative hypothesis | prop 1 | prop 2 |
|---|---|---|---|---|---|
| 0.8541 | 1 | 0.3554 | two.sided | 0.6181 | 0.6614 |
## The correct approach is to use paired samples: we gain power
agreement <- table(correct_1_knn,correct_2_knn)
pander::pander(agreement)
| | 0 | 1 |
|---|---|---|
| 0 | 47 | 50 |
| 1 | 39 | 118 |
ntst <- mcnemar.test(agreement, correct = TRUE)
pander::pander(ntst)
| Test statistic | df | P value |
|---|---|---|
| 1.124 | 1 | 0.2891 |
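Under the null, the discordant pairs are equally likely to favor either model, so a binomial test on them gives an exact McNemar p-value. A minimal sketch, assuming the agreement table keeps the 0/1 dimnames produced by table():
n01 <- agreement["0", "1"] # model 1 wrong, model 2 correct (discordant)
n10 <- agreement["1", "0"] # model 1 correct, model 2 wrong (discordant)
binom.test(n01, n01 + n10, p = 0.5) # exact McNemar test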
# Percentile bootstrap of the mean: returns B resampled means; the 2.5% and
# 97.5% quantiles are taken downstream as the confidence interval
bootstrap_ci <- function(x, B = 10000) {
boot_means <- replicate(B, mean(sample(x, replace = TRUE)))
return(boot_means)
}
acc1 <- bootstrap_ci(correct_1_knn)
ci_regr <- quantile(acc1, c(0.025, 0.975))
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f]\n",
mean(correct_1_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.6181 95% CI = [0.5591, 0.6772]
acc2 <- bootstrap_ci(correct_2_knn)
ci_regr <- quantile(acc2, c(0.025, 0.975))
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f]\n",
mean(correct_2_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.6614 95% CI = [0.6024, 0.7205]
boxplot(cbind(M1=acc1,M2=acc2),notch = TRUE,main="Accuracy Distribution")
ggplot(data.frame(acc=c(acc1,acc2),Model=as.factor(c(rep(1,length(acc1)),rep(2,length(acc2))))),
aes(x=acc,group=Model,fill = Model)) +
geom_density(color="#e9ecef", alpha=0.8) +
labs(title = "Model Accuracy", x = "Accuracy")
deltaCorrect <- correct_2_knn - correct_1_knn
sum(deltaCorrect)
## [1] 11
bootmeans <- bootstrap_ci(deltaCorrect)
ci_regr <- quantile(bootmeans, c(0.025, 0.975))
pNozero <- 2*sum(bootmeans<=0)/length(bootmeans) # two-tailed bootstrap p-value (observed Δ is positive)
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f], p=%.4f\n",
mean(deltaCorrect), ci_regr[1], ci_regr[2],pNozero))
## Mean Δ = 0.0433 95% CI = [-0.0276, 0.1142], p=0.2590
ggplot(data.frame(bootmeans), aes(x=bootmeans)) +
geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8) +
labs(title = "Paired Success", x = "Correct_2 - Correct_1")