library(mlbench)
library(FRESA.CAD)
library(umap)
library(e1071)
library(gbm)
library(caret)
library(gplots)
library(ggplot2)
library(stringr)  # for str_remove_all() used in the network analysis below
data("Vehicle", package = "mlbench")
Distributions of selected features across the four vehicle classes
p <- ggplot(Vehicle, aes(x = Class, y = Circ, fill = Class)) +
geom_violin(width = 0.8, alpha = 0.6) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) + # Overlay boxplot
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
labs(title = "Vehicle", x = "Class", y = "Circ")
print(p)
p <- ggplot(Vehicle, aes(x = Class, y = Elong, fill = Class)) +
geom_violin(width = 0.8, alpha = 0.6) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) + # Overlay boxplot
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
labs(title = "Vehicle", x = "Class", y = "Elong")
print(p)
Pearson correlation between the raw features
VehicleMatrix <- as.matrix(Vehicle[,colnames(Vehicle)!="Class"])
rc <- rainbow(nlevels(Vehicle$Class))
rc <- rc[as.integer(Vehicle$Class)]
cormat <- cor(VehicleMatrix)
gplots::heatmap.2(cormat,
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "Vehicle Correlation (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
ILAA is a statistical method that discovers variable associations and creates a new dataset with controlled correlation (here thresholded at 0.50)
ILAAData <- ILAA(as.data.frame(VehicleMatrix),thr=0.50)
LatentFormulas <- getLatentCoefficients(ILAAData)
charFormulas <- attr(LatentFormulas,"LatentCharFormulas")
varRatio <- attr(ILAAData,"VarRatio")
fullFrame <- as.data.frame(cbind(Formula=charFormulas,R2=round(1.0-varRatio[names(charFormulas)],3)))
pander::pander(fullFrame)
| Latent variable | Formula | R2 |
|---|---|---|
| La_Comp | + Comp - (0.201)Scat.Ra | 0.661 |
| La_Circ | + Circ - (0.160)Scat.Ra | 0.74 |
| La_D.Circ | + D.Circ - (0.430)Scat.Ra - (0.480)Holl.Ra | 0.875 |
| La_Rad.Ra | + Rad.Ra + (0.333)Scat.Ra - (1.198)Sc.Var.Maxis - (1.713)Holl.Ra | 0.809 |
| La_Pr.Axis.Ra | - (0.304)Rad.Ra + Pr.Axis.Ra + (0.072)Scat.Ra + (0.172)Sc.Var.Maxis - (0.587)Skew.Maxis - (0.217)Holl.Ra | 0.883 |
| La_Max.L.Ra | + Max.L.Ra + (0.061)Scat.Ra - (0.067)Sc.Var.Maxis - (0.625)Skew.Maxis - (0.507)Holl.Ra | 0.491 |
| La_Elong | + (0.652)Scat.Ra + Elong - (0.080)Sc.Var.maxis | 0.971 |
| La_Pr.Axis.Rect | - (0.077)Scat.Ra + Pr.Axis.Rect | 0.984 |
| La_Max.L.Rect | - (2.272)Circ + Max.L.Rect | 0.933 |
| La_Sc.Var.Maxis | - (0.899)Scat.Ra + Sc.Var.Maxis | 0.906 |
| La_Sc.Var.maxis | - (5.295)Scat.Ra + Sc.Var.maxis | 0.993 |
| La_Ra.Gyr | - (4.938)Circ + Ra.Gyr | 0.876 |
| La_Skew.Maxis | + (0.263)Scat.Ra - (0.293)Sc.Var.Maxis + Skew.Maxis + (0.811)Holl.Ra | 0.791 |
| La_Kurt.Maxis | + (0.791)Scat.Ra + (1.214)Elong - (0.097)Sc.Var.maxis + Kurt.Maxis - (0.741)Holl.Ra | 0.867 |
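The latent formulas come from the UPLTM attribute of ILAAData, which is reused for the network analysis below. As a sanity check, a latent feature can be rebuilt by hand; this is a sketch assuming the transform is applied as X %*% UPLTM with rows named by the original features:
UPLTM <- attr(ILAAData,"UPLTM")
# Rebuild the latent features and compare one column against the ILAA output
manual <- VehicleMatrix[,rownames(UPLTM)] %*% UPLTM
all.equal(as.numeric(manual[,"La_Comp"]), ILAAData[["La_Comp"]])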
gplots::heatmap.2(as.matrix(ILAAData),
trace = "none",
scale = "col",
mar = c(10,10),
col=rev(heat.colors(5)),
RowSideColors=rc,
main = "Vehicle ILAA",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="Z-Value",
xlab="Feature", ylab="Sample")
cormat <- cor(ILAAData)
gplots::heatmap.2(abs(cormat),
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "Vehicle ILAA Correlation (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
## For network analysis
library(igraph)
transform <- attr(ILAAData,"UPLTM") != 0
colnames(transform) <- str_remove_all(colnames(transform),"La_")
# The weights are proportional to the observed correlation
transform <- abs(transform*cor(VehicleMatrix[,rownames(transform)]))
VertexSize <- attr(ILAAData,"fscore")
names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
# Normalization
VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize))
gr <- graph_from_adjacency_matrix(transform,mode = "directed",
diag = FALSE,weighted=TRUE)
gr$layout <- layout_with_fr
# Any igraph clustering method works here; cluster_optimal maximizes
# modularity exactly (requires igraph compiled with GLPK support)
fc <- cluster_optimal(gr)
plot(fc, gr,
edge.width=2*E(gr)$weight,
edge.arrow.size=0.5,
edge.arrow.width=0.5,
vertex.size=VertexSize,
vertex.label.cex=0.85,
vertex.label.dist=2,
main="Feature Association")
pcaobj <- prcomp(VehicleMatrix, center = TRUE, scale. = TRUE, tol = 0.0)
pcapred <- predict(pcaobj, VehicleMatrix)
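Before projecting, it helps to check how much variance the leading components capture; a minimal sketch using summary():
# Proportion of variance explained by the first five components
summary(pcaobj)$importance[2,1:5]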
gplots::heatmap.2(as.matrix(pcapred),
trace = "none",
scale = "col",
mar = c(10,10),
col=rev(heat.colors(5)),
RowSideColors=rc,
main = "Vehicle PCA",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="Z-Value",
xlab="Feature", ylab="Sample")
cormat <- cor(pcapred)
gplots::heatmap.2(cormat,
trace = "none",
scale = "none",
mar = c(10,10),
col=rev(heat.colors(5)),
main = "PCA (Pearson)",
cexRow = 0.5,
cexCol = 0.5,
key.title=NA,
key.xlab="r",
xlab="Feature", ylab="Feature")
label <- as.character(Vehicle$Class)
# Extract the first two principal components
pca_df <- data.frame(
P1 = pcapred[,1],
P2 = pcapred[,2],
Label = label # Add grouping variable (vehicle class)
)
ggplot(pca_df, aes(x = P1, y = P2, color = Label)) +
geom_point(size = 3, alpha = 0.8) + # Adjust point size/transparency
labs(
title = "PCA Projection of Numeric Vehicle",
x = "PC_1",
y = "PC_2",
color = "Car"
) +
theme_minimal() + # Clean background
scale_color_brewer(palette = "Set1") # Use a nice color palette
UMAP is an alternative method for dimensionality reduction
VehicleScaled <- scale(VehicleMatrix)
umap_result <- umap(VehicleScaled, n_neighbors = 15, min_dist = 0.1,n_components=3)
label <- as.character(Vehicle$Class)
# Extract UMAP coordinates
umap_df <- data.frame(
UMAP1 = umap_result$layout[, 1],
UMAP2 = umap_result$layout[, 2],
UMAP3 = umap_result$layout[, 3],
Label = label # Add grouping variable (vehicle class)
)
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Label)) +
geom_point(size = 3, alpha = 0.8) + # Adjust point size/transparency
labs(
title = "UMAP Projection of Numeric Vehicle",
x = "UMAP Dimension 1",
y = "UMAP Dimension 2",
color = "Car"
) +
theme_minimal() + # Clean background
scale_color_brewer(palette = "Set1") # Use a nice color palette
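Three UMAP components were requested but only the first two are plotted. The third can be inspected the same way; a sketch reusing umap_df:
ggplot(umap_df, aes(x = UMAP1, y = UMAP3, color = Label)) +
geom_point(size = 3, alpha = 0.8) +
labs(title = "UMAP Projection (components 1 and 3)",
x = "UMAP Dimension 1", y = "UMAP Dimension 3", color = "Car") +
theme_minimal() +
scale_color_brewer(palette = "Set1")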
Create train and test sets (70/30 random split)
set.seed(123)
trainSample <- sample(nrow(Vehicle),0.7*nrow(Vehicle))
TrainSet <- Vehicle[trainSample,]
TestSet <- Vehicle[-trainSample,]
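The purely random split does not preserve class proportions; a class-stratified 70/30 split with caret (already loaded) is a common alternative. A minimal sketch, where idx is a hypothetical index variable:
# Stratified split: samples each class proportionally
idx <- caret::createDataPartition(Vehicle$Class, p = 0.7, list = FALSE)
# TrainSet <- Vehicle[idx,]; TestSet <- Vehicle[-idx,]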
Raw KNN (all features, no scaling)
KNNML <- KNN_method(Class~.,TrainSet,scaleMethod="None")
testPred <- predict(KNNML,TestSet)
rawtestPred <- testPred
accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)
0.6181
pander::pander(table(testPred,TestSet$Class))
| Predicted / Actual | bus | opel | saab | van |
|---|---|---|---|---|
| bus | 46 | 10 | 5 | 8 |
| opel | 9 | 21 | 20 | 0 |
| saab | 6 | 22 | 37 | 0 |
| van | 0 | 9 | 8 | 53 |
prob <- attr(testPred,"prob") # probability of the winning class
for (class in unique(TestSet$Class))
{
# One-vs-rest score: prob when "class" was predicted, 1 - prob otherwise
predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}
# Metrics of the last class in the loop. The 95% CIs are: accci = accuracy,
# senci = sensitivity, speci = specificity, aucci = ROC AUC,
# berci = balanced error rate, preci = precision, F1ci = F1 score
pander::pander(pstat$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7008 | 0.6457 | 0.7559 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.5672 | 0.4444 | 0.6842 |
speci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7449 | 0.6813 | 0.803 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.6551 | 0.5878 | 0.7216 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3449 | 0.2784 | 0.4122 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4167 | 0.3132 | 0.5206 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4789 | 0.377 | 0.5714 |
KNN with feature selection and ILAA decorrelation
KNNML <- filteredFit(Class~.,TrainSet,
filtermethod=univariate_KS,
filtermethod.control=list(limit=0,pvalue=0.001),
Scale="LRankInv",
Transf="ILAA",
fitmethod=KNN_method,
scale="None"
)
## <Decor><FS
## ><Scale><Fit>
pander::pander(KNNML$usedFeatures)
Class, La_D.Circ, Max.L.Ra, Holl.Ra, La_Kurt.Maxis, Skew.Maxis, Scat.Ra, La_Comp, La_Sc.Var.Maxis, La_Pr.Axis.Rect, Rad.Ra, La_Max.L.Rect, Pr.Axis.Ra, La_Ra.Gyr, La_Elong, Kurt.maxis, La_Circ, Skew.maxis and La_Sc.Var.maxis
testPred <- predict(KNNML,TestSet)
prob <- attr(testPred,"prob")
testPredFilter <- testPred
accuracy <- sum(testPred==TestSet$Class)/nrow(TestSet)
pander::pander(accuracy)
0.7874
pander::pander(table(testPred,TestSet$Class))
| Predicted / Actual | bus | opel | saab | van |
|---|---|---|---|---|
| bus | 61 | 3 | 3 | 2 |
| opel | 0 | 35 | 17 | 0 |
| saab | 0 | 22 | 47 | 2 |
| van | 0 | 2 | 3 | 57 |
for (class in unique(TestSet$Class))
{
predictions <- cbind(TestSet$Class==class,abs(1*(testPred!=class)-prob))
pstat <- predictionStats_binary(predictions,plotname=as.character(class))
}
pander::pander(pstat$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7402 | 0.6849 | 0.7914 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4821 | 0.3585 | 0.6078 |
speci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.824 | 0.7677 | 0.8756 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.6525 | 0.5864 | 0.7229 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.3475 | 0.2771 | 0.4136 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4667 | 0.3472 | 0.5938 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.4741 | 0.3652 | 0.5785 |
For binary correctness indicators, the two models can be compared with a paired contingency table:
| | M_1 correct | M_1 wrong |
|---|---|---|
| M_2 correct | \(n_{00}\) | \(n_{01}\) |
| M_2 wrong | \(n_{10}\) | \(n_{11}\) |
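Only the discordant counts carry information about which model is better; with continuity correction, McNemar's statistic is
\[
\chi^2 = \frac{(|n_{01}-n_{10}|-1)^2}{n_{01}+n_{10}},
\]
which is referred to a \(\chi^2\) distribution with one degree of freedom.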
correct_1_knn <- 1*(rawtestPred == TestSet$Class)
sum(correct_1_knn)/length(correct_1_knn)
## [1] 0.6181102
correct_2_knn <- 1*(testPredFilter == TestSet$Class)
sum(correct_2_knn)/length(correct_2_knn)
## [1] 0.7874016
# Not strictly correct: prop.test assumes two independent samples, which loses power
susmatrix <- rbind(c(sum(correct_1_knn),sum(correct_1_knn==0)),c(sum(correct_2_knn),sum(correct_2_knn==0)))
pander::pander(susmatrix)
| | correct | wrong |
|---|---|---|
| M_1 | 157 | 97 |
| M_2 | 200 | 54 |
pander::pander(prop.test(susmatrix))
| Test statistic | df | P value | Alternative hypothesis | prop 1 | prop 2 |
|---|---|---|---|---|---|
| 16.62 | 1 | 4.559e-05 * * * | two.sided | 0.6181 | 0.7874 |
# The correct approach is a paired test (McNemar), which gains power
agreement <- table(correct_1_knn,correct_2_knn)
pander::pander(agreement)
| correct_1 / correct_2 | 0 | 1 |
|---|---|---|
| 0 | 33 | 64 |
| 1 | 21 | 136 |
ntst <- mcnemar.test(agreement, correct = TRUE)
pander::pander(ntst)
| Test statistic | df | P value |
|---|---|---|
| 20.75 | 1 | 5.225e-06 * * * |
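As a quick arithmetic check, the statistic can be recomputed from the discordant cells of the agreement table:
# Discordant cells: 64 (M2 correct, M1 wrong) and 21 (M1 correct, M2 wrong)
(abs(64 - 21) - 1)^2 / (64 + 21) # = 42^2/85 = 20.75, matching mcnemar.test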
bootstrap_ci <- function(x, B = 10000) {
# Returns the bootstrap distribution of the mean;
# quantiles of the result give the confidence interval
boot_means <- replicate(B, mean(sample(x, replace = TRUE)))
return(boot_means)
}
acc1 <- bootstrap_ci(correct_1_knn)
ci_regr <- quantile(acc1, c(0.025, 0.975))
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f]\n",
mean(correct_1_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.6181 95% CI = [0.5591, 0.6772]
acc2 <- bootstrap_ci(correct_2_knn)
ci_regr <- quantile(acc2, c(0.025, 0.975))
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f]\n",
mean(correct_2_knn), ci_regr[1], ci_regr[2]))
## Mean Δ = 0.7874 95% CI = [0.7362, 0.8386]
boxplot(cbind(M1=acc1,M2=acc2),notch = TRUE,main="Accuracy Distribution")
ggplot(data.frame(acc=c(acc1,acc2),Model=as.factor(c(rep(1,length(acc1)),rep(2,length(acc2))))),
aes(x=acc,group=Model,fill = Model)) +
geom_density(color="#e9ecef", alpha=0.8) +
labs(title = "Model Accuracy", x = "Accuracy")
deltaCorrect <- correct_2_knn - correct_1_knn
sum(deltaCorrect)
## [1] 43
bootmeans <- bootstrap_ci(deltaCorrect)
ci_regr <- quantile(bootmeans, c(0.025, 0.975))
pNozero <- 2*sum(bootmeans<=0)/length(bootmeans) # two-tailed bootstrap p-value
cat(sprintf("Mean Δ = %.4f 95%% CI = [%.4f, %.4f], p=%.4f\n",
mean(deltaCorrect), ci_regr[1], ci_regr[2],pNozero))
## Mean Δ = 0.1693 95% CI = [0.1024, 0.2362], p=0.0000
ggplot(data.frame(bootmeans), aes(x=bootmeans)) +
geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8) +
labs(title = "Paired Success", x = "Correct_2 - Correct_1")