Load Dataset
# Import the Dry Bean measurements from the Excel workbook located in the
# working directory, then preview the first six rows to confirm the import.
Data <- read_excel(path = "Dry_Bean_Dataset.xlsx")
head(Data, n = 6L)
# A tibble: 6 × 17
Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 28395 610. 208. 174. 1.20 0.550
2 28734 638. 201. 183. 1.10 0.412
3 29380 624. 213. 176. 1.21 0.563
4 30008 646. 211. 183. 1.15 0.499
5 30140 620. 202. 190. 1.06 0.334
6 30279 635. 213. 182. 1.17 0.520
# ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
# Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
# ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <chr>
Area Perimeter MajorAxisLength MinorAxisLength
Min. : 20420 Min. : 524.7 Min. :183.6 Min. :122.5
1st Qu.: 36328 1st Qu.: 703.5 1st Qu.:253.3 1st Qu.:175.8
Median : 44652 Median : 794.9 Median :296.9 Median :192.4
Mean : 53048 Mean : 855.3 Mean :320.1 Mean :202.3
3rd Qu.: 61332 3rd Qu.: 977.2 3rd Qu.:376.5 3rd Qu.:217.0
Max. :254616 Max. :1985.4 Max. :738.9 Max. :460.2
AspectRation Eccentricity ConvexArea EquivDiameter
Min. :1.025 Min. :0.2190 Min. : 20684 Min. :161.2
1st Qu.:1.432 1st Qu.:0.7159 1st Qu.: 36715 1st Qu.:215.1
Median :1.551 Median :0.7644 Median : 45178 Median :238.4
Mean :1.583 Mean :0.7509 Mean : 53768 Mean :253.1
3rd Qu.:1.707 3rd Qu.:0.8105 3rd Qu.: 62294 3rd Qu.:279.4
Max. :2.430 Max. :0.9114 Max. :263261 Max. :569.4
Extent Solidity roundness Compactness
Min. :0.5553 Min. :0.9192 Min. :0.4896 Min. :0.6406
1st Qu.:0.7186 1st Qu.:0.9857 1st Qu.:0.8321 1st Qu.:0.7625
Median :0.7599 Median :0.9883 Median :0.8832 Median :0.8013
Mean :0.7497 Mean :0.9871 Mean :0.8733 Mean :0.7999
3rd Qu.:0.7869 3rd Qu.:0.9900 3rd Qu.:0.9169 3rd Qu.:0.8343
Max. :0.8662 Max. :0.9947 Max. :0.9907 Max. :0.9873
ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
Min. :0.002778 Min. :0.0005642 Min. :0.4103 Min. :0.9477
1st Qu.:0.005900 1st Qu.:0.0011535 1st Qu.:0.5814 1st Qu.:0.9937
Median :0.006645 Median :0.0016935 Median :0.6420 Median :0.9964
Mean :0.006564 Mean :0.0017159 Mean :0.6436 Mean :0.9951
3rd Qu.:0.007271 3rd Qu.:0.0021703 3rd Qu.:0.6960 3rd Qu.:0.9979
Max. :0.010451 Max. :0.0036650 Max. :0.9748 Max. :0.9997
Class
Length:13611
Class :character
Mode :character
Preprocessing
Area Perimeter MajorAxisLength MinorAxisLength AspectRation
0 0 0 0 0
Eccentricity ConvexArea EquivDiameter Extent Solidity
0 0 0 0 0
roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
0 0 0 0 0
ShapeFactor4 Class
0 0
[1] 68
# Keep only the first occurrence of each fully identical row.
Data_Clean <- local({
  is_repeat <- duplicated(Data)
  Data[!is_repeat, ]
})
# Sanity check: count the duplicated rows that remain after cleaning.
sum(duplicated(Data_Clean))
[1] 0
tibble [13,543 × 17] (S3: tbl_df/tbl/data.frame)
$ Area : num [1:13543] 28395 28734 29380 30008 30140 ...
$ Perimeter : num [1:13543] 610 638 624 646 620 ...
$ MajorAxisLength: num [1:13543] 208 201 213 211 202 ...
$ MinorAxisLength: num [1:13543] 174 183 176 183 190 ...
$ AspectRation : num [1:13543] 1.2 1.1 1.21 1.15 1.06 ...
$ Eccentricity : num [1:13543] 0.55 0.412 0.563 0.499 0.334 ...
$ ConvexArea : num [1:13543] 28715 29172 29690 30724 30417 ...
$ EquivDiameter : num [1:13543] 190 191 193 195 196 ...
$ Extent : num [1:13543] 0.764 0.784 0.778 0.783 0.773 ...
$ Solidity : num [1:13543] 0.989 0.985 0.99 0.977 0.991 ...
$ roundness : num [1:13543] 0.958 0.887 0.948 0.904 0.985 ...
$ Compactness : num [1:13543] 0.913 0.954 0.909 0.928 0.971 ...
$ ShapeFactor1 : num [1:13543] 0.00733 0.00698 0.00724 0.00702 0.0067 ...
$ ShapeFactor2 : num [1:13543] 0.00315 0.00356 0.00305 0.00321 0.00366 ...
$ ShapeFactor3 : num [1:13543] 0.834 0.91 0.826 0.862 0.942 ...
$ ShapeFactor4 : num [1:13543] 0.999 0.998 0.999 0.994 0.999 ...
$ Class : chr [1:13543] "SEKER" "SEKER" "SEKER" "SEKER" ...
# Convert the Class column from character to factor.
Data_Clean[["Class"]] <- as.factor(Data_Clean[["Class"]])
Exploratory Data Analysis
# Reshape to long format: Class is kept as the identifier and every other
# column is stacked into a (variable, value) pair.
Data_EDA <- melt(Data_Clean, id.vars = "Class")

# One histogram panel per feature, three panels per row; each panel has its
# own axis ranges and the (redundant) fill legend is hidden.
ggplot(data = Data_EDA, mapping = aes(x = value, fill = variable)) +
  facet_wrap(facets = ~variable, ncol = 3, scales = "free") +
  geom_histogram(bins = 30, alpha = 0.85, color = "white") +
  scale_fill_viridis_d(option = "turbo") +
  theme_minimal() +
  theme(legend.position = "none")

# Boxplot of every feature on one shared value axis; the feature names on
# the x axis are rotated by 90 degrees.
ggplot(data = Data_EDA) +
  geom_boxplot(mapping = aes(x = variable, y = value)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))

# Keep the numeric columns only. vapply() guarantees exactly one logical
# per column, whereas sapply() may silently change its return type.
num_data <- Data_Clean[, vapply(Data_Clean, is.numeric, logical(1))]
# Pairwise correlations between the numeric features.
cor_matrix <- cor(num_data)
# Draw the upper triangle as coloured tiles with the coefficients printed
# in black and slightly reduced label text.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  tl.cex = 0.6
)

Normalisasi
# Estimate centring and scaling parameters for every column except Class.
# A logical mask is used rather than -which(...): if no column matched,
# -which() would produce an empty index and select zero columns.
# NOTE(review): the parameters are estimated on all rows, i.e. before the
# train/test split; fit on the training rows only if a strict hold-out is
# required.
preprocessing <- preProcess(
  Data_Clean[, names(Data_Clean) != "Class"],
  method = c("center", "scale")
)
# Apply the transformation to the full table, then preview the result.
scaled_features <- predict(preprocessing, Data_Clean)
data_scaled <- scaled_features
head(data_scaled)
# A tibble: 6 × 17
Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.839 -1.14 -1.30 -0.632 -1.57 -2.18
2 -0.827 -1.01 -1.39 -0.436 -1.97 -3.69
3 -0.805 -1.08 -1.25 -0.587 -1.51 -2.04
4 -0.784 -0.974 -1.27 -0.441 -1.74 -2.74
5 -0.779 -1.09 -1.38 -0.268 -2.12 -4.54
6 -0.775 -1.02 -1.25 -0.463 -1.67 -2.50
# ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
# Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
# ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <fct>
Split Data
# Fix the random seed so the partition can be reproduced.
set.seed(123)
# Sample 80% of the row indices based on the Class column; list = FALSE
# returns the indices directly instead of wrapped in a list.
trainIndex <- createDataPartition(
  y = data_scaled[["Class"]],
  p = 0.8,
  list = FALSE
)
# Sampled rows form the training set; all remaining rows form the test set.
trainData <- data_scaled[trainIndex, ]
testData <- data_scaled[-trainIndex, ]
[1] 10836 17
[1] 2707 17
Uji Asumsi
# Numeric predictors of the training set. vapply() pins the selector to
# one logical per column, unlike sapply().
X <- trainData[, vapply(trainData, is.numeric, logical(1))]
# Mardia's multivariate normality test on the training predictors.
mvn_result <- mvn(data = X, mvn_test = "mardia")
mvn_result$multivariate_normality
Test Statistic p.value Method MVN
1 Mardia Skewness 1049136.201 <0.001 asymptotic ✗ Not normal
2 Mardia Kurtosis 2454.398 <0.001 asymptotic ✗ Not normal
# Box's M test for equality of the class covariance matrices.
# It is run on the training set, the same data used for the multivariate
# normality check, so the held-out test rows do not enter the assumption
# checks. vapply() keeps the column selector type-stable.
X <- trainData[, vapply(trainData, is.numeric, logical(1))]
boxM(X, trainData$Class)
Box's M-test for Homogeneity of Covariance Matrices
data: X
Chi-Sq (approx.) = 494178, df = 816, p-value < 2.2e-16
Model LDA
# Fit a linear discriminant analysis with Class as the response and every
# other column of trainData as a predictor.
lda_model <- lda(Class ~ ., data = trainData)
# Print the fitted model: priors, group means, discriminant coefficients
# and proportion of trace.
lda_model
Call:
lda(Class ~ ., data = trainData)
Prior probabilities of groups:
BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
0.09763750 0.03857512 0.12033961 0.26181248 0.13732004 0.14968623 0.19462901
Group means:
Area Perimeter MajorAxisLength MinorAxisLength AspectRation
BARBUNYA 0.57180762 0.8894225 0.5883130 0.84091809 -0.14285939
BOMBAY 4.13755146 3.4289710 3.2185497 3.83241819 0.03936196
CALI 0.77015055 0.9488095 1.0487683 0.75952296 0.62251103
DERMASON -0.71125492 -0.8826555 -0.8534431 -0.81345749 -0.36883173
HOROZ 0.02304925 0.3058018 0.6173534 -0.39996730 1.81446757
SEKER -0.44986884 -0.5955663 -0.8006855 -0.01459277 -1.36662722
SIRA -0.28203815 -0.2715926 -0.2382901 -0.25448403 -0.04650427
Eccentricity ConvexArea EquivDiameter Extent Solidity
BARBUNYA 0.05649438 0.57988638 0.7483854 -0.017032810 -0.92630911
BOMBAY 0.23959193 4.12881879 3.6671191 0.543539672 -0.03464388
CALI 0.70184930 0.77301078 0.9575585 0.172488937 -0.45623494
DERMASON -0.14830741 -0.71186450 -0.8644783 0.061885669 0.23063571
HOROZ 1.27438643 0.02524155 0.1334771 -0.908720210 -0.36927894
SEKER -1.79378594 -0.45418092 -0.4762307 0.443795902 0.68864891
SIRA 0.18249737 -0.28356695 -0.2463856 -0.006855557 0.16621205
roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
BARBUNYA -1.2243982 0.06788537 -1.0646777 -0.55149108 0.04057241
BOMBAY -0.1619461 -0.14681719 -2.7662078 -1.48137638 -0.17442340
CALI -0.4695188 -0.71007860 -0.9801788 -1.02882411 -0.72277999
DERMASON 0.5784417 0.30456139 1.0539803 0.73911377 0.27604353
HOROZ -1.3357880 -1.61802235 0.3875927 -1.12977980 -1.54589291
SEKER 1.1942758 1.56545142 -0.1952061 1.37852462 1.62509135
SIRA 0.1858711 -0.04700660 0.1370170 -0.06128717 -0.07891545
ShapeFactor4
BARBUNYA 0.15711922
BOMBAY -0.74030666
CALI -1.02730089
DERMASON 0.42328582
HOROZ -0.73465772
SEKER 0.75935673
SIRA 0.06776728
Coefficients of linear discriminants:
LD1 LD2 LD3 LD4
Area 23.39734108 -25.8479410 -54.89272213 -37.20729324
Perimeter 11.25313790 -7.2058028 -0.60708057 1.62377214
MajorAxisLength 45.57648946 -36.2851657 -71.35373419 -0.97362021
MinorAxisLength 28.75831861 -15.0257481 -78.56710179 -8.30793481
AspectRation -7.59356084 1.3537399 -29.56193994 23.69582547
Eccentricity -0.65574839 3.9280251 13.66115858 -12.75200880
ConvexArea -19.60545622 20.5318294 49.42580336 23.29493258
EquivDiameter -93.62529662 58.0986512 151.61521978 25.91358937
Extent -0.05506709 0.0252998 0.02763997 0.10767448
Solidity -0.12609946 0.4028934 0.36304630 0.02741002
roundness 1.80680785 -1.6229142 -0.53775543 -0.38070526
Compactness -5.99435881 -46.6390406 -215.79894130 179.25190564
ShapeFactor1 -2.56636706 -4.8993911 4.12887376 3.30765154
ShapeFactor2 -5.46843407 8.3018612 -8.96448056 1.19153970
ShapeFactor3 3.24628911 37.7458512 215.82921132 -164.21191638
ShapeFactor4 0.82609143 -0.8030443 -0.66953355 -0.32929734
LD5 LD6
Area -18.85238141 13.19268824
Perimeter 17.11967702 -6.46904690
MajorAxisLength -58.20395317 -37.39802390
MinorAxisLength -44.49061346 -22.57486985
AspectRation 21.34366833 24.13568376
Eccentricity 1.52504290 -7.85036379
ConvexArea 25.20955649 10.03994788
EquivDiameter 73.63486628 26.55261608
Extent -0.04616184 -0.04944707
Solidity 0.20958046 0.25943822
roundness 0.98548270 -0.90128789
Compactness 41.29165019 92.84071047
ShapeFactor1 1.25045205 -11.30258535
ShapeFactor2 -5.62211236 -2.64844772
ShapeFactor3 -15.08326222 -82.57937068
ShapeFactor4 -1.18197160 -1.17051531
Proportion of trace:
LD1 LD2 LD3 LD4 LD5 LD6
0.5442 0.2147 0.1061 0.0906 0.0293 0.0150
# Score the held-out test set with the fitted LDA model.
pred <- predict(lda_model, newdata = testData)
# Keep only the predicted class labels.
y_pred <- pred[["class"]]
Evaluasi Model
# Cross-tabulate the LDA predictions against the true test labels and
# report the accuracy and per-class statistics.
confusionMatrix(data = y_pred, reference = testData[["Class"]])
Confusion Matrix and Statistics
Reference
Prediction BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
BARBUNYA 227 0 0 1 1 4 1
BOMBAY 0 104 0 0 0 0 0
CALI 15 0 312 0 7 0 0
DERMASON 0 0 0 615 0 4 30
HOROZ 0 0 3 1 347 0 3
SEKER 3 0 0 6 0 369 0
SIRA 19 0 11 86 17 28 493
Overall Statistics
Accuracy : 0.9113
95% CI : (0.9, 0.9218)
No Information Rate : 0.2619
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8927
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: BARBUNYA Class: BOMBAY Class: CALI Class: DERMASON
Sensitivity 0.85985 1.00000 0.9571 0.8674
Specificity 0.99713 1.00000 0.9908 0.9830
Pos Pred Value 0.97009 1.00000 0.9341 0.9476
Neg Pred Value 0.98504 1.00000 0.9941 0.9543
Prevalence 0.09752 0.03842 0.1204 0.2619
Detection Rate 0.08386 0.03842 0.1153 0.2272
Detection Prevalence 0.08644 0.03842 0.1234 0.2397
Balanced Accuracy 0.92849 1.00000 0.9739 0.9252
Class: HOROZ Class: SEKER Class: SIRA
Sensitivity 0.9328 0.9111 0.9355
Specificity 0.9970 0.9961 0.9261
Pos Pred Value 0.9802 0.9762 0.7538
Neg Pred Value 0.9894 0.9845 0.9834
Prevalence 0.1374 0.1496 0.1947
Detection Rate 0.1282 0.1363 0.1821
Detection Prevalence 0.1308 0.1396 0.2416
Balanced Accuracy 0.9649 0.9536 0.9308
Visualisasi Model
# Discriminant scores (the x component) of the training observations.
lda_values <- predict(lda_model, newdata = trainData)[["x"]]
# Attach the true class so the points can be coloured by it.
lda_df <- data.frame(lda_values, Class = trainData[["Class"]])
# Scatter of the first two discriminant axes for the training set.
ggplot(data = lda_df, mapping = aes(x = LD1, y = LD2, color = Class)) +
  geom_point(size = 2) +
  ggtitle("Visualisasi LDA") +
  theme_minimal()

# Discriminant scores of the test observations, paired with their true
# class labels.
lda_test <- data.frame(
  predict(lda_model, newdata = testData)[["x"]],
  Class = testData[["Class"]]
)
# Scatter of the first two discriminant axes for the test set.
ggplot(data = lda_test, mapping = aes(x = LD1, y = LD2, color = Class)) +
  geom_point() +
  ggtitle("Visualisasi LDA (Test)") +
  theme_minimal()

Model Regresi Logistik Multinomial
# Fit a multinomial logistic regression with Class as the response and
# every other column of trainData as a predictor.
# maxit is raised above the default of 100: with the default the optimiser
# reported "stopped after 100 iterations", i.e. it hit the iteration cap
# before converging, so the reported coefficients were not the final fit.
rlm_model <- multinom(Class ~ ., data = trainData, maxit = 1000)
# weights: 126 (102 variable)
initial value 21085.882375
iter 10 value 3856.057866
iter 20 value 3312.312610
iter 30 value 2968.792738
iter 40 value 2502.892955
iter 50 value 2329.134445
iter 60 value 2256.927312
iter 70 value 2238.555453
iter 80 value 2232.011572
iter 90 value 2227.977694
iter 100 value 2222.741619
final value 2222.741619
stopped after 100 iterations
Call:
multinom(formula = Class ~ ., data = trainData)
Coefficients:
(Intercept) Area Perimeter MajorAxisLength MinorAxisLength
BOMBAY -9.5320577 -16.939776 18.66210 23.782848 13.15182
CALI 1.6164780 -17.232529 -43.29224 9.981186 20.72500
DERMASON -7.1553069 -4.183068 19.20904 -42.363152 19.26957
HOROZ 2.8127734 9.672490 15.30882 -28.268898 11.36563
SEKER 1.6482085 13.454729 34.60819 -8.036828 -29.84899
SIRA 0.4033704 16.360589 -94.42995 40.280551 -22.97214
AspectRation Eccentricity ConvexArea EquivDiameter Extent
BOMBAY 15.8703204 3.151927 -13.143173 14.047293 -0.1460726
CALI -0.7885896 7.927178 8.040230 24.968071 0.1807100
DERMASON 24.9733329 10.410076 -27.586910 -8.564135 -0.7439593
HOROZ 44.1310513 26.110463 17.615493 -35.924408 -0.3465213
SEKER 22.7652766 12.397772 -4.179395 -16.384160 -0.6531727
SIRA 5.7370370 7.795294 7.086346 17.006864 -0.3629133
Solidity roundness Compactness ShapeFactor1 ShapeFactor2
BOMBAY 0.1221631 12.414307 9.774938 28.676567 4.528267
CALI 1.4719010 -3.653080 -40.866698 -9.594783 33.492280
DERMASON 1.1861984 5.099350 15.454276 -14.319468 20.712571
HOROZ 2.3598635 3.463466 21.024750 -10.001013 19.098113
SEKER 1.1855745 6.057574 6.018438 -7.029113 2.713861
SIRA 1.7140438 -10.309345 14.531442 -17.152311 -6.706350
ShapeFactor3 ShapeFactor4
BOMBAY 15.49839 -1.755712
CALI 15.76397 -2.756195
DERMASON -21.78086 -2.694348
HOROZ 12.36109 -5.549705
SEKER 32.40404 -1.338800
SIRA 10.56191 -3.773056
Std. Errors:
(Intercept) Area Perimeter MajorAxisLength MinorAxisLength
BOMBAY 40.038218 14.44703 9.757591 33.25405 33.82532
CALI 1.036809 65.63222 16.092764 57.04617 29.22216
DERMASON 6.416705 63.45735 20.301846 77.35514 53.40084
HOROZ 1.860615 66.51406 14.412296 60.20565 36.59607
SEKER 2.513728 39.39733 11.176777 72.84690 28.62107
SIRA 2.614421 73.01485 19.494584 76.32722 49.35490
AspectRation Eccentricity ConvexArea EquivDiameter Extent Solidity
BOMBAY 66.60843 18.367387 16.60340 16.04129 3.0905650 5.4485653
CALI 21.90414 7.978927 60.82278 42.83807 0.1217717 0.7107297
DERMASON 27.44562 9.299886 88.68395 49.09663 0.1695578 0.6239311
HOROZ 24.38896 21.108709 63.09097 57.61926 0.1540099 0.6688650
SEKER 23.13618 3.418815 58.95367 38.50629 0.2054516 0.5769581
SIRA 22.67236 9.488301 63.38382 59.77790 0.1543245 0.5986858
roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
BOMBAY 12.982257 25.56771 45.95747 41.85893 50.34604
CALI 2.718365 88.15819 23.00454 23.31996 79.25210
DERMASON 2.738341 103.32119 22.83557 29.26875 84.45105
HOROZ 2.339415 70.64982 16.69657 24.78084 67.43142
SEKER 1.789487 36.14537 27.65358 24.87151 27.45970
SIRA 2.711450 67.16213 21.15080 25.64620 62.50354
ShapeFactor4
BOMBAY 5.1969776
CALI 0.8890355
DERMASON 1.1631606
HOROZ 1.1852352
SEKER 0.8566593
SIRA 1.2982195
Residual Deviance: 4445.483
AIC: 4649.483
Evaluasi Model
# Predicted class labels for the held-out test set.
pred_rlm <- predict(rlm_model, newdata = testData)
# Compare the predictions with the true labels; the result is stored
# because the heatmap drawn later reads its table component.
eval_rlm <- confusionMatrix(
  data = pred_rlm,
  reference = testData[["Class"]]
)
eval_rlm
Confusion Matrix and Statistics
Reference
Prediction BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
BARBUNYA 251 0 9 1 2 3 1
BOMBAY 0 104 0 0 0 0 0
CALI 6 0 308 0 6 0 1
DERMASON 0 0 0 665 4 5 54
HOROZ 0 0 4 2 349 0 5
SEKER 2 0 0 6 0 385 3
SIRA 5 0 5 35 11 12 463
Overall Statistics
Accuracy : 0.9328
95% CI : (0.9227, 0.9419)
No Information Rate : 0.2619
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9186
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: BARBUNYA Class: BOMBAY Class: CALI Class: DERMASON
Sensitivity 0.95076 1.00000 0.9448 0.9379
Specificity 0.99345 1.00000 0.9945 0.9685
Pos Pred Value 0.94007 1.00000 0.9595 0.9135
Neg Pred Value 0.99467 1.00000 0.9925 0.9778
Prevalence 0.09752 0.03842 0.1204 0.2619
Detection Rate 0.09272 0.03842 0.1138 0.2457
Detection Prevalence 0.09863 0.03842 0.1186 0.2689
Balanced Accuracy 0.97210 1.00000 0.9697 0.9532
Class: HOROZ Class: SEKER Class: SIRA
Sensitivity 0.9382 0.9506 0.8786
Specificity 0.9953 0.9952 0.9688
Pos Pred Value 0.9694 0.9722 0.8719
Neg Pred Value 0.9902 0.9913 0.9706
Prevalence 0.1374 0.1496 0.1947
Detection Rate 0.1289 0.1422 0.1710
Detection Prevalence 0.1330 0.1463 0.1962
Balanced Accuracy 0.9667 0.9729 0.9237
Visualisasi Model
# Flatten the confusion table into long form with one row per
# (Prediction, Reference) pair and its count in Freq.
cm_data <- as.data.frame(eval_rlm[["table"]])
# Heatmap: tile shade encodes the count, and the count is printed on top.
ggplot(data = cm_data, mapping = aes(x = Reference, y = Prediction)) +
  geom_tile(mapping = aes(fill = Freq), color = "white") +
  geom_text(mapping = aes(label = Freq), color = "black", size = 4) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(
    title = "Heatmap Confusion Matrix - Regresi Logistik Multinomial",
    subtitle = "Sumbu X: Kelas Aktual | Sumbu Y: Kelas Prediksi",
    x = "Aktual (Reference)",
    y = "Prediksi (Prediction)",
    fill = "Frekuensi"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
