1 Load Dataset

Data <- read_excel("Dry_Bean_Dataset.xlsx")
head(Data)
# A tibble: 6 × 17
   Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
  <dbl>     <dbl>           <dbl>           <dbl>        <dbl>        <dbl>
1 28395      610.            208.            174.         1.20        0.550
2 28734      638.            201.            183.         1.10        0.412
3 29380      624.            213.            176.         1.21        0.563
4 30008      646.            211.            183.         1.15        0.499
5 30140      620.            202.            190.         1.06        0.334
6 30279      635.            213.            182.         1.17        0.520
# ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
#   Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
#   ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <chr>
summary(Data)
      Area          Perimeter      MajorAxisLength MinorAxisLength
 Min.   : 20420   Min.   : 524.7   Min.   :183.6   Min.   :122.5  
 1st Qu.: 36328   1st Qu.: 703.5   1st Qu.:253.3   1st Qu.:175.8  
 Median : 44652   Median : 794.9   Median :296.9   Median :192.4  
 Mean   : 53048   Mean   : 855.3   Mean   :320.1   Mean   :202.3  
 3rd Qu.: 61332   3rd Qu.: 977.2   3rd Qu.:376.5   3rd Qu.:217.0  
 Max.   :254616   Max.   :1985.4   Max.   :738.9   Max.   :460.2  
  AspectRation    Eccentricity      ConvexArea     EquivDiameter  
 Min.   :1.025   Min.   :0.2190   Min.   : 20684   Min.   :161.2  
 1st Qu.:1.432   1st Qu.:0.7159   1st Qu.: 36715   1st Qu.:215.1  
 Median :1.551   Median :0.7644   Median : 45178   Median :238.4  
 Mean   :1.583   Mean   :0.7509   Mean   : 53768   Mean   :253.1  
 3rd Qu.:1.707   3rd Qu.:0.8105   3rd Qu.: 62294   3rd Qu.:279.4  
 Max.   :2.430   Max.   :0.9114   Max.   :263261   Max.   :569.4  
     Extent          Solidity        roundness       Compactness    
 Min.   :0.5553   Min.   :0.9192   Min.   :0.4896   Min.   :0.6406  
 1st Qu.:0.7186   1st Qu.:0.9857   1st Qu.:0.8321   1st Qu.:0.7625  
 Median :0.7599   Median :0.9883   Median :0.8832   Median :0.8013  
 Mean   :0.7497   Mean   :0.9871   Mean   :0.8733   Mean   :0.7999  
 3rd Qu.:0.7869   3rd Qu.:0.9900   3rd Qu.:0.9169   3rd Qu.:0.8343  
 Max.   :0.8662   Max.   :0.9947   Max.   :0.9907   Max.   :0.9873  
  ShapeFactor1       ShapeFactor2        ShapeFactor3     ShapeFactor4   
 Min.   :0.002778   Min.   :0.0005642   Min.   :0.4103   Min.   :0.9477  
 1st Qu.:0.005900   1st Qu.:0.0011535   1st Qu.:0.5814   1st Qu.:0.9937  
 Median :0.006645   Median :0.0016935   Median :0.6420   Median :0.9964  
 Mean   :0.006564   Mean   :0.0017159   Mean   :0.6436   Mean   :0.9951  
 3rd Qu.:0.007271   3rd Qu.:0.0021703   3rd Qu.:0.6960   3rd Qu.:0.9979  
 Max.   :0.010451   Max.   :0.0036650   Max.   :0.9748   Max.   :0.9997  
    Class          
 Length:13611      
 Class :character  
 Mode  :character  
                   
                   
                   

2 Preprocessing

colSums(is.na(Data))
           Area       Perimeter MajorAxisLength MinorAxisLength    AspectRation 
              0               0               0               0               0 
   Eccentricity      ConvexArea   EquivDiameter          Extent        Solidity 
              0               0               0               0               0 
      roundness     Compactness    ShapeFactor1    ShapeFactor2    ShapeFactor3 
              0               0               0               0               0 
   ShapeFactor4           Class 
              0               0 
sum(duplicated(Data))
[1] 68
Data_Clean <- Data[!duplicated(Data), ]
sum(duplicated(Data_Clean))
[1] 0
str(Data_Clean)
tibble [13,543 × 17] (S3: tbl_df/tbl/data.frame)
 $ Area           : num [1:13543] 28395 28734 29380 30008 30140 ...
 $ Perimeter      : num [1:13543] 610 638 624 646 620 ...
 $ MajorAxisLength: num [1:13543] 208 201 213 211 202 ...
 $ MinorAxisLength: num [1:13543] 174 183 176 183 190 ...
 $ AspectRation   : num [1:13543] 1.2 1.1 1.21 1.15 1.06 ...
 $ Eccentricity   : num [1:13543] 0.55 0.412 0.563 0.499 0.334 ...
 $ ConvexArea     : num [1:13543] 28715 29172 29690 30724 30417 ...
 $ EquivDiameter  : num [1:13543] 190 191 193 195 196 ...
 $ Extent         : num [1:13543] 0.764 0.784 0.778 0.783 0.773 ...
 $ Solidity       : num [1:13543] 0.989 0.985 0.99 0.977 0.991 ...
 $ roundness      : num [1:13543] 0.958 0.887 0.948 0.904 0.985 ...
 $ Compactness    : num [1:13543] 0.913 0.954 0.909 0.928 0.971 ...
 $ ShapeFactor1   : num [1:13543] 0.00733 0.00698 0.00724 0.00702 0.0067 ...
 $ ShapeFactor2   : num [1:13543] 0.00315 0.00356 0.00305 0.00321 0.00366 ...
 $ ShapeFactor3   : num [1:13543] 0.834 0.91 0.826 0.862 0.942 ...
 $ ShapeFactor4   : num [1:13543] 0.999 0.998 0.999 0.994 0.999 ...
 $ Class          : chr [1:13543] "SEKER" "SEKER" "SEKER" "SEKER" ...
Data_Clean$Class <- as.factor(Data_Clean$Class)

3 Exploratory Data Analysis

Data_EDA <- melt(Data_Clean, id.vars = "Class")
ggplot(Data_EDA, aes(x = value, fill = variable)) +
  geom_histogram(bins = 30, color = "white", alpha = 0.85) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_fill_viridis_d(option = "turbo") + 
  theme_minimal() +
  theme(legend.position = "none")

ggplot(Data_EDA, aes(x = variable, y = value)) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))

num_data <- Data_Clean[, sapply(Data_Clean, is.numeric)]
cor_matrix <- cor(num_data)
corrplot(cor_matrix, method = "color",
         type = "upper",
         addCoef.col = "black",
         tl.cex = 0.6)

4 Normalisasi

preprocessing <- preProcess(Data_Clean[, -which(names(Data_Clean) == "Class")],
                      method = c("center", "scale"))
scaled_features <- predict(preprocessing, Data_Clean)
data_scaled <- scaled_features
head(data_scaled)
# A tibble: 6 × 17
    Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
   <dbl>     <dbl>           <dbl>           <dbl>        <dbl>        <dbl>
1 -0.839    -1.14            -1.30          -0.632        -1.57        -2.18
2 -0.827    -1.01            -1.39          -0.436        -1.97        -3.69
3 -0.805    -1.08            -1.25          -0.587        -1.51        -2.04
4 -0.784    -0.974           -1.27          -0.441        -1.74        -2.74
5 -0.779    -1.09            -1.38          -0.268        -2.12        -4.54
6 -0.775    -1.02            -1.25          -0.463        -1.67        -2.50
# ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
#   Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
#   ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <fct>

5 Split Data

set.seed(123)
trainIndex <- createDataPartition(data_scaled$Class, 
                                  p = 0.8, 
                                  list = FALSE)
trainData <- data_scaled[trainIndex, ]
testData  <- data_scaled[-trainIndex, ]
dim(trainData)
[1] 10836    17
dim(testData)
[1] 2707   17

6 Uji Asumsi

X <- trainData[, sapply(trainData, is.numeric)]
mvn_result <- mvn(data = X, mvn_test = "mardia")
mvn_result$multivariate_normality
             Test   Statistic p.value     Method          MVN
1 Mardia Skewness 1049136.201  <0.001 asymptotic ✗ Not normal
2 Mardia Kurtosis    2454.398  <0.001 asymptotic ✗ Not normal
X <- data_scaled[, sapply(data_scaled, is.numeric)]
boxM(X, data_scaled$Class)

    Box's M-test for Homogeneity of Covariance Matrices

data:  X
Chi-Sq (approx.) = 494178, df = 816, p-value < 2.2e-16

7 Model LDA

lda_model <- lda(Class ~ ., data = trainData)
lda_model
Call:
lda(Class ~ ., data = trainData)

Prior probabilities of groups:
  BARBUNYA     BOMBAY       CALI   DERMASON      HOROZ      SEKER       SIRA 
0.09763750 0.03857512 0.12033961 0.26181248 0.13732004 0.14968623 0.19462901 

Group means:
                Area  Perimeter MajorAxisLength MinorAxisLength AspectRation
BARBUNYA  0.57180762  0.8894225       0.5883130      0.84091809  -0.14285939
BOMBAY    4.13755146  3.4289710       3.2185497      3.83241819   0.03936196
CALI      0.77015055  0.9488095       1.0487683      0.75952296   0.62251103
DERMASON -0.71125492 -0.8826555      -0.8534431     -0.81345749  -0.36883173
HOROZ     0.02304925  0.3058018       0.6173534     -0.39996730   1.81446757
SEKER    -0.44986884 -0.5955663      -0.8006855     -0.01459277  -1.36662722
SIRA     -0.28203815 -0.2715926      -0.2382901     -0.25448403  -0.04650427
         Eccentricity  ConvexArea EquivDiameter       Extent    Solidity
BARBUNYA   0.05649438  0.57988638     0.7483854 -0.017032810 -0.92630911
BOMBAY     0.23959193  4.12881879     3.6671191  0.543539672 -0.03464388
CALI       0.70184930  0.77301078     0.9575585  0.172488937 -0.45623494
DERMASON  -0.14830741 -0.71186450    -0.8644783  0.061885669  0.23063571
HOROZ      1.27438643  0.02524155     0.1334771 -0.908720210 -0.36927894
SEKER     -1.79378594 -0.45418092    -0.4762307  0.443795902  0.68864891
SIRA       0.18249737 -0.28356695    -0.2463856 -0.006855557  0.16621205
          roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
BARBUNYA -1.2243982  0.06788537   -1.0646777  -0.55149108   0.04057241
BOMBAY   -0.1619461 -0.14681719   -2.7662078  -1.48137638  -0.17442340
CALI     -0.4695188 -0.71007860   -0.9801788  -1.02882411  -0.72277999
DERMASON  0.5784417  0.30456139    1.0539803   0.73911377   0.27604353
HOROZ    -1.3357880 -1.61802235    0.3875927  -1.12977980  -1.54589291
SEKER     1.1942758  1.56545142   -0.1952061   1.37852462   1.62509135
SIRA      0.1858711 -0.04700660    0.1370170  -0.06128717  -0.07891545
         ShapeFactor4
BARBUNYA   0.15711922
BOMBAY    -0.74030666
CALI      -1.02730089
DERMASON   0.42328582
HOROZ     -0.73465772
SEKER      0.75935673
SIRA       0.06776728

Coefficients of linear discriminants:
                         LD1         LD2           LD3           LD4
Area             23.39734108 -25.8479410  -54.89272213  -37.20729324
Perimeter        11.25313790  -7.2058028   -0.60708057    1.62377214
MajorAxisLength  45.57648946 -36.2851657  -71.35373419   -0.97362021
MinorAxisLength  28.75831861 -15.0257481  -78.56710179   -8.30793481
AspectRation     -7.59356084   1.3537399  -29.56193994   23.69582547
Eccentricity     -0.65574839   3.9280251   13.66115858  -12.75200880
ConvexArea      -19.60545622  20.5318294   49.42580336   23.29493258
EquivDiameter   -93.62529662  58.0986512  151.61521978   25.91358937
Extent           -0.05506709   0.0252998    0.02763997    0.10767448
Solidity         -0.12609946   0.4028934    0.36304630    0.02741002
roundness         1.80680785  -1.6229142   -0.53775543   -0.38070526
Compactness      -5.99435881 -46.6390406 -215.79894130  179.25190564
ShapeFactor1     -2.56636706  -4.8993911    4.12887376    3.30765154
ShapeFactor2     -5.46843407   8.3018612   -8.96448056    1.19153970
ShapeFactor3      3.24628911  37.7458512  215.82921132 -164.21191638
ShapeFactor4      0.82609143  -0.8030443   -0.66953355   -0.32929734
                         LD5          LD6
Area            -18.85238141  13.19268824
Perimeter        17.11967702  -6.46904690
MajorAxisLength -58.20395317 -37.39802390
MinorAxisLength -44.49061346 -22.57486985
AspectRation     21.34366833  24.13568376
Eccentricity      1.52504290  -7.85036379
ConvexArea       25.20955649  10.03994788
EquivDiameter    73.63486628  26.55261608
Extent           -0.04616184  -0.04944707
Solidity          0.20958046   0.25943822
roundness         0.98548270  -0.90128789
Compactness      41.29165019  92.84071047
ShapeFactor1      1.25045205 -11.30258535
ShapeFactor2     -5.62211236  -2.64844772
ShapeFactor3    -15.08326222 -82.57937068
ShapeFactor4     -1.18197160  -1.17051531

Proportion of trace:
   LD1    LD2    LD3    LD4    LD5    LD6 
0.5442 0.2147 0.1061 0.0906 0.0293 0.0150 
pred <- predict(lda_model, testData)
y_pred <- pred$class

7.1 Evaluasi Model

confusionMatrix(y_pred, testData$Class)
Confusion Matrix and Statistics

          Reference
Prediction BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
  BARBUNYA      227      0    0        1     1     4    1
  BOMBAY          0    104    0        0     0     0    0
  CALI           15      0  312        0     7     0    0
  DERMASON        0      0    0      615     0     4   30
  HOROZ           0      0    3        1   347     0    3
  SEKER           3      0    0        6     0   369    0
  SIRA           19      0   11       86    17    28  493

Overall Statistics
                                       
               Accuracy : 0.9113       
                 95% CI : (0.9, 0.9218)
    No Information Rate : 0.2619       
    P-Value [Acc > NIR] : < 2.2e-16    
                                       
                  Kappa : 0.8927       
                                       
 Mcnemar's Test P-Value : NA           

Statistics by Class:

                     Class: BARBUNYA Class: BOMBAY Class: CALI Class: DERMASON
Sensitivity                  0.85985       1.00000      0.9571          0.8674
Specificity                  0.99713       1.00000      0.9908          0.9830
Pos Pred Value               0.97009       1.00000      0.9341          0.9476
Neg Pred Value               0.98504       1.00000      0.9941          0.9543
Prevalence                   0.09752       0.03842      0.1204          0.2619
Detection Rate               0.08386       0.03842      0.1153          0.2272
Detection Prevalence         0.08644       0.03842      0.1234          0.2397
Balanced Accuracy            0.92849       1.00000      0.9739          0.9252
                     Class: HOROZ Class: SEKER Class: SIRA
Sensitivity                0.9328       0.9111      0.9355
Specificity                0.9970       0.9961      0.9261
Pos Pred Value             0.9802       0.9762      0.7538
Neg Pred Value             0.9894       0.9845      0.9834
Prevalence                 0.1374       0.1496      0.1947
Detection Rate             0.1282       0.1363      0.1821
Detection Prevalence       0.1308       0.1396      0.2416
Balanced Accuracy          0.9649       0.9536      0.9308

7.2 Visualisasi Model

lda_values <- predict(lda_model, trainData)$x
lda_df <- data.frame(lda_values, Class = trainData$Class)
ggplot(lda_df, aes(x = LD1, y = LD2, color = Class)) +
  geom_point(size = 2) +
  theme_minimal() +
  ggtitle("Visualisasi LDA")

lda_test <- data.frame(predict(lda_model, testData)$x,
                       Class = testData$Class)
ggplot(lda_test, aes(x = LD1, y = LD2, color = Class)) +
  geom_point() +
  theme_minimal() +
  ggtitle("Visualisasi LDA (Test)")

8 Model Regresi Logistik Multinomial

rlm_model <- multinom(Class ~ ., data = trainData)
# weights:  126 (102 variable)
initial  value 21085.882375 
iter  10 value 3856.057866
iter  20 value 3312.312610
iter  30 value 2968.792738
iter  40 value 2502.892955
iter  50 value 2329.134445
iter  60 value 2256.927312
iter  70 value 2238.555453
iter  80 value 2232.011572
iter  90 value 2227.977694
iter 100 value 2222.741619
final  value 2222.741619 
stopped after 100 iterations
summary(rlm_model)
Call:
multinom(formula = Class ~ ., data = trainData)

Coefficients:
         (Intercept)       Area Perimeter MajorAxisLength MinorAxisLength
BOMBAY    -9.5320577 -16.939776  18.66210       23.782848        13.15182
CALI       1.6164780 -17.232529 -43.29224        9.981186        20.72500
DERMASON  -7.1553069  -4.183068  19.20904      -42.363152        19.26957
HOROZ      2.8127734   9.672490  15.30882      -28.268898        11.36563
SEKER      1.6482085  13.454729  34.60819       -8.036828       -29.84899
SIRA       0.4033704  16.360589 -94.42995       40.280551       -22.97214
         AspectRation Eccentricity ConvexArea EquivDiameter     Extent
BOMBAY     15.8703204     3.151927 -13.143173     14.047293 -0.1460726
CALI       -0.7885896     7.927178   8.040230     24.968071  0.1807100
DERMASON   24.9733329    10.410076 -27.586910     -8.564135 -0.7439593
HOROZ      44.1310513    26.110463  17.615493    -35.924408 -0.3465213
SEKER      22.7652766    12.397772  -4.179395    -16.384160 -0.6531727
SIRA        5.7370370     7.795294   7.086346     17.006864 -0.3629133
          Solidity  roundness Compactness ShapeFactor1 ShapeFactor2
BOMBAY   0.1221631  12.414307    9.774938    28.676567     4.528267
CALI     1.4719010  -3.653080  -40.866698    -9.594783    33.492280
DERMASON 1.1861984   5.099350   15.454276   -14.319468    20.712571
HOROZ    2.3598635   3.463466   21.024750   -10.001013    19.098113
SEKER    1.1855745   6.057574    6.018438    -7.029113     2.713861
SIRA     1.7140438 -10.309345   14.531442   -17.152311    -6.706350
         ShapeFactor3 ShapeFactor4
BOMBAY       15.49839    -1.755712
CALI         15.76397    -2.756195
DERMASON    -21.78086    -2.694348
HOROZ        12.36109    -5.549705
SEKER        32.40404    -1.338800
SIRA         10.56191    -3.773056

Std. Errors:
         (Intercept)     Area Perimeter MajorAxisLength MinorAxisLength
BOMBAY     40.038218 14.44703  9.757591        33.25405        33.82532
CALI        1.036809 65.63222 16.092764        57.04617        29.22216
DERMASON    6.416705 63.45735 20.301846        77.35514        53.40084
HOROZ       1.860615 66.51406 14.412296        60.20565        36.59607
SEKER       2.513728 39.39733 11.176777        72.84690        28.62107
SIRA        2.614421 73.01485 19.494584        76.32722        49.35490
         AspectRation Eccentricity ConvexArea EquivDiameter    Extent  Solidity
BOMBAY       66.60843    18.367387   16.60340      16.04129 3.0905650 5.4485653
CALI         21.90414     7.978927   60.82278      42.83807 0.1217717 0.7107297
DERMASON     27.44562     9.299886   88.68395      49.09663 0.1695578 0.6239311
HOROZ        24.38896    21.108709   63.09097      57.61926 0.1540099 0.6688650
SEKER        23.13618     3.418815   58.95367      38.50629 0.2054516 0.5769581
SIRA         22.67236     9.488301   63.38382      59.77790 0.1543245 0.5986858
         roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
BOMBAY   12.982257    25.56771     45.95747     41.85893     50.34604
CALI      2.718365    88.15819     23.00454     23.31996     79.25210
DERMASON  2.738341   103.32119     22.83557     29.26875     84.45105
HOROZ     2.339415    70.64982     16.69657     24.78084     67.43142
SEKER     1.789487    36.14537     27.65358     24.87151     27.45970
SIRA      2.711450    67.16213     21.15080     25.64620     62.50354
         ShapeFactor4
BOMBAY      5.1969776
CALI        0.8890355
DERMASON    1.1631606
HOROZ       1.1852352
SEKER       0.8566593
SIRA        1.2982195

Residual Deviance: 4445.483 
AIC: 4649.483 

8.1 Evaluasi Model

pred_rlm <- predict(rlm_model, newdata = testData)
eval_rlm <- confusionMatrix(pred_rlm, testData$Class)
eval_rlm
Confusion Matrix and Statistics

          Reference
Prediction BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
  BARBUNYA      251      0    9        1     2     3    1
  BOMBAY          0    104    0        0     0     0    0
  CALI            6      0  308        0     6     0    1
  DERMASON        0      0    0      665     4     5   54
  HOROZ           0      0    4        2   349     0    5
  SEKER           2      0    0        6     0   385    3
  SIRA            5      0    5       35    11    12  463

Overall Statistics
                                          
               Accuracy : 0.9328          
                 95% CI : (0.9227, 0.9419)
    No Information Rate : 0.2619          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.9186          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: BARBUNYA Class: BOMBAY Class: CALI Class: DERMASON
Sensitivity                  0.95076       1.00000      0.9448          0.9379
Specificity                  0.99345       1.00000      0.9945          0.9685
Pos Pred Value               0.94007       1.00000      0.9595          0.9135
Neg Pred Value               0.99467       1.00000      0.9925          0.9778
Prevalence                   0.09752       0.03842      0.1204          0.2619
Detection Rate               0.09272       0.03842      0.1138          0.2457
Detection Prevalence         0.09863       0.03842      0.1186          0.2689
Balanced Accuracy            0.97210       1.00000      0.9697          0.9532
                     Class: HOROZ Class: SEKER Class: SIRA
Sensitivity                0.9382       0.9506      0.8786
Specificity                0.9953       0.9952      0.9688
Pos Pred Value             0.9694       0.9722      0.8719
Neg Pred Value             0.9902       0.9913      0.9706
Prevalence                 0.1374       0.1496      0.1947
Detection Rate             0.1289       0.1422      0.1710
Detection Prevalence       0.1330       0.1463      0.1962
Balanced Accuracy          0.9667       0.9729      0.9237

8.2 Visualisasi Model

cm_data <- as.data.frame(eval_rlm$table)
ggplot(data = cm_data, aes(x = Reference, y = Prediction)) +
  geom_tile(aes(fill = Freq), color = "white") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  geom_text(aes(label = Freq), color = "black", size = 4) +
  theme_minimal() +
  labs(title = "Heatmap Confusion Matrix - Regresi Logistik Multinomial",
       subtitle = "Sumbu X: Kelas Aktual | Sumbu Y: Kelas Prediksi",
       x = "Aktual (Reference)",
       y = "Prediksi (Prediction)",
       fill = "Frekuensi") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))