knitr::opts_chunk$set(echo = TRUE)
# Load required libraries
library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(corrplot)
## corrplot 0.95 loaded
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(nnet)
library(car)
## Loading required package: carData
library(MVN)
library(ggplot2)
library(tidyr)
library(biotools)
## ---
## biotools version 4.3
library(klaR)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(multiROC)
# ===============================
# 1. DATA LOADING & BASIC EXPLORATION
# ===============================
beans_data <- read.csv("C:/Users/Rozan/OneDrive/Documents/Anmul/Dry_Bean_Dataset.csv")
# Basic understanding of the data
str(beans_data)
## 'data.frame': 13611 obs. of 17 variables:
## $ Area : int 28395 28734 29380 30008 30140 30279 30477 30519 30685 30834 ...
## $ Perimeter : num 610 638 624 646 620 ...
## $ MajorAxisLength: num 208 201 213 211 202 ...
## $ MinorAxisLength: num 174 183 176 183 190 ...
## $ AspectRation : num 1.2 1.1 1.21 1.15 1.06 ...
## $ Eccentricity : num 0.55 0.412 0.563 0.499 0.334 ...
## $ ConvexArea : int 28715 29172 29690 30724 30417 30600 30970 30847 31044 31120 ...
## $ EquivDiameter : num 190 191 193 195 196 ...
## $ Extent : num 0.764 0.784 0.778 0.783 0.773 ...
## $ Solidity : num 0.989 0.985 0.99 0.977 0.991 ...
## $ roundness : num 0.958 0.887 0.948 0.904 0.985 ...
## $ Compactness : num 0.913 0.954 0.909 0.928 0.971 ...
## $ ShapeFactor1 : num 0.00733 0.00698 0.00724 0.00702 0.0067 ...
## $ ShapeFactor2 : num 0.00315 0.00356 0.00305 0.00321 0.00366 ...
## $ ShapeFactor3 : num 0.834 0.91 0.826 0.862 0.942 ...
## $ ShapeFactor4 : num 0.999 0.998 0.999 0.994 0.999 ...
## $ Class : chr "SEKER" "SEKER" "SEKER" "SEKER" ...
summary(beans_data)
## Area Perimeter MajorAxisLength MinorAxisLength
## Min. : 20420 Min. : 524.7 Min. :183.6 Min. :122.5
## 1st Qu.: 36328 1st Qu.: 703.5 1st Qu.:253.3 1st Qu.:175.8
## Median : 44652 Median : 794.9 Median :296.9 Median :192.4
## Mean : 53048 Mean : 855.3 Mean :320.1 Mean :202.3
## 3rd Qu.: 61332 3rd Qu.: 977.2 3rd Qu.:376.5 3rd Qu.:217.0
## Max. :254616 Max. :1985.4 Max. :738.9 Max. :460.2
## AspectRation Eccentricity ConvexArea EquivDiameter
## Min. :1.025 Min. :0.2190 Min. : 20684 Min. :161.2
## 1st Qu.:1.432 1st Qu.:0.7159 1st Qu.: 36715 1st Qu.:215.1
## Median :1.551 Median :0.7644 Median : 45178 Median :238.4
## Mean :1.583 Mean :0.7509 Mean : 53768 Mean :253.1
## 3rd Qu.:1.707 3rd Qu.:0.8105 3rd Qu.: 62294 3rd Qu.:279.4
## Max. :2.430 Max. :0.9114 Max. :263261 Max. :569.4
## Extent Solidity roundness Compactness
## Min. :0.5553 Min. :0.9192 Min. :0.4896 Min. :0.6406
## 1st Qu.:0.7186 1st Qu.:0.9857 1st Qu.:0.8321 1st Qu.:0.7625
## Median :0.7599 Median :0.9883 Median :0.8832 Median :0.8013
## Mean :0.7497 Mean :0.9871 Mean :0.8733 Mean :0.7999
## 3rd Qu.:0.7869 3rd Qu.:0.9900 3rd Qu.:0.9169 3rd Qu.:0.8343
## Max. :0.8662 Max. :0.9947 Max. :0.9907 Max. :0.9873
## ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## Min. :0.002778 Min. :0.0005642 Min. :0.4103 Min. :0.9477
## 1st Qu.:0.005900 1st Qu.:0.0011535 1st Qu.:0.5814 1st Qu.:0.9937
## Median :0.006645 Median :0.0016935 Median :0.6420 Median :0.9964
## Mean :0.006564 Mean :0.0017159 Mean :0.6436 Mean :0.9951
## 3rd Qu.:0.007271 3rd Qu.:0.0021703 3rd Qu.:0.6960 3rd Qu.:0.9979
## Max. :0.010451 Max. :0.0036650 Max. :0.9748 Max. :0.9997
## Class
## Length:13611
## Class :character
## Mode :character
##
##
##
head(beans_data)
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## 1 28395 610.291 208.1781 173.8887 1.197191 0.5498122
## 2 28734 638.018 200.5248 182.7344 1.097356 0.4117853
## 3 29380 624.110 212.8261 175.9311 1.209713 0.5627273
## 4 30008 645.884 210.5580 182.5165 1.153638 0.4986160
## 5 30140 620.134 201.8479 190.2793 1.060798 0.3336797
## 6 30279 634.927 212.5606 181.5102 1.171067 0.5204007
## ConvexArea EquivDiameter Extent Solidity roundness Compactness
## 1 28715 190.1411 0.7639225 0.9888560 0.9580271 0.9133578
## 2 29172 191.2728 0.7839681 0.9849856 0.8870336 0.9538608
## 3 29690 193.4109 0.7781132 0.9895588 0.9478495 0.9087742
## 4 30724 195.4671 0.7826813 0.9766957 0.9039364 0.9283288
## 5 30417 195.8965 0.7730980 0.9908932 0.9848771 0.9705155
## 6 30600 196.3477 0.7756885 0.9895098 0.9438518 0.9237260
## ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4 Class
## 1 0.007331506 0.003147289 0.8342224 0.9987239 SEKER
## 2 0.006978659 0.003563624 0.9098505 0.9984303 SEKER
## 3 0.007243912 0.003047733 0.8258706 0.9990661 SEKER
## 4 0.007016729 0.003214562 0.8617944 0.9941988 SEKER
## 5 0.006697010 0.003664972 0.9419004 0.9991661 SEKER
## 6 0.007020065 0.003152779 0.8532696 0.9992358 SEKER
# Convert Class to factor
beans_data$Class <- as.factor(beans_data$Class)
# Understand the distribution of classes
table(beans_data$Class)
##
## BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
## 1322 522 1630 3546 1928 2027 2636
barplot(table(beans_data$Class), main="Distribution of Bean Classes",
xlab="Bean Class", ylab="Frequency", col=rainbow(7))
# ===============================
# 2. EXPLORATORY DATA ANALYSIS (EDA)
# ===============================
# Examine feature distributions
par(mfrow=c(3,3))
for(i in 1:16) {
hist(beans_data[,i], main=names(beans_data)[i], xlab=names(beans_data)[i], col="skyblue")
}
par(mfrow=c(1,1))
# ===============================
# 3. PREPROCESSING
# ===============================
# Check for missing values
missing_values <- sum(is.na(beans_data))
cat("Total missing values:", missing_values, "\n")
## Total missing values: 0
# Check for duplicate rows
duplicate_rows <- sum(duplicated(beans_data))
cat("Number of duplicate rows:", duplicate_rows, "\n")
## Number of duplicate rows: 68
# Remove duplicates if any
if(duplicate_rows > 0) {
beans_data <- beans_data[!duplicated(beans_data),]
cat("Duplicates removed. New dataset size:", nrow(beans_data), "\n")
}
## Duplicates removed. New dataset size: 13543
# Detect outliers using boxplots for each feature
par(mfrow=c(2,2)) # hanya 4 boxplot per halaman
for(i in 1:16) {
boxplot(beans_data[,i], main=names(beans_data)[i], col="lightgreen")
if (i %% 4 == 0) {
readline("Press [enter] to see next 4 plots")
}
}
## Press [enter] to see next 4 plots
## Press [enter] to see next 4 plots
## Press [enter] to see next 4 plots
## Press [enter] to see next 4 plots
# Identify outliers using IQR method
outliers_count <- c()
for(i in 1:16) {
q1 <- quantile(beans_data[,i], 0.25)
q3 <- quantile(beans_data[,i], 0.75)
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
outliers <- sum(beans_data[,i] < lower_bound | beans_data[,i] > upper_bound)
outliers_count <- c(outliers_count, outliers)
}
outliers_df <- data.frame(Feature = names(beans_data)[1:16], Outliers = outliers_count)
print(outliers_df)
## Feature Outliers
## 1 Area 551
## 2 Perimeter 500
## 3 MajorAxisLength 379
## 4 MinorAxisLength 567
## 5 AspectRation 485
## 6 Eccentricity 833
## 7 ConvexArea 549
## 8 EquivDiameter 526
## 9 Extent 271
## 10 Solidity 774
## 11 roundness 98
## 12 Compactness 124
## 13 ShapeFactor1 533
## 14 ShapeFactor2 0
## 15 ShapeFactor3 202
## 16 ShapeFactor4 760
# Create a function to handle outliers using capping method
handle_outliers <- function(x) {
q1 <- quantile(x, 0.25)
q3 <- quantile(x, 0.75)
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
x[x < lower_bound] <- lower_bound
x[x > upper_bound] <- upper_bound
return(x)
}
# Apply outlier handling to numeric features
beans_data_clean <- beans_data
for(i in 1:16) {
beans_data_clean[,i] <- handle_outliers(beans_data[,i])
}
# Check effect of outlier handling
par(mfrow=c(2,2))
boxplot(beans_data$Area, main="Area (Before)", col="lightblue")
boxplot(beans_data_clean$Area, main="Area (After)", col="lightgreen")
boxplot(beans_data$Perimeter, main="Perimeter (Before)", col="lightblue")
boxplot(beans_data_clean$Perimeter, main="Perimeter (After)", col="lightgreen")
par(mfrow=c(1,1))
# ===============================
# 4. UJI ASUMSI
# ===============================
# 4.1 Normality test for each feature
normality_test <- data.frame(Feature = character(),
Shapiro_W = numeric(),
p_value = numeric(),
Is_Normal = character(),
stringsAsFactors = FALSE)
for(i in 1:16) {
data_vec <- beans_data_clean[[i]]
if(length(data_vec) > 5000) {
data_vec <- data_vec[1:5000] # ambil subset 5000 data pertama
}
test <- shapiro.test(data_vec)
normality_test <- rbind(normality_test,
data.frame(Feature = names(beans_data_clean)[i],
Shapiro_W = test$statistic,
p_value = test$p.value,
Is_Normal = ifelse(test$p.value > 0.05, "Yes", "No"),
stringsAsFactors = FALSE))
}
print("Normality Test Results:")
## [1] "Normality Test Results:"
print(normality_test)
## Feature Shapiro_W p_value Is_Normal
## W Area 0.9176464 5.333535e-46 No
## W1 Perimeter 0.9074462 6.447543e-48 No
## W2 MajorAxisLength 0.9012350 5.310578e-49 No
## W3 MinorAxisLength 0.9345663 2.428122e-42 No
## W4 AspectRation 0.9471118 4.274959e-39 No
## W5 Eccentricity 0.8777071 1.173999e-52 No
## W6 ConvexArea 0.9167071 3.487681e-46 No
## W7 EquivDiameter 0.9212823 2.868608e-45 No
## W8 Extent 0.9593246 2.810951e-35 No
## W9 Solidity 0.9161077 2.665005e-46 No
## W10 roundness 0.9497511 2.462695e-38 No
## W11 Compactness 0.9377449 1.434372e-41 No
## W12 ShapeFactor1 0.9174663 4.914824e-46 No
## W13 ShapeFactor2 0.9082850 9.127701e-48 No
## W14 ShapeFactor3 0.9282982 8.892458e-44 No
## W15 ShapeFactor4 0.8573644 2.254607e-55 No
# 4.2 Check multivariate normality
# Using Mardia's test
mvn_result <- mvn(beans_data_clean[,1:16], mvnTest = "mardia")
print("Multivariate Normality Test:")
## [1] "Multivariate Normality Test:"
print(mvn_result$multivariateNormality)
## Test Statistic p value Result
## 1 Mardia Skewness 860670.768798702 0 NO
## 2 Mardia Kurtosis 1659.65584526544 0 NO
## 3 MVN <NA> <NA> NO
# 4.3 Homogeneity of variances - Levene's test for each feature across classes
homogeneity_test <- data.frame(Feature = character(),
Levene_F = numeric(),
p_value = numeric(),
Is_Homogeneous = character(),
stringsAsFactors = FALSE)
for(i in 1:16) {
test <- leveneTest(beans_data_clean[,i] ~ beans_data_clean$Class)
homogeneity_test <- rbind(homogeneity_test,
data.frame(Feature = names(beans_data_clean)[i],
Levene_F = test$`F value`[1],
p_value = test$`Pr(>F)`[1],
Is_Homogeneous = ifelse(test$`Pr(>F)`[1] > 0.05, "Yes", "No"),
stringsAsFactors = FALSE))
}
print("Homogeneity Test Results (Levene's Test):")
## [1] "Homogeneity Test Results (Levene's Test):"
print(homogeneity_test)
## Feature Levene_F p_value Is_Homogeneous
## 1 Area 496.23919 0.000000e+00 No
## 2 Perimeter 326.86041 0.000000e+00 No
## 3 MajorAxisLength 156.85007 2.639439e-193 No
## 4 MinorAxisLength 341.30438 0.000000e+00 No
## 5 AspectRation 66.73483 3.931943e-82 No
## 6 Eccentricity 184.69044 6.800750e-227 No
## 7 ConvexArea 499.94659 0.000000e+00 No
## 8 EquivDiameter 275.52376 0.000000e+00 No
## 9 Extent 794.41153 0.000000e+00 No
## 10 Solidity 321.66591 0.000000e+00 No
## 11 roundness 148.53594 3.334090e-183 No
## 12 Compactness 63.51696 4.246091e-78 No
## 13 ShapeFactor1 424.51415 0.000000e+00 No
## 14 ShapeFactor2 323.86561 0.000000e+00 No
## 15 ShapeFactor3 98.02706 3.812464e-121 No
## 16 ShapeFactor4 502.60350 0.000000e+00 No
# 4.4 Box's M Test untuk Homogenitas Kovarians (khusus untuk LDA)
boxm_test <- boxM(beans_data_clean[,1:16], beans_data_clean$Class)
print("Box's M Test untuk Homogenitas Kovarians:")
## [1] "Box's M Test untuk Homogenitas Kovarians:"
print(boxm_test)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: beans_data_clean[, 1:16]
## Chi-Sq (approx.) = Inf, df = 816, p-value < 2.2e-16
# ===============================
# 5. CHECK FOR MULTICOLLINEARITY
# ===============================
# Calculate correlation matrix
cor_matrix <- cor(beans_data_clean[,1:16])
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
## Area Perimeter MajorAxisLength MinorAxisLength
## Area 1.000000000 0.98659717 0.95257239 0.9172401167
## Perimeter 0.986597171 1.00000000 0.97457127 0.8784407902
## MajorAxisLength 0.952572391 0.97457127 1.00000000 0.7740749482
## MinorAxisLength 0.917240117 0.87844079 0.77407495 1.0000000000
## AspectRation 0.374527025 0.43679728 0.58521367 0.0001632239
## Eccentricity 0.389447043 0.44441965 0.58491555 0.0175523265
## ConvexArea 0.999911176 0.98735018 0.95287227 0.9167800585
## EquivDiameter 0.997156716 0.99051845 0.95886952 0.9174628218
## Extent 0.001425745 -0.04343875 -0.08982499 0.1368343106
## Solidity -0.323739946 -0.36745882 -0.32545274 -0.2314423108
## roundness -0.537027978 -0.61620139 -0.63722300 -0.2817039702
## Compactness -0.391699674 -0.45128009 -0.59738360 -0.0146667148
## ShapeFactor1 -0.891314842 -0.86020508 -0.76013005 -0.9876847242
## ShapeFactor2 -0.767005989 -0.80789271 -0.88443934 -0.4861315024
## ShapeFactor3 -0.394317937 -0.45264865 -0.59745896 -0.0177764986
## ShapeFactor4 -0.503642873 -0.52030913 -0.57130132 -0.3284895982
## AspectRation Eccentricity ConvexArea EquivDiameter
## Area 0.3745270254 0.389447043 0.9999111761 0.997156716
## Perimeter 0.4367972797 0.444419645 0.9873501808 0.990518448
## MajorAxisLength 0.5852136650 0.584915552 0.9528722719 0.958869516
## MinorAxisLength 0.0001632239 0.017552326 0.9167800585 0.917462822
## AspectRation 1.0000000000 0.956014998 0.3748611943 0.381396761
## Eccentricity 0.9560149982 1.000000000 0.3902913268 0.391527482
## ConvexArea 0.3748611943 0.390291327 1.0000000000 0.997000431
## EquivDiameter 0.3813967608 0.391527482 0.9970004312 1.000000000
## Extent -0.3608768225 -0.326316682 -0.0002785788 -0.004397572
## Solidity -0.3011626847 -0.343629449 -0.3335800821 -0.312661584
## roundness -0.7752711869 -0.754679392 -0.5409184770 -0.538795612
## Compactness -0.9909098940 -0.983978961 -0.3924214216 -0.396498691
## ShapeFactor1 0.0146316349 0.009476393 -0.8903830882 -0.901336013
## ShapeFactor2 -0.8444100855 -0.866202053 -0.7673746763 -0.777764416
## ShapeFactor3 -0.9842078774 -0.990669242 -0.3951079057 -0.398378737
## ShapeFactor4 -0.5239863863 -0.536576145 -0.5085727752 -0.501227826
## Extent Solidity roundness Compactness ShapeFactor1
## Area 0.0014257452 -0.3237399 -0.5370280 -0.391699674 -0.891314842
## Perimeter -0.0434387486 -0.3674588 -0.6162014 -0.451280091 -0.860205082
## MajorAxisLength -0.0898249885 -0.3254527 -0.6372230 -0.597383597 -0.760130055
## MinorAxisLength 0.1368343106 -0.2314423 -0.2817040 -0.014666715 -0.987684724
## AspectRation -0.3608768225 -0.3011627 -0.7752712 -0.990909894 0.014631635
## Eccentricity -0.3263166819 -0.3436294 -0.7546794 -0.983978961 0.009476393
## ConvexArea -0.0002785788 -0.3335801 -0.5409185 -0.392421422 -0.890383088
## EquivDiameter -0.0043975724 -0.3126616 -0.5387956 -0.396498691 -0.901336013
## Extent 1.0000000000 0.2054805 0.3408050 0.349088166 -0.138193285
## Solidity 0.2054804510 1.0000000 0.6492091 0.333044647 0.178403725
## roundness 0.3408049814 0.6492091 1.0000000 0.777174384 0.246300008
## Compactness 0.3490881660 0.3330446 0.7771744 1.000000000 -0.006278254
## ShapeFactor1 -0.1381932850 0.1784037 0.2463000 -0.006278254 1.000000000
## ShapeFactor2 0.2335486555 0.3811507 0.7927054 0.867915825 0.470570726
## ShapeFactor3 0.3436642824 0.3393022 0.7737780 0.998891240 -0.005009523
## ShapeFactor4 0.1476758328 0.6008967 0.5269533 0.551260801 0.296003363
## ShapeFactor2 ShapeFactor3 ShapeFactor4
## Area -0.7670060 -0.394317937 -0.5036429
## Perimeter -0.8078927 -0.452648654 -0.5203091
## MajorAxisLength -0.8844393 -0.597458958 -0.5713013
## MinorAxisLength -0.4861315 -0.017776499 -0.3284896
## AspectRation -0.8444101 -0.984207877 -0.5239864
## Eccentricity -0.8662021 -0.990669242 -0.5365761
## ConvexArea -0.7673747 -0.395107906 -0.5085728
## EquivDiameter -0.7777644 -0.398378737 -0.5012278
## Extent 0.2335487 0.343664282 0.1476758
## Solidity 0.3811507 0.339302248 0.6008967
## roundness 0.7927054 0.773777983 0.5269533
## Compactness 0.8679158 0.998891240 0.5512608
## ShapeFactor1 0.4705707 -0.005009523 0.2960034
## ShapeFactor2 1.0000000 0.871526167 0.6123671
## ShapeFactor3 0.8715262 1.000000000 0.5532295
## ShapeFactor4 0.6123671 0.553229486 1.0000000
# Visualize correlation matrix
corrplot(cor_matrix, method="circle", type="upper",
tl.col="black", tl.srt=45, addCoef.col="black")
# Calculate VIF (Variance Inflation Factor) for multicollinearity
# Create a linear model for each predictor
vif_values <- c()
for(i in 1:16) {
formula <- as.formula(paste(names(beans_data_clean)[i], "~ ."))
model <- lm(formula, data = beans_data_clean[,1:16])
vif_values <- c(vif_values, 1/(1-summary(model)$r.squared))
}
vif_df <- data.frame(Feature = names(beans_data_clean)[1:16], VIF = vif_values)
print("VIF Values:")
## [1] "VIF Values:"
print(vif_df)
## Feature VIF
## 1 Area 24347.350192
## 2 Perimeter 1021.115607
## 3 MajorAxisLength 398.451490
## 4 MinorAxisLength 433.740106
## 5 AspectRation 340.229965
## 6 Eccentricity 246.472267
## 7 ConvexArea 23404.014141
## 8 EquivDiameter 5882.597698
## 9 Extent 1.237050
## 10 Solidity 6.868558
## 11 roundness 40.456256
## 12 Compactness 6165.134693
## 13 ShapeFactor1 255.364451
## 14 ShapeFactor2 212.841779
## 15 ShapeFactor3 5927.685126
## 16 ShapeFactor4 3.198797
# ===============================
# 6. FEATURE SCALING & DIMENSIONALITY REDUCTION
# ===============================
# Standardize the features
beans_data_scaled <- beans_data_clean
beans_data_scaled[,1:16] <- scale(beans_data_clean[,1:16])
# Check summary of scaled data
summary(beans_data_scaled[,1:16])
## Area Perimeter MajorAxisLength MinorAxisLength
## Min. :-1.5769 Min. :-1.7036 Min. :-1.6753 Min. :-2.3195
## 1st Qu.:-0.7360 1st Qu.:-0.7610 1st Qu.:-0.8112 1st Qu.:-0.6940
## Median :-0.2962 Median :-0.2822 Median :-0.2725 Median :-0.1883
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5945 3rd Qu.: 0.6856 3rd Qu.: 0.7213 3rd Qu.: 0.5657
## Max. : 2.5903 Max. : 2.8554 Max. : 3.0199 Max. : 2.4551
## AspectRation Eccentricity ConvexArea EquivDiameter
## Min. :-2.3166 Min. :-2.1686 Min. :-1.5677 Min. :-1.8571
## 1st Qu.:-0.6187 1st Qu.:-0.4631 1st Qu.:-0.7373 1st Qu.:-0.7269
## Median :-0.1199 Median : 0.1245 Median :-0.2985 Median :-0.2362
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5247 3rd Qu.: 0.6739 3rd Qu.: 0.5968 3rd Qu.: 0.6334
## Max. : 2.2397 Max. : 1.8979 Max. : 2.5980 Max. : 2.6739
## Extent Solidity roundness Compactness
## Min. :-2.7860 Min. :-2.3219 Min. :-2.8469 Min. :-2.3501
## 1st Qu.:-0.6553 1st Qu.:-0.4993 1st Qu.:-0.6958 1st Qu.:-0.6050
## Median : 0.2032 Median : 0.2312 Median : 0.1631 Median : 0.0202
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7652 3rd Qu.: 0.7159 3rd Qu.: 0.7383 3rd Qu.: 0.5584
## Max. : 2.4199 Max. : 2.0201 Max. : 2.0014 Max. : 2.3035
## ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## Min. :-2.52545 Min. :-1.9397 Min. :-2.37512 Min. :-2.4066
## 1st Qu.:-0.62721 1st Qu.:-0.9418 1st Qu.:-0.62874 1st Qu.:-0.5025
## Median : 0.06215 Median :-0.0316 Median :-0.01597 Median : 0.3111
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.63828 3rd Qu.: 0.7617 3rd Qu.: 0.53552 3rd Qu.: 0.7669
## Max. : 2.53653 Max. : 3.2676 Max. : 2.28190 Max. : 1.3271
# Perform PCA for dimensionality reduction
pca_result <- prcomp(beans_data_scaled[,1:16], scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.0792 1.9756 1.04626 0.90176 0.67244 0.34358 0.24442
## Proportion of Variance 0.5926 0.2439 0.06842 0.05082 0.02826 0.00738 0.00373
## Cumulative Proportion 0.5926 0.8365 0.90495 0.95577 0.98403 0.99141 0.99515
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.19263 0.16784 0.08997 0.04291 0.03666 0.02921 0.01220
## Proportion of Variance 0.00232 0.00176 0.00051 0.00012 0.00008 0.00005 0.00001
## Cumulative Proportion 0.99747 0.99923 0.99973 0.99985 0.99993 0.99998 0.99999
## PC15 PC16
## Standard deviation 0.00916 0.004594
## Proportion of Variance 0.00001 0.000000
## Cumulative Proportion 1.00000 1.000000
# Plot the variance explained by each principal component
fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 80))
# Plot the PCA biplot
# Pastikan kolom Class sebagai faktor
beans_data_clean$Class <- as.factor(beans_data_clean$Class)
# Visualisasi PCA
fviz_pca_biplot(pca_result,
geom.ind = "point",
col.ind = beans_data_clean$Class,
addEllipses = TRUE,
label = "var",
repel = TRUE,
legend.title = "Bean Class")
# Determine optimal number of PCs
# Based on eigenvalues > 1 or cumulative variance > 80%
eigenvalues <- pca_result$sdev^2
cumulative_var <- cumsum(eigenvalues) / sum(eigenvalues) * 100
optimal_pcs <- which(cumulative_var >= 80)[1]
cat("Optimal number of PCs explaining at least 80% variance:", optimal_pcs, "\n")
## Optimal number of PCs explaining at least 80% variance: 2
# Extract the principal components
pca_data <- data.frame(pca_result$x[, 1:optimal_pcs])
pca_data$Class <- beans_data_scaled$Class
# ===============================
# 7. FEATURE SELECTION
# ===============================
# Calculate variable importance using Random Forest
# This will help identify the most important features
set.seed(123)
ctrl <- trainControl(method = "cv", number = 5)
rf_model <- train(Class ~ ., data = beans_data_scaled, method = "rf",
trControl = ctrl, importance = TRUE)
# Extract variable importance
# Simpan objek importance lengkap
var_imp <- varImp(rf_model)
# Cetak importance secara tabel
var_imp_values <- var_imp$importance
print("Variable Importance:")
## [1] "Variable Importance:"
print(var_imp_values)
## BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER
## Area 10.786970 4.4854697 14.91405 12.93919 7.313646 6.288466
## Perimeter 29.105762 27.6388564 27.89421 36.15015 17.645823 15.604728
## MajorAxisLength 33.726454 25.0460174 37.35231 23.80197 24.436220 21.684190
## MinorAxisLength 12.452279 10.6853828 20.14285 26.55494 24.101782 17.161057
## AspectRation 12.226981 2.9174234 10.52506 13.81593 12.772151 12.358983
## Eccentricity 11.314938 3.4156069 10.16281 14.17070 12.122340 12.290797
## ConvexArea 15.854161 10.6192277 16.67338 19.46101 11.784507 9.308329
## EquivDiameter 11.657927 11.9792316 14.40869 12.36254 7.035064 6.435190
## Extent 5.891208 0.0000000 13.31769 13.11664 5.930817 10.034831
## Solidity 36.294495 0.3725105 41.84971 34.32757 15.797053 16.148800
## roundness 84.861358 11.5615517 95.50293 41.28527 10.173186 15.002967
## Compactness 23.272124 6.8146040 23.81119 19.21221 25.852829 30.417700
## ShapeFactor1 18.395848 40.3209934 32.84573 33.45926 31.651777 34.078506
## ShapeFactor2 12.834036 5.3311742 19.72206 10.99420 8.450468 11.546878
## ShapeFactor3 22.845788 6.3203827 24.51649 17.63565 25.837314 31.246898
## ShapeFactor4 30.787425 1.0199091 100.00000 37.56966 35.477065 50.365168
## SIRA
## Area 21.91489
## Perimeter 47.76945
## MajorAxisLength 24.64229
## MinorAxisLength 22.20333
## AspectRation 20.05592
## Eccentricity 18.73132
## ConvexArea 24.59226
## EquivDiameter 23.20578
## Extent 11.54541
## Solidity 25.98918
## roundness 77.00072
## Compactness 32.54809
## ShapeFactor1 35.48065
## ShapeFactor2 14.92718
## ShapeFactor3 33.51933
## ShapeFactor4 47.77369
# Tampilkan plot 20 fitur teratas
plot(var_imp, top = 20)
# Get the variable importance values
var_imp_values <- var_imp$importance
var_imp_df <- data.frame(
Feature = rownames(var_imp_values),
Importance = rowMeans(var_imp_values) # gunakan rata-rata importance dari semua kelas
)
var_imp_df <- var_imp_df[order(-var_imp_df$Importance), ] # urut descending
print("Variable Importance Ranking:")
## [1] "Variable Importance Ranking:"
print(var_imp_df)
## Feature Importance
## roundness roundness 47.912569
## ShapeFactor4 ShapeFactor4 43.284702
## ShapeFactor1 ShapeFactor1 32.318966
## Perimeter Perimeter 28.829855
## MajorAxisLength MajorAxisLength 27.241350
## Solidity Solidity 24.397046
## Compactness Compactness 23.132677
## ShapeFactor3 ShapeFactor3 23.131693
## MinorAxisLength MinorAxisLength 19.043089
## ConvexArea ConvexArea 15.470410
## EquivDiameter EquivDiameter 12.440631
## AspectRation AspectRation 12.096065
## ShapeFactor2 ShapeFactor2 11.972284
## Eccentricity Eccentricity 11.744073
## Area Area 11.234669
## Extent Extent 8.548085
# Select top features (e.g., top 10)
top_features <- var_imp_df$Feature[1:10]
print("Top 10 Features:")
## [1] "Top 10 Features:"
print(top_features)
## [1] "roundness" "ShapeFactor4" "ShapeFactor1" "Perimeter"
## [5] "MajorAxisLength" "Solidity" "Compactness" "ShapeFactor3"
## [9] "MinorAxisLength" "ConvexArea"
# Create a dataset with only the selected features
selected_features <- c(top_features, "Class")
beans_selected <- beans_data_scaled[, names(beans_data_scaled) %in% selected_features]
# ===============================
# 8. DATA SPLITTING
# ===============================
# Split data into training and testing sets
set.seed(123)
training_indices <- createDataPartition(beans_selected$Class, p = 0.7, list = FALSE)
training_data <- beans_selected[training_indices, ]
testing_data <- beans_selected[-training_indices, ]
# Make sure the Class variable is a factor
training_data$Class <- as.factor(training_data$Class)
testing_data$Class <- as.factor(testing_data$Class)
# ===============================
# 9. PEMODELAN
# ===============================
# 9.1 Linear Discriminant Analysis (LDA)
lda_model <- lda(Class ~ ., data = training_data)
lda_pred <- predict(lda_model, testing_data)
lda_conf_matrix <- confusionMatrix(lda_pred$class, testing_data$Class)
print("LDA Model Results:")
## [1] "LDA Model Results:"
print(lda_conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
## BARBUNYA 343 0 6 1 0 3 3
## BOMBAY 0 155 0 0 0 0 0
## CALI 23 1 449 0 6 0 0
## DERMASON 0 0 0 863 3 5 31
## HOROZ 0 0 5 0 520 0 4
## SEKER 1 0 1 24 0 548 4
## SIRA 29 0 28 175 29 52 748
##
## Overall Statistics
##
## Accuracy : 0.8931
## 95% CI : (0.8832, 0.9024)
## No Information Rate : 0.2618
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8708
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: BARBUNYA Class: BOMBAY Class: CALI Class: DERMASON
## Sensitivity 0.86616 0.99359 0.9182 0.8119
## Specificity 0.99645 1.00000 0.9916 0.9870
## Pos Pred Value 0.96348 1.00000 0.9374 0.9568
## Neg Pred Value 0.98569 0.99974 0.9888 0.9367
## Prevalence 0.09754 0.03842 0.1204 0.2618
## Detection Rate 0.08448 0.03818 0.1106 0.2126
## Detection Prevalence 0.08768 0.03818 0.1180 0.2222
## Balanced Accuracy 0.93131 0.99679 0.9549 0.8994
## Class: HOROZ Class: SEKER Class: SIRA
## Sensitivity 0.9319 0.9013 0.9468
## Specificity 0.9974 0.9913 0.9043
## Pos Pred Value 0.9830 0.9481 0.7050
## Neg Pred Value 0.9892 0.9828 0.9860
## Prevalence 0.1374 0.1498 0.1946
## Detection Rate 0.1281 0.1350 0.1842
## Detection Prevalence 0.1303 0.1424 0.2613
## Balanced Accuracy 0.9647 0.9463 0.9256
lda_accuracy <- lda_conf_matrix$overall["Accuracy"]
# 9.2 Multinomial Logistic Regression
multinom_model <- multinom(Class ~ ., data = training_data, trace = FALSE)
multinom_pred <- predict(multinom_model, testing_data)
multinom_conf_matrix <- confusionMatrix(multinom_pred, testing_data$Class)
print("Multinomial Regression Results:")
## [1] "Multinomial Regression Results:"
print(multinom_conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
## BARBUNYA 370 0 11 1 0 8 2
## BOMBAY 0 156 0 0 0 0 0
## CALI 16 0 465 0 10 0 3
## DERMASON 0 0 0 961 3 9 72
## HOROZ 0 0 7 2 532 0 12
## SEKER 2 0 1 21 0 570 14
## SIRA 8 0 5 78 13 21 687
##
## Overall Statistics
##
## Accuracy : 0.9214
## 95% CI : (0.9127, 0.9295)
## No Information Rate : 0.2618
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.905
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: BARBUNYA Class: BOMBAY Class: CALI Class: DERMASON
## Sensitivity 0.93434 1.00000 0.9509 0.9040
## Specificity 0.99400 1.00000 0.9919 0.9720
## Pos Pred Value 0.94388 1.00000 0.9413 0.9196
## Neg Pred Value 0.99291 1.00000 0.9933 0.9662
## Prevalence 0.09754 0.03842 0.1204 0.2618
## Detection Rate 0.09113 0.03842 0.1145 0.2367
## Detection Prevalence 0.09655 0.03842 0.1217 0.2574
## Balanced Accuracy 0.96417 1.00000 0.9714 0.9380
## Class: HOROZ Class: SEKER Class: SIRA
## Sensitivity 0.9534 0.9375 0.8696
## Specificity 0.9940 0.9890 0.9618
## Pos Pred Value 0.9620 0.9375 0.8461
## Neg Pred Value 0.9926 0.9890 0.9683
## Prevalence 0.1374 0.1498 0.1946
## Detection Rate 0.1310 0.1404 0.1692
## Detection Prevalence 0.1362 0.1498 0.2000
## Balanced Accuracy 0.9737 0.9632 0.9157
multinom_accuracy <- multinom_conf_matrix$overall["Accuracy"]
# ===============================
# 10. UJI SIGNIFIKANSI VARIABEL
# ===============================
# 10.1 Untuk LDA - Wilks' Lambda Test
print("=== UJI SIGNIFIKANSI UNTUK LDA ===")
## [1] "=== UJI SIGNIFIKANSI UNTUK LDA ==="
wilks_test <- greedy.wilks(Class ~ ., data = training_data, niveau = 0.05)
print("Wilks' Lambda Test - Significant Variables:")
## [1] "Wilks' Lambda Test - Significant Variables:"
print(wilks_test)
## Formula containing included variables:
##
## Class ~ Perimeter + Compactness + ShapeFactor3 + MajorAxisLength +
## ConvexArea + ShapeFactor1 + roundness + ShapeFactor4 + MinorAxisLength +
## Solidity
## <environment: 0x000002a474d89230>
##
##
## Values calculated in each step of the selection procedure:
##
## vars Wilks.lambda F.statistics.overall p.value.overall
## 1 Perimeter 9.445946e-02 15140.361 0
## 2 Compactness 1.954504e-02 9716.444 0
## 3 ShapeFactor3 6.135831e-03 7525.299 0
## 4 MajorAxisLength 1.954944e-03 6853.658 0
## 5 ConvexArea 4.143312e-04 7589.510 0
## 6 ShapeFactor1 1.640545e-04 7251.793 0
## 7 roundness 1.297089e-04 6071.912 0
## 8 ShapeFactor4 1.084038e-04 5236.799 0
## 9 MinorAxisLength 9.508997e-05 4603.133 0
## 10 Solidity 9.061918e-05 4059.807 0
## F.statistics.diff p.value.diff
## 1 15140.36073 0
## 2 6052.80853 0
## 3 3450.73658 0
## 4 3376.52738 0
## 5 5869.97725 0
## 6 2408.11283 0
## 7 417.92679 0
## 8 310.16393 0
## 9 220.94073 0
## 10 77.84407 0
# Alternatif: stepwise selection untuk LDA
lda_stepwise <- stepclass(Class ~ ., data = training_data,
method = "lda", direction = "both")
## `stepwise classification', using 10-fold cross-validated correctness rate of method lda'.
## 9483 observations of 10 variables in 7 classes; direction: both
## stop criterion: improvement less than 5%.
## correctness rate: 0.63883; in: "Perimeter"; variables (1): Perimeter
## correctness rate: 0.88052; in: "Compactness"; variables (2): Perimeter, Compactness
##
## hr.elapsed min.elapsed sec.elapsed
## 0.00 0.00 3.55
print("Stepwise Variable Selection for LDA:")
## [1] "Stepwise Variable Selection for LDA:"
print(lda_stepwise)
## method : lda
## final model : Class ~ Perimeter + Compactness
## <environment: 0x000002a397d0cc10>
##
## correctness rate = 0.8805
# 10.2 Untuk Multinomial Regression
print("=== UJI SIGNIFIKANSI UNTUK MULTINOMIAL REGRESSION ===")
## [1] "=== UJI SIGNIFIKANSI UNTUK MULTINOMIAL REGRESSION ==="
# Uji Serentak (Likelihood Ratio Test)
null_model <- multinom(Class ~ 1, data = training_data, trace = FALSE)
full_model <- multinom_model
# Likelihood Ratio Test
lr_test <- anova(null_model, full_model)
print("Likelihood Ratio Test (Uji Serentak):")
## [1] "Likelihood Ratio Test (Uji Serentak):"
print(lr_test)
## Likelihood ratio tests of Multinomial Models
##
## Response: Class
## Model
## 1 1
## 2 Perimeter + MajorAxisLength + MinorAxisLength + ConvexArea + Solidity + roundness + Compactness + ShapeFactor1 + ShapeFactor3 + ShapeFactor4
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 56892 34781.10
## 2 56832 3806.58 1 vs 2 60 30974.52 0
# Uji Parsial (Wald Test) - melalui summary
multinom_summary <- summary(multinom_model)
z_values <- multinom_summary$coefficients / multinom_summary$standard.errors
print("Wald Test (Uji Parsial) - Z values:")
## [1] "Wald Test (Uji Parsial) - Z values:"
print(z_values)
## (Intercept) Perimeter MajorAxisLength MinorAxisLength ConvexArea
## BOMBAY -0.7679093 -0.1812417 1.0676188 -1.0585723 -0.4769655
## CALI -0.3692284 -0.9855241 0.8648163 -0.5276840 0.0672000
## DERMASON -1.4774626 1.2365198 -0.1134052 0.7016587 -0.9766399
## HOROZ 3.1104326 0.4204100 0.8016545 2.0159093 -1.4275290
## SEKER 0.5831967 1.1712832 0.8686602 -2.0243903 -0.6894104
## SIRA 1.5608022 -2.6567269 1.3829367 1.1207741 -0.8574066
## Solidity roundness Compactness ShapeFactor1 ShapeFactor3
## BOMBAY 0.3299333 0.3407732 0.1370328 -0.25141968 0.07664869
## CALI 5.1325460 0.7029320 -0.1549680 -0.35278931 0.25005352
## DERMASON 3.6073361 2.5973682 -0.2989232 0.76773290 0.21949292
## HOROZ 4.9459695 1.4601464 -2.5182130 2.58229273 3.16033188
## SEKER 7.1502793 0.6572262 0.4778835 0.04913254 0.19697796
## SIRA 3.8849571 -1.5697169 0.5952005 -0.30880428 -0.81541190
## ShapeFactor4
## BOMBAY -0.3486730
## CALI -11.1719096
## DERMASON -3.9080630
## HOROZ -8.8925515
## SEKER 0.5931435
## SIRA -6.7066070
# P-values untuk Wald test
p_values <- 2 * (1 - pnorm(abs(z_values)))
print("P-values for Wald Test:")
## [1] "P-values for Wald Test:"
print(p_values)
## (Intercept) Perimeter MajorAxisLength MinorAxisLength ConvexArea
## BOMBAY 0.442541053 0.856177826 0.2856925 0.28979460 0.6333867
## CALI 0.711957472 0.324366691 0.3871396 0.59771867 0.9464225
## DERMASON 0.139551668 0.216265408 0.9097093 0.48289205 0.3287475
## HOROZ 0.001868135 0.674185994 0.4227529 0.04380946 0.1534275
## SEKER 0.559760895 0.241484966 0.3850330 0.04293001 0.4905651
## SIRA 0.118570418 0.007890335 0.1666843 0.26238405 0.3912202
## Solidity roundness Compactness ShapeFactor1 ShapeFactor3
## BOMBAY 7.414504e-01 0.733274309 0.8910049 0.80148965 0.938903025
## CALI 2.858488e-07 0.482098154 0.8768465 0.72424640 0.802545960
## DERMASON 3.093569e-04 0.009394118 0.7649987 0.44264589 0.826266091
## HOROZ 7.576583e-07 0.144249845 0.0117952 0.00981463 0.001575895
## SEKER 8.659740e-13 0.511035485 0.6327331 0.96081367 0.843844787
## SIRA 1.023480e-04 0.116480980 0.5517095 0.75747042 0.414836576
## ShapeFactor4
## BOMBAY 7.273348e-01
## CALI 0.000000e+00
## DERMASON 9.303903e-05
## HOROZ 0.000000e+00
## SEKER 5.530852e-01
## SIRA 1.992029e-11
# ===============================
# 11. INTERPRETASI MODEL
# ===============================
# 11.1 Interpretasi LDA
print("=== INTERPRETASI LDA ===")
## [1] "=== INTERPRETASI LDA ==="
print("Linear Discriminant Functions:")
## [1] "Linear Discriminant Functions:"
print(lda_model$scaling)
## LD1 LD2 LD3 LD4 LD5
## Perimeter 14.759517512 4.94667435 2.0984460 0.48459912 9.4637508
## MajorAxisLength 15.503193202 3.53729438 2.4468627 1.52466353 -1.8528240
## MinorAxisLength 4.691705213 0.13659826 -2.4890270 2.66798012 6.5901292
## ConvexArea -19.720072249 -6.72246228 -2.4128740 -5.34900200 -10.6661952
## Solidity -0.374710813 -0.08062581 0.1501279 0.09255218 -0.1683759
## roundness 2.723733592 1.41131563 0.7968456 -0.13601989 -0.1554135
## Compactness 20.437435753 9.64079102 -3.1926852 -34.86415992 1.9932279
## ShapeFactor1 7.554851617 1.84644401 0.2424708 -0.84565113 3.7952784
## ShapeFactor3 -14.828771090 -7.03416144 2.9042898 34.23645984 -3.2874574
## ShapeFactor4 -0.001651586 0.16263670 -0.4225245 0.02166458 0.8005345
## LD6
## Perimeter -2.18996702
## MajorAxisLength 1.05456814
## MinorAxisLength 0.96165953
## ConvexArea 5.55197015
## Solidity -0.03176155
## roundness -0.53821410
## Compactness -9.13281139
## ShapeFactor1 5.11236051
## ShapeFactor3 11.37576901
## ShapeFactor4 0.20836509
# Hitung group means untuk interpretasi
group_means <- aggregate(training_data[,1:10], by = list(training_data$Class), mean)
print("Group Means by Class:")
## [1] "Group Means by Class:"
print(group_means)
## Group.1 Perimeter MajorAxisLength MinorAxisLength ConvexArea Solidity
## 1 BARBUNYA 1.0475455 0.6485612 1.2612685 1.0498030 -1.13361276
## 2 BOMBAY 2.8500216 2.9149811 2.4550829 2.5979875 -0.05029267
## 3 CALI 1.1174012 1.1395376 1.1551181 1.3450959 -0.47662784
## 4 DERMASON -0.9637978 -0.8932994 -1.0084744 -0.9557661 0.25033834
## 5 HOROZ 0.3885068 0.6771079 -0.4361962 0.1894566 -0.33277447
## 6 SEKER -0.6315100 -0.8344825 0.1032214 -0.5495921 0.84152134
## 7 SIRA -0.2696673 -0.2355313 -0.2420822 -0.2915444 0.13927551
## roundness Compactness ShapeFactor1 ShapeFactor3 ShapeFactor4
## 1 -1.2163342 0.07032166 -1.1203010 0.04549504 0.14600407
## 2 -0.1848996 -0.14794967 -2.5240356 -0.17385898 -0.94419655
## 3 -0.4831725 -0.71076066 -1.0317637 -0.72487646 -1.20961526
## 4 0.5895301 0.30577979 1.0855891 0.28090952 0.47471932
## 5 -1.3554422 -1.61999726 0.3906743 -1.55445096 -0.71161343
## 6 1.2163076 1.57612738 -0.2257978 1.63395670 0.92473859
## 7 0.1819127 -0.05091467 0.1351048 -0.08114952 0.01369591
# 11.2 Interpretasi Multinomial Regression - Odd Ratios
print("=== INTERPRETASI MULTINOMIAL REGRESSION ===")
## [1] "=== INTERPRETASI MULTINOMIAL REGRESSION ==="
odd_ratios <- exp(multinom_summary$coefficients)
print("Odd Ratios:")
## [1] "Odd Ratios:"
print(odd_ratios)
## (Intercept) Perimeter MajorAxisLength MinorAxisLength ConvexArea
## BOMBAY 0.01601843 2.471512e-03 9.498714e+17 1.440019e-08 1.004895e-04
## CALI 0.59515015 8.838517e-06 4.518984e+06 3.858544e-02 1.942954e+00
## DERMASON 0.07949650 3.070723e+06 1.813788e-02 1.040927e+04 3.872338e-14
## HOROZ 35.73353033 2.985685e+01 3.201300e+05 1.857331e+06 7.033725e-08
## SEKER 2.29295335 5.719279e+04 9.261936e+07 5.553635e-07 6.569811e-06
## SIRA 8.31672288 4.435200e-18 2.554621e+14 3.909032e+04 3.645729e-08
## Solidity roundness Compactness ShapeFactor1 ShapeFactor3
## BOMBAY 2.725721 22.71965433 1.549368e+05 2.296750e-02 2.555587e+02
## CALI 3.446898 4.95316082 1.814844e-02 7.785184e-02 1.619715e+02
## DERMASON 3.937203 121.47945520 6.833611e-06 5.302503e+03 6.762745e+02
## HOROZ 4.194345 8.73785706 1.640875e-25 7.649395e+07 3.130780e+24
## SEKER 19.280550 2.75959394 9.557252e+04 1.478044e+00 3.734862e+01
## SIRA 3.420948 0.02759943 5.056631e+07 5.922998e-02 8.810983e-09
## ShapeFactor4
## BOMBAY 0.28719590
## CALI 0.05747768
## DERMASON 0.25173546
## HOROZ 0.07900404
## SEKER 1.24157441
## SIRA 0.13571992
# ===============================
# 12. EVALUASI MODEL
# ===============================
# Compare the two models
model_comparison <- data.frame(
Model = c("Linear Discriminant Analysis", "Multinomial Logistic Regression"),
Accuracy = c(lda_accuracy, multinom_accuracy)
)
print("Model Comparison:")
## [1] "Model Comparison:"
print(model_comparison)
## Model Accuracy
## 1 Linear Discriminant Analysis 0.8931034
## 2 Multinomial Logistic Regression 0.9214286
# Cross-validation for more robust evaluation
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
# Cross-validate LDA
lda_cv <- train(Class ~ ., data = beans_selected,
method = "lda",
trControl = ctrl,
metric = "Accuracy")
# Cross-validate Multinomial Regression
multinom_cv <- train(Class ~ ., data = beans_selected,
method = "multinom",
trControl = ctrl,
metric = "Accuracy",
trace = FALSE)
# Compare CV results
cv_results <- resamples(list(LDA = lda_cv, Multinom = multinom_cv))
summary(cv_results)
##
## Call:
## summary.resamples(object = cv_results)
##
## Models: LDA, Multinom
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.8803545 0.8890942 0.8947950 0.8950766 0.9020695 0.9077491 0
## Multinom 0.9041298 0.9240998 0.9254062 0.9242447 0.9287823 0.9319527 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.8552953 0.8658171 0.8728968 0.8731431 0.8815879 0.8884639 0
## Multinom 0.8839937 0.9081776 0.9097748 0.9083625 0.9138306 0.9176596 0
bwplot(cv_results)
# Calculate precision, recall, and F1-score for each class
lda_metrics <- data.frame(
Class = levels(testing_data$Class),
Precision = lda_conf_matrix$byClass[, "Precision"],
Recall = lda_conf_matrix$byClass[, "Sensitivity"],
F1_Score = lda_conf_matrix$byClass[, "F1"]
)
multinom_metrics <- data.frame(
Class = levels(testing_data$Class),
Precision = multinom_conf_matrix$byClass[, "Precision"],
Recall = multinom_conf_matrix$byClass[, "Sensitivity"],
F1_Score = multinom_conf_matrix$byClass[, "F1"]
)
print("LDA Metrics by Class:")
## [1] "LDA Metrics by Class:"
print(lda_metrics)
## Class Precision Recall F1_Score
## Class: BARBUNYA BARBUNYA 0.9634831 0.8661616 0.9122340
## Class: BOMBAY BOMBAY 1.0000000 0.9935897 0.9967846
## Class: CALI CALI 0.9373695 0.9182004 0.9276860
## Class: DERMASON DERMASON 0.9567627 0.8118532 0.8783715
## Class: HOROZ HOROZ 0.9829868 0.9318996 0.9567617
## Class: SEKER SEKER 0.9480969 0.9013158 0.9241147
## Class: SIRA SIRA 0.7049953 0.9468354 0.8082118
print("Multinomial Regression Metrics by Class:")
## [1] "Multinomial Regression Metrics by Class:"
print(multinom_metrics)
## Class Precision Recall F1_Score
## Class: BARBUNYA BARBUNYA 0.9438776 0.9343434 0.9390863
## Class: BOMBAY BOMBAY 1.0000000 1.0000000 1.0000000
## Class: CALI CALI 0.9412955 0.9509202 0.9460834
## Class: DERMASON DERMASON 0.9196172 0.9040452 0.9117647
## Class: HOROZ HOROZ 0.9620253 0.9534050 0.9576958
## Class: SEKER SEKER 0.9375000 0.9375000 0.9375000
## Class: SIRA SIRA 0.8460591 0.8696203 0.8576779
# ===============================
# 13. ROC ANALYSIS
# ===============================
print("=== ROC ANALYSIS ===")
## [1] "=== ROC ANALYSIS ==="
# Konversi prediksi ke probabilitas
multinom_probs_test <- predict(multinom_model, testing_data, type = "probs")
lda_probs_test <- predict(lda_model, testing_data)$posterior
# Hitung ROC untuk setiap model
print("ROC Analysis per Class:")
## [1] "ROC Analysis per Class:"
for(class_name in levels(testing_data$Class)) {
# Untuk Multinomial
binary_actual <- ifelse(testing_data$Class == class_name, 1, 0)
multinom_roc <- roc(binary_actual, multinom_probs_test[,class_name], quiet = TRUE)
lda_roc <- roc(binary_actual, lda_probs_test[,class_name], quiet = TRUE)
cat("Class:", class_name, "\n")
cat("Multinomial AUC:", round(auc(multinom_roc), 4), "\n")
cat("LDA AUC:", round(auc(lda_roc), 4), "\n\n")
}
## Class: BARBUNYA
## Multinomial AUC: 0.9981
## LDA AUC: 0.9969
##
## Class: BOMBAY
## Multinomial AUC: 1
## LDA AUC: 1
##
## Class: CALI
## Multinomial AUC: 0.9979
## LDA AUC: 0.9969
##
## Class: DERMASON
## Multinomial AUC: 0.9918
## LDA AUC: 0.9894
##
## Class: HOROZ
## Multinomial AUC: 0.998
## LDA AUC: 0.9978
##
## Class: SEKER
## Multinomial AUC: 0.9967
## LDA AUC: 0.9932
##
## Class: SIRA
## Multinomial AUC: 0.984
## LDA AUC: 0.9766
# ===============================
# 14. VISUALISASI
# ===============================
# Visualize the performance metrics
metrics_long <- rbind(
data.frame(Model = "LDA", lda_metrics, stringsAsFactors = FALSE),
data.frame(Model = "Multinom", multinom_metrics, stringsAsFactors = FALSE)
)
metrics_long_tidy <- pivot_longer(metrics_long,
cols = c("Precision", "Recall", "F1_Score"),
names_to = "Metric",
values_to = "Value")
ggplot(metrics_long_tidy, aes(x = Class, y = Value, fill = Model)) +
geom_bar(stat = "identity", position = position_dodge()) +
facet_wrap(~ Metric, scales = "free") +
labs(title = "Performance Metrics by Class and Model",
y = "Value", x = "Bean Class") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Create a heatmap of the LDA coefficients
lda_coeffs <- lda_model$scaling
lda_coeffs_df <- as.data.frame(lda_coeffs)
lda_coeffs_tidy <- pivot_longer(data.frame(
Feature = rownames(lda_coeffs),
lda_coeffs
), cols = -Feature, names_to = "LD", values_to = "Coefficient")
ggplot(lda_coeffs_tidy, aes(x = LD, y = Feature, fill = Coefficient)) +
geom_tile() +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
labs(title = "LDA Coefficients Heatmap",
x = "Linear Discriminant", y = "Feature") +
theme_minimal()
# ===============================
# 15. MDS VISUALIZATION
# ===============================
# Create a distance matrix for MDS
# Using a subset of the data for visualization clarity
set.seed(123)
subset_indices <- sample(1:nrow(beans_data_clean), 300)
subset_data <- beans_data_clean[subset_indices, ]
# Calculate Euclidean distance matrix
dist_matrix <- dist(subset_data[, 1:16], method = "euclidean")
# Perform MDS
mds_result <- cmdscale(dist_matrix, k = 2, eig = TRUE)
mds_coords <- data.frame(mds_result$points)
names(mds_coords) <- c("Dim.1", "Dim.2")
mds_coords$Class <- subset_data$Class
# Plot MDS results
ggplot(mds_coords, aes(x = Dim.1, y = Dim.2, color = Class)) +
geom_point(size = 3, alpha = 0.7) +
labs(title = "MDS Representation of Bean Classes",
x = "Dimension 1", y = "Dimension 2") +
theme_minimal() +
theme(legend.position = "right")
# ===============================
# 16. PCA BIPLOT
# ===============================
# Create BiPlot from PCA
# Using the first two principal components
biplot_data <- data.frame(PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Class = beans_data_clean$Class)
# Calculate variance explained by the first two PCs
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
pct_var_explained <- round(var_explained * 100, 1)
# Create biplot
ggplot(biplot_data, aes(x = PC1, y = PC2, color = Class)) +
geom_point(alpha = 0.7) +
labs(title = "PCA Biplot of Bean Classes",
x = paste0("PC1 (", pct_var_explained[1], "% variance)"),
y = paste0("PC2 (", pct_var_explained[2], "% variance)")) +
theme_minimal() +
theme(legend.position = "right") +
# Add eigenvectors
geom_segment(data = data.frame(
x1 = 0, y1 = 0,
x2 = pca_result$rotation[, 1] * 5,
y2 = pca_result$rotation[, 2] * 5,
Feature = rownames(pca_result$rotation)
), aes(x = x1, y = y1, xend = x2, yend = y2),
arrow = arrow(length = unit(0.2, "cm")), color = "red") +
geom_text(data = data.frame(
x = pca_result$rotation[, 1] * 5.5,
y = pca_result$rotation[, 2] * 5.5,
Feature = rownames(pca_result$rotation)
), aes(x = x, y = y, label = Feature), color = "black", size = 3)
# ===============================
# 17. MODEL COMPARISON WITH AIC
# ===============================
# Calculate the likelihood and AIC for both models
# For LDA
lda_probs <- predict(lda_model, testing_data, type = "posterior")$posterior
lda_log_likelihood <- sum(log(sapply(1:nrow(testing_data), function(i)
lda_probs[i, as.character(testing_data$Class[i])])))
lda_params <- ncol(lda_model$scaling) * (nrow(lda_model$scaling) + ncol(lda_model$scaling))
lda_aic <- -2 * lda_log_likelihood + 2 * lda_params
# For Multinomial
multinom_probs <- fitted(multinom_model)
multinom_log_likelihood <- logLik(multinom_model)
multinom_aic <- AIC(multinom_model)
# Display AIC and likelihood values
model_metrics <- data.frame(
Model = c("LDA", "Multinomial"),
Log_Likelihood = c(lda_log_likelihood, as.numeric(multinom_log_likelihood)),
AIC = c(lda_aic, multinom_aic)
)
print("Model Comparison by Likelihood and AIC:")
## [1] "Model Comparison by Likelihood and AIC:"
print(model_metrics)
## Model Log_Likelihood AIC
## 1 LDA -1788.514 3769.027
## 2 Multinomial -1903.290 3938.580
# ===============================
# 18. ADDITIONAL VISUALIZATIONS
# ===============================
# Plotting hasil uji asumsi
par(mfrow=c(2,2))
# Q-Q plots untuk normalitas
qqnorm(beans_data_clean$Area, main="Q-Q Plot: Area")
qqline(beans_data_clean$Area)
qqnorm(beans_data_clean$Perimeter, main="Q-Q Plot: Perimeter")
qqline(beans_data_clean$Perimeter)
# Scatter plot untuk homogenitas
plot(beans_data_clean$Area, beans_data_clean$Perimeter,
col=beans_data_clean$Class, main="Scatter Plot by Class")
legend("topright", legend=levels(beans_data_clean$Class),
col=1:length(levels(beans_data_clean$Class)), pch=1)
par(mfrow=c(1,1))
# ===============================
# 19. FINAL SUMMARY
# ===============================
cat("\n===== RINGKASAN ANALISIS LENGKAP =====\n")
##
## ===== RINGKASAN ANALISIS LENGKAP =====
cat("\n1. PREPROCESSING:\n")
##
## 1. PREPROCESSING:
cat("- Missing values:", missing_values, "\n")
## - Missing values: 0
cat("- Duplicate rows:", duplicate_rows, "\n")
## - Duplicate rows: 68
cat("- Outlier handling: IQR method with capping\n")
## - Outlier handling: IQR method with capping
cat("- Feature scaling: Standardization applied\n")
## - Feature scaling: Standardization applied
cat("\n2. UJI ASUMSI:\n")
##
## 2. UJI ASUMSI:
cat("- Normalitas Multivariat (Mardia):",
ifelse(mvn_result$multivariateNormality$`p value` > 0.05, "TERPENUHI", "TIDAK TERPENUHI"), "\n")
## Warning in Ops.factor(mvn_result$multivariateNormality$`p value`, 0.05): '>'
## not meaningful for factors
## - Normalitas Multivariat (Mardia): NA NA NA
cat("- Homogenitas Kovarians (Box's M):",
ifelse(boxm_test$p.value > 0.05, "TERPENUHI", "TIDAK TERPENUHI"), "\n")
## - Homogenitas Kovarians (Box's M): TIDAK TERPENUHI
cat("\n3. FEATURE SELECTION:\n")
##
## 3. FEATURE SELECTION:
cat("- Total features awal:", ncol(beans_data) - 1, "\n")
## - Total features awal: 16
cat("- Features terpilih:", length(top_features), "\n")
## - Features terpilih: 10
cat("- Top features:", paste(top_features[1:5], collapse = ", "), "...\n")
## - Top features: roundness, ShapeFactor4, ShapeFactor1, Perimeter, MajorAxisLength ...
cat("\n4. PEMODELAN:\n")
##
## 4. PEMODELAN:
cat("- Model 1: Linear Discriminant Analysis\n")
## - Model 1: Linear Discriminant Analysis
cat("- Model 2: Multinomial Logistic Regression\n")
## - Model 2: Multinomial Logistic Regression
cat("\n5. UJI SIGNIFIKANSI:\n")
##
## 5. UJI SIGNIFIKANSI:
cat("- LDA: Wilks' Lambda test dilakukan\n")
## - LDA: Wilks' Lambda test dilakukan
cat("- Multinomial: Likelihood Ratio Test dan Wald Test dilakukan\n")
## - Multinomial: Likelihood Ratio Test dan Wald Test dilakukan
cat("\n6. EVALUASI MODEL:\n")
##
## 6. EVALUASI MODEL:
cat("- LDA Accuracy:", round(lda_accuracy, 4), "\n")
## - LDA Accuracy: 0.8931
cat("- Multinomial Accuracy:", round(multinom_accuracy, 4), "\n")
## - Multinomial Accuracy: 0.9214
cat("- Best Model:", ifelse(lda_accuracy > multinom_accuracy, "LDA", "Multinomial"), "\n")
## - Best Model: Multinomial
cat("- Cross-validation performed with 10-fold CV\n")
## - Cross-validation performed with 10-fold CV
cat("\n7. INTERPRETASI:\n")
##
## 7. INTERPRETASI:
cat("- LDA: Koefisien linear discriminant dan group means\n")
## - LDA: Koefisien linear discriminant dan group means
cat("- Multinomial: Odd ratios dan signifikansi koefisien\n")
## - Multinomial: Odd ratios dan signifikansi koefisien
cat("\n8. ADDITIONAL ANALYSIS:\n")
##
## 8. ADDITIONAL ANALYSIS:
cat("- PCA: Explained variance by", optimal_pcs, "components\n")
## - PCA: Explained variance by 2 components
cat("- MDS: 2D representation created\n")
## - MDS: 2D representation created
cat("- ROC Analysis: AUC calculated for each class\n")
## - ROC Analysis: AUC calculated for each class
Kesimpulan
Dalam penelitian ini, dilakukan klasifikasi terhadap dataset Dry Bean menggunakan dua metode statistik multivariat, yaitu Linear Discriminant Analysis (LDA) dan Regresi Logistik Multinomial. Proses analisis dimulai dengan eksplorasi dan pembersihan data, termasuk penanganan outlier dengan metode IQR, penghapusan duplikasi, serta uji asumsi normalitas, homogenitas varians, dan multikolinearitas. Hasil menunjukkan bahwa sebagian besar fitur tidak normal, tidak homogen antar kelas, serta memiliki korelasi dan nilai VIF tinggi, terutama pada fitur-fitur yang secara logis memang saling berkaitan dalam pengukuran morfologi kacang.
Untuk mengurangi kompleksitas data, dilakukan Analisis Komponen Utama (PCA) yang menunjukkan bahwa dua komponen utama sudah cukup menjelaskan lebih dari 80% variansi data. Visualisasi PCA biplot juga memperlihatkan pemisahan kelas kacang yang cukup baik untuk kelas seperti SIRA dan HOROZ, namun terdapat overlap pada kelas CALI, DERMASON, dan BARBUNYA. Seleksi fitur menggunakan Random Forest menghasilkan sepuluh fitur terbaik yang berkontribusi besar terhadap akurasi klasifikasi, di antaranya roundness, Perimeter, ShapeFactor1, dan MajorAxisLength.
Evaluasi terhadap dua model klasifikasi menunjukkan bahwa Regresi Logistik Multinomial memiliki performa lebih baik dengan akurasi sebesar 92,14%, dibandingkan LDA dengan akurasi 89,31%. Kedua model cukup stabil dan mampu mengenali pola morfologi antar kelas, meskipun beberapa kelas seperti DERMASON dan SIRA masih menunjukkan kesalahan klasifikasi akibat tumpang tindih karakteristik visual. Secara keseluruhan, hasil ini menunjukkan bahwa pengklasifikasian varietas kacang secara otomatis dapat dilakukan dengan cukup akurat menggunakan pendekatan statistik modern, yang berpotensi besar dalam aplikasi industri pertanian dan teknologi pangan.