LOAD LIBRARY
packages <- c(
"readr",
"dplyr",
"tidyr",
"ggplot2",
"MASS",
"caret",
"car",
"biotools",
"corrplot",
"knitr",
"kableExtra"
)
installed <- rownames(installed.packages())
for (pkg in packages) {
if (!(pkg %in% installed)) {
install.packages(pkg)
}
}
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(biotools)
## ---
## biotools version 4.3
library(corrplot)
## corrplot 0.95 loaded
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
DESKRIPSI DATA
if (file.exists("WineQT.csv")) {
data <- read_csv("WineQT.csv", show_col_types = FALSE)
} else {
data <- read_csv(file.choose(), show_col_types = FALSE)
}
# Melihat data awal
head(data)
## # A tibble: 6 × 13
## `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # ℹ 8 more variables: `free sulfur dioxide` <dbl>,
## # `total sulfur dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>, Id <dbl>
str(data)
## spc_tbl_ [1,143 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ fixed acidity : num [1:1143] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 6.7 ...
## $ volatile acidity : num [1:1143] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.58 ...
## $ citric acid : num [1:1143] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.08 ...
## $ residual sugar : num [1:1143] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 1.8 ...
## $ chlorides : num [1:1143] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.097 ...
## $ free sulfur dioxide : num [1:1143] 11 25 15 17 11 13 15 15 9 15 ...
## $ total sulfur dioxide: num [1:1143] 34 67 54 60 34 40 59 21 18 65 ...
## $ density : num [1:1143] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:1143] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.28 ...
## $ sulphates : num [1:1143] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.54 ...
## $ alcohol : num [1:1143] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 9.2 ...
## $ quality : num [1:1143] 5 5 5 6 5 5 5 7 7 5 ...
## $ Id : num [1:1143] 0 1 2 3 4 5 6 7 8 10 ...
## - attr(*, "spec")=
## .. cols(
## .. `fixed acidity` = col_double(),
## .. `volatile acidity` = col_double(),
## .. `citric acid` = col_double(),
## .. `residual sugar` = col_double(),
## .. chlorides = col_double(),
## .. `free sulfur dioxide` = col_double(),
## .. `total sulfur dioxide` = col_double(),
## .. density = col_double(),
## .. pH = col_double(),
## .. sulphates = col_double(),
## .. alcohol = col_double(),
## .. quality = col_double(),
## .. Id = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(data)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 4.600 Min. :0.1200 Min. :0.0000 Min. : 0.900
## 1st Qu.: 7.100 1st Qu.:0.3925 1st Qu.:0.0900 1st Qu.: 1.900
## Median : 7.900 Median :0.5200 Median :0.2500 Median : 2.200
## Mean : 8.311 Mean :0.5313 Mean :0.2684 Mean : 2.532
## 3rd Qu.: 9.100 3rd Qu.:0.6400 3rd Qu.:0.4200 3rd Qu.: 2.600
## Max. :15.900 Max. :1.5800 Max. :1.0000 Max. :15.500
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 21.00 1st Qu.:0.9956
## Median :0.07900 Median :13.00 Median : 37.00 Median :0.9967
## Mean :0.08693 Mean :15.62 Mean : 45.91 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 61.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :68.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.205 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6577 Mean :10.44 Mean :5.657
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
## Id
## Min. : 0
## 1st Qu.: 411
## Median : 794
## Mean : 805
## 3rd Qu.:1210
## Max. :1597
# Mengubah nama kolom agar mudah dipanggil di R
# Contoh: fixed acidity menjadi fixed.acidity
names(data) <- make.names(names(data))
names(data)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
## [13] "Id"
# Jumlah baris dan kolom
dim(data)
## [1] 1143 13
IDENTIFIKASI DAN CLEANING DATA
# Pengecekan missing value
na_check <- data.frame(Nilai_Kosong = colSums(is.na(data)))
kable(na_check, caption = "Identifikasi Missing Value per Variabel") %>%
kable_styling(bootstrap_options = "condensed", full_width = FALSE)
| Nilai_Kosong | |
|---|---|
| fixed.acidity | 0 |
| volatile.acidity | 0 |
| citric.acid | 0 |
| residual.sugar | 0 |
| chlorides | 0 |
| free.sulfur.dioxide | 0 |
| total.sulfur.dioxide | 0 |
| density | 0 |
| pH | 0 |
| sulphates | 0 |
| alcohol | 0 |
| quality | 0 |
| Id | 0 |
# Menghapus missing value jika ada
data <- na.omit(data)
# Menghapus kolom Id jika ada, karena Id bukan variabel prediktor
if ("Id" %in% names(data)) {
data <- dplyr::select(data, -Id)
}
# Cek ulang missing value
colSums(is.na(data))
## fixed.acidity volatile.acidity citric.acid
## 0 0 0
## residual.sugar chlorides free.sulfur.dioxide
## 0 0 0
## total.sulfur.dioxide density pH
## 0 0 0
## sulphates alcohol quality
## 0 0 0
STATISTIKA DESKRIPTIF
summary(data)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.600 Min. :0.1200 Min. :0.0000 Min. : 0.900
## 1st Qu.: 7.100 1st Qu.:0.3925 1st Qu.:0.0900 1st Qu.: 1.900
## Median : 7.900 Median :0.5200 Median :0.2500 Median : 2.200
## Mean : 8.311 Mean :0.5313 Mean :0.2684 Mean : 2.532
## 3rd Qu.: 9.100 3rd Qu.:0.6400 3rd Qu.:0.4200 3rd Qu.: 2.600
## Max. :15.900 Max. :1.5800 Max. :1.0000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 21.00 1st Qu.:0.9956
## Median :0.07900 Median :13.00 Median : 37.00 Median :0.9967
## Mean :0.08693 Mean :15.62 Mean : 45.91 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 61.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :68.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.205 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6577 Mean :10.44 Mean :5.657
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# Histogram variabel numerik utama
hist(data$quality,
main = "Distribusi Quality",
xlab = "Quality",
col = "lightblue")
hist(data$alcohol,
main = "Distribusi Alcohol",
xlab = "Alcohol",
col = "lightgreen")
hist(data$volatile.acidity,
main = "Distribusi Volatile Acidity",
xlab = "Volatile Acidity",
col = "lightpink")
hist(data$sulphates,
main = "Distribusi Sulphates",
xlab = "Sulphates",
col = "lightyellow")
# Visualisasi outlier untuk seluruh variabel numerik
data_numeric <- data %>%
dplyr::select(where(is.numeric))
data_long <- data_numeric %>%
pivot_longer(cols = everything(),
names_to = "Variabel",
values_to = "Nilai")
ggplot(data_long, aes(x = "", y = Nilai, fill = Variabel)) +
geom_boxplot(outlier.color = "red", alpha = 0.7) +
facet_wrap(~Variabel, scales = "free", ncol = 3) +
theme_bw() +
labs(title = "Distribusi Outlier pada Variabel Numerik",
x = "",
y = "Nilai") +
theme(legend.position = "none")
PEMBENTUKAN VARIABEL ORDINAL
# Variabel quality asli memiliki urutan nilai, misalnya 3 < 4 < 5 < 6 < 7 < 8.
# Agar kelompok lebih seimbang dan mudah dianalisis dengan LDA, quality dibagi
# menjadi tiga kategori ordinal:
# Low = quality <= 5
# Medium = quality == 6
# High = quality >= 7
data$quality_ord <- cut(
data$quality,
breaks = c(-Inf, 5, 6, Inf),
labels = c("Low", "Medium", "High"),
ordered_result = TRUE
)
data$quality_ord <- factor(
data$quality_ord,
levels = c("Low", "Medium", "High"),
ordered = TRUE
)
table(data$quality)
##
## 3 4 5 6 7 8
## 6 33 483 462 143 16
table(data$quality_ord)
##
## Low Medium High
## 522 462 159
prop.table(table(data$quality_ord))
##
## Low Medium High
## 0.4566929 0.4041995 0.1391076
barplot(table(data$quality_ord),
main = "Distribusi Kategori Quality Ordinal",
xlab = "Kategori Quality",
ylab = "Frekuensi",
col = c("tomato", "gold", "seagreen"))
PEMILIHAN VARIABEL LDA
# LDA bekerja paling baik jika variabel independen numerik.
# Karena itu, variabel quality asli tidak digunakan sebagai prediktor.
variabel_x <- c(
"fixed.acidity",
"volatile.acidity",
"citric.acid",
"residual.sugar",
"chlorides",
"free.sulfur.dioxide",
"total.sulfur.dioxide",
"density",
"pH",
"sulphates",
"alcohol"
)
# Menyesuaikan jika ada nama kolom yang berbeda
variabel_x <- variabel_x[variabel_x %in% names(data)]
data_lda <- data %>%
dplyr::select(all_of(variabel_x), quality_ord)
str(data_lda)
## tibble [1,143 × 12] (S3: tbl_df/tbl/data.frame)
## $ fixed.acidity : num [1:1143] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 6.7 ...
## $ volatile.acidity : num [1:1143] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.58 ...
## $ citric.acid : num [1:1143] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.08 ...
## $ residual.sugar : num [1:1143] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 1.8 ...
## $ chlorides : num [1:1143] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.097 ...
## $ free.sulfur.dioxide : num [1:1143] 11 25 15 17 11 13 15 15 9 15 ...
## $ total.sulfur.dioxide: num [1:1143] 34 67 54 60 34 40 59 21 18 65 ...
## $ density : num [1:1143] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:1143] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.28 ...
## $ sulphates : num [1:1143] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.54 ...
## $ alcohol : num [1:1143] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 9.2 ...
## $ quality_ord : Ord.factor w/ 3 levels "Low"<"Medium"<..: 1 1 1 2 1 1 1 3 3 1 ...
summary(data_lda)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.600 Min. :0.1200 Min. :0.0000 Min. : 0.900
## 1st Qu.: 7.100 1st Qu.:0.3925 1st Qu.:0.0900 1st Qu.: 1.900
## Median : 7.900 Median :0.5200 Median :0.2500 Median : 2.200
## Mean : 8.311 Mean :0.5313 Mean :0.2684 Mean : 2.532
## 3rd Qu.: 9.100 3rd Qu.:0.6400 3rd Qu.:0.4200 3rd Qu.: 2.600
## Max. :15.900 Max. :1.5800 Max. :1.0000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 21.00 1st Qu.:0.9956
## Median :0.07900 Median :13.00 Median : 37.00 Median :0.9967
## Mean :0.08693 Mean :15.62 Mean : 45.91 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 61.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :68.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality_ord
## Min. :2.740 Min. :0.3300 Min. : 8.40 Low :522
## 1st Qu.:3.205 1st Qu.:0.5500 1st Qu.: 9.50 Medium:462
## Median :3.310 Median :0.6200 Median :10.20 High :159
## Mean :3.311 Mean :0.6577 Mean :10.44
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :4.010 Max. :2.0000 Max. :14.90
UJI KORELASI ANTAR VARIABEL INDEPENDEN
cor_mat <- cor(data_lda[, variabel_x])
round(cor_mat, 3)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## fixed.acidity 1.000 -0.251 0.673 0.172
## volatile.acidity -0.251 1.000 -0.544 -0.006
## citric.acid 0.673 -0.544 1.000 0.176
## residual.sugar 0.172 -0.006 0.176 1.000
## chlorides 0.108 0.056 0.245 0.071
## free.sulfur.dioxide -0.165 -0.002 -0.058 0.165
## total.sulfur.dioxide -0.111 0.078 0.037 0.191
## density 0.682 0.017 0.375 0.380
## pH -0.685 0.221 -0.546 -0.117
## sulphates 0.175 -0.276 0.331 0.017
## alcohol -0.075 -0.204 0.106 0.058
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## fixed.acidity 0.108 -0.165 -0.111 0.682
## volatile.acidity 0.056 -0.002 0.078 0.017
## citric.acid 0.245 -0.058 0.037 0.375
## residual.sugar 0.071 0.165 0.191 0.380
## chlorides 1.000 0.015 0.048 0.209
## free.sulfur.dioxide 0.015 1.000 0.661 -0.054
## total.sulfur.dioxide 0.048 0.661 1.000 0.050
## density 0.209 -0.054 0.050 1.000
## pH -0.278 0.073 -0.059 -0.353
## sulphates 0.375 0.034 0.027 0.143
## alcohol -0.230 -0.047 -0.188 -0.495
## pH sulphates alcohol
## fixed.acidity -0.685 0.175 -0.075
## volatile.acidity 0.221 -0.276 -0.204
## citric.acid -0.546 0.331 0.106
## residual.sugar -0.117 0.017 0.058
## chlorides -0.278 0.375 -0.230
## free.sulfur.dioxide 0.073 0.034 -0.047
## total.sulfur.dioxide -0.059 0.027 -0.188
## density -0.353 0.143 -0.495
## pH 1.000 -0.185 0.225
## sulphates -0.185 1.000 0.094
## alcohol 0.225 0.094 1.000
corrplot(cor_mat,
method = "color",
type = "upper",
tl.cex = 0.7,
addCoef.col = "black",
number.cex = 0.6,
col = colorRampPalette(c("#E41A1C", "white", "#377EB8"))(200))
# Mencari korelasi sangat tinggi.
# Jika korelasi mendekati 1, matriks dapat menjadi singular.
high_cor <- findCorrelation(cor_mat, cutoff = 0.90, names = TRUE)
high_cor
## character(0)
# Jika ada variabel dengan korelasi > 0.90, variabel tersebut dihapus.
if (length(high_cor) > 0) {
data_lda <- data_lda %>%
dplyr::select(-all_of(high_cor))
variabel_x <- setdiff(variabel_x, high_cor)
}
variabel_x
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol"
UJI MULTIKOLINEARITAS DENGAN VIF
vif_model <- lm(
as.numeric(quality_ord) ~ .,
data = data_lda
)
vif_values <- vif(vif_model)
vif_values
## fixed.acidity volatile.acidity citric.acid
## 7.780540 1.778704 3.222840
## residual.sugar chlorides free.sulfur.dioxide
## 1.743237 1.538470 1.906045
## total.sulfur.dioxide density pH
## 2.103748 6.595115 3.393307
## sulphates alcohol
## 1.440741 3.184642
if (is.matrix(vif_values)) {
vif_table <- as.data.frame(vif_values)
} else {
vif_table <- data.frame(VIF = vif_values)
}
kable(vif_table,
caption = "Hasil Pengujian Multikolinearitas (VIF)") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
| VIF | |
|---|---|
| fixed.acidity | 7.780540 |
| volatile.acidity | 1.778704 |
| citric.acid | 3.222841 |
| residual.sugar | 1.743237 |
| chlorides | 1.538470 |
| free.sulfur.dioxide | 1.906045 |
| total.sulfur.dioxide | 2.103747 |
| density | 6.595115 |
| pH | 3.393307 |
| sulphates | 1.440741 |
| alcohol | 3.184642 |
STANDARDISASI DATA
# Standardisasi dilakukan karena skala antar variabel berbeda.
data_scaled <- data_lda
data_scaled[, variabel_x] <- scale(data_scaled[, variabel_x])
summary(data_scaled)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. :-2.1236 Min. :-2.28988 Min. :-1.36443 Min. :-1.20373
## 1st Qu.:-0.6930 1st Qu.:-0.77290 1st Qu.:-0.90685 1st Qu.:-0.46622
## Median :-0.2352 Median :-0.06312 Median :-0.09337 Median :-0.24496
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.4514 3rd Qu.: 0.60491 3rd Qu.: 0.77096 3rd Qu.: 0.05004
## Max. : 4.3425 Max. : 5.83779 Max. : 3.71982 Max. : 9.56389
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :-1.58529 Min. :-1.4258 Min. :-1.2176 Min. :-3.45983
## 1st Qu.:-0.35823 1st Qu.:-0.8405 1st Qu.:-0.7600 1st Qu.:-0.60279
## Median :-0.16782 Median :-0.2552 Median :-0.2719 Median :-0.02619
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.06489 3rd Qu.: 0.5253 3rd Qu.: 0.4602 3rd Qu.: 0.57899
## Max. :11.08730 Max. : 5.1104 Max. : 7.4152 Max. : 3.61524
## pH sulphates alcohol quality_ord
## Min. :-3.644836 Min. :-1.9232 Min. :-1.8870 Low :522
## 1st Qu.:-0.676702 1st Qu.:-0.6321 1st Qu.:-0.8706 Medium:462
## Median :-0.006478 Median :-0.2213 Median :-0.2237 High :159
## Mean : 0.000000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.568000 3rd Qu.: 0.4243 3rd Qu.: 0.6079
## Max. : 4.461681 Max. : 7.8774 Max. : 4.1193
SPLIT DATA TRAINING DAN TESTING
set.seed(123)
index_train <- createDataPartition(
data_scaled$quality_ord,
p = 0.80,
list = FALSE
)
train_data <- data_scaled[index_train, ]
test_data <- data_scaled[-index_train, ]
table(train_data$quality_ord)
##
## Low Medium High
## 418 370 128
table(test_data$quality_ord)
##
## Low Medium High
## 104 92 31
UJI ASUMSI KESAMAAN MATRIKS KOVARIANS
# Hipotesis:
# H0: Matriks kovarians antar kelompok sama
# H1: Minimal ada satu matriks kovarians kelompok yang berbeda
#
# Keputusan:
# Jika p-value > 0.05, asumsi kesamaan kovarians terpenuhi.
# Jika p-value <= 0.05, asumsi tidak terpenuhi secara sempurna.
box_m_result <- boxM(
train_data[, variabel_x],
train_data$quality_ord
)
box_m_result
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: train_data[, variabel_x]
## Chi-Sq (approx.) = 499.99, df = 132, p-value < 2.2e-16
MODEL LDA ORDINAL
# quality_ord adalah label kategori ordinal:
# Low < Medium < High
model_lda <- lda(
quality_ord ~ .,
data = train_data
)
model_lda
## Call:
## lda(quality_ord ~ ., data = train_data)
##
## Prior probabilities of groups:
## Low Medium High
## 0.4563319 0.4039301 0.1397380
##
## Group means:
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## Low -0.110260133 0.3017243 -0.18045599 0.008487284 0.07719297
## Medium -0.007131207 -0.1547489 -0.04169996 -0.083111822 -0.04248831
## High 0.371840376 -0.7233203 0.68515881 0.249395909 -0.23145890
## free.sulfur.dioxide total.sulfur.dioxide density pH
## Low 0.09119116 0.2460172 0.15543871 -0.004034735
## Medium -0.04013776 -0.1998813 -0.08395937 0.111695308
## High -0.16979542 -0.2921946 -0.32236146 -0.242352957
## sulphates alcohol
## Low -0.2843297 -0.4846884
## Medium 0.1343133 0.1959699
## High 0.5402496 1.0399846
##
## Coefficients of linear discriminants:
## LD1 LD2
## fixed.acidity 0.320292666 -0.1491082
## volatile.acidity -0.305528521 0.4966217
## citric.acid 0.007342081 1.0164622
## residual.sugar 0.157525665 0.4700904
## chlorides -0.235641057 -0.3368592
## free.sulfur.dioxide 0.036132054 -0.3217828
## total.sulfur.dioxide -0.281068741 0.4419615
## density -0.296585443 -0.6168292
## pH -0.043149649 -0.2910780
## sulphates 0.506586361 -0.1095078
## alcohol 0.711169055 -0.2805789
##
## Proportion of trace:
## LD1 LD2
## 0.9315 0.0685
PREDIKSI MODEL
pred_lda <- predict(model_lda, newdata = test_data)
# Kelas prediksi
pred_class <- pred_lda$class
head(pred_class)
## [1] Low Low Medium Medium Low Low
## Levels: Low Medium High
# Probabilitas posterior setiap kelas
head(pred_lda$posterior)
## Low Medium High
## 1 0.7498436 0.2488625 0.001293881
## 2 0.5028424 0.4338747 0.063282938
## 3 0.1113364 0.6276220 0.261041616
## 4 0.4325203 0.4818492 0.085630524
## 5 0.5175663 0.4724009 0.010032765
## 6 0.7680478 0.2306181 0.001334101
# Nilai diskriminan
head(pred_lda$x)
## LD1 LD2
## 1 -1.61049044 -1.3669911
## 2 0.01234132 0.6806346
## 3 1.34453123 -0.5028942
## 4 0.21222157 0.5714479
## 5 -0.60827008 -1.2077404
## 6 -1.62669539 -1.1499516
EVALUASI MODEL
conf_matrix <- confusionMatrix(
pred_class,
test_data$quality_ord
)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Medium High
## Low 82 38 1
## Medium 20 43 16
## High 2 11 14
##
## Overall Statistics
##
## Accuracy : 0.6123
## 95% CI : (0.5456, 0.6761)
## No Information Rate : 0.4581
## P-Value [Acc > NIR] : 2.19e-06
##
## Kappa : 0.3523
##
## Mcnemar's Test P-Value : 0.07699
##
## Statistics by Class:
##
## Class: Low Class: Medium Class: High
## Sensitivity 0.7885 0.4674 0.45161
## Specificity 0.6829 0.7333 0.93367
## Pos Pred Value 0.6777 0.5443 0.51852
## Neg Pred Value 0.7925 0.6689 0.91500
## Prevalence 0.4581 0.4053 0.13656
## Detection Rate 0.3612 0.1894 0.06167
## Detection Prevalence 0.5330 0.3480 0.11894
## Balanced Accuracy 0.7357 0.6004 0.69264
# Matriks klasifikasi
kable(as.data.frame(conf_matrix$table),
caption = "Matriks Klasifikasi LDA") %>%
kable_styling(full_width = FALSE)
| Prediction | Reference | Freq |
|---|---|---|
| Low | Low | 82 |
| Medium | Low | 20 |
| High | Low | 2 |
| Low | Medium | 38 |
| Medium | Medium | 43 |
| High | Medium | 11 |
| Low | High | 1 |
| Medium | High | 16 |
| High | High | 14 |
# Akurasi model
akurasi_final <- round(conf_matrix$overall["Accuracy"] * 100, 2)
cat("TINGKAT AKURASI MODEL LDA ORDINAL ADALAH", akurasi_final, "%")
## TINGKAT AKURASI MODEL LDA ORDINAL ADALAH 61.23 %
TABEL HASIL PREDIKSI
hasil_prediksi <- data.frame(
Aktual = test_data$quality_ord,
Prediksi = pred_class,
Prob_Low = pred_lda$posterior[, "Low"],
Prob_Medium = pred_lda$posterior[, "Medium"],
Prob_High = pred_lda$posterior[, "High"]
)
head(hasil_prediksi, 20)
## Aktual Prediksi Prob_Low Prob_Medium Prob_High
## 1 Low Low 0.7498436 0.2488625 0.001293881
## 2 Medium Low 0.5028424 0.4338747 0.063282938
## 3 Low Medium 0.1113364 0.6276220 0.261041616
## 4 High Medium 0.4325203 0.4818492 0.085630524
## 5 Medium Low 0.5175663 0.4724009 0.010032765
## 6 Low Low 0.7680478 0.2306181 0.001334101
## 7 Low Low 0.8146235 0.1790213 0.006355177
## 8 Low Low 0.8035610 0.1936708 0.002768178
## 9 Medium Low 0.6696601 0.3236804 0.006659506
## 10 High Medium 0.4024342 0.5428509 0.054714897
## 11 Low Low 0.4393912 0.3823168 0.178292020
## 12 Medium Medium 0.1888625 0.7016261 0.109511415
## 13 Low Low 0.8621843 0.1352404 0.002575302
## 14 Low Low 0.6011813 0.3715021 0.027316593
## 15 Low Low 0.8105400 0.1885610 0.000899027
## 16 Medium High 0.0116227 0.1865561 0.801821240
## 17 Low Low 0.5989230 0.3823572 0.018719824
## 18 Low Low 0.8735175 0.1250941 0.001388347
## 19 Low Low 0.7638400 0.2315798 0.004580199
## 20 Medium Low 0.6059195 0.3811105 0.012969994
kable(head(hasil_prediksi, 20),
caption = "Preview Hasil Prediksi Model LDA") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
| Aktual | Prediksi | Prob_Low | Prob_Medium | Prob_High |
|---|---|---|---|---|
| Low | Low | 0.7498436 | 0.2488625 | 0.0012939 |
| Medium | Low | 0.5028424 | 0.4338747 | 0.0632829 |
| Low | Medium | 0.1113364 | 0.6276220 | 0.2610416 |
| High | Medium | 0.4325203 | 0.4818492 | 0.0856305 |
| Medium | Low | 0.5175663 | 0.4724009 | 0.0100328 |
| Low | Low | 0.7680478 | 0.2306181 | 0.0013341 |
| Low | Low | 0.8146235 | 0.1790213 | 0.0063552 |
| Low | Low | 0.8035610 | 0.1936708 | 0.0027682 |
| Medium | Low | 0.6696601 | 0.3236804 | 0.0066595 |
| High | Medium | 0.4024342 | 0.5428509 | 0.0547149 |
| Low | Low | 0.4393912 | 0.3823168 | 0.1782920 |
| Medium | Medium | 0.1888625 | 0.7016261 | 0.1095114 |
| Low | Low | 0.8621843 | 0.1352404 | 0.0025753 |
| Low | Low | 0.6011813 | 0.3715021 | 0.0273166 |
| Low | Low | 0.8105400 | 0.1885610 | 0.0008990 |
| Medium | High | 0.0116227 | 0.1865561 | 0.8018212 |
| Low | Low | 0.5989230 | 0.3823572 | 0.0187198 |
| Low | Low | 0.8735175 | 0.1250941 | 0.0013883 |
| Low | Low | 0.7638400 | 0.2315798 | 0.0045802 |
| Medium | Low | 0.6059195 | 0.3811105 | 0.0129700 |
VISUALISASI HASIL LDA
plot_data <- data.frame(
LD1 = pred_lda$x[, 1],
LD2 = pred_lda$x[, 2],
Aktual = test_data$quality_ord,
Prediksi = pred_class
)
ggplot(plot_data, aes(x = LD1, y = LD2, color = Aktual, shape = Prediksi)) +
geom_point(alpha = 0.8, size = 2) +
theme_bw() +
labs(title = "Plot Hasil LDA Wine Quality",
x = "LD1",
y = "LD2",
color = "Kelas Aktual",
shape = "Kelas Prediksi")
KESIMPULAN
cat("1. Dataset Wine Quality cocok digunakan untuk LDA karena seluruh variabel independen berupa numerik.\n")
## 1. Dataset Wine Quality cocok digunakan untuk LDA karena seluruh variabel independen berupa numerik.
cat("2. Variabel quality diubah menjadi variabel ordinal: Low < Medium < High.\n")
## 2. Variabel quality diubah menjadi variabel ordinal: Low < Medium < High.
cat("3. Variabel quality asli tidak digunakan sebagai prediktor agar tidak terjadi kebocoran target.\n")
## 3. Variabel quality asli tidak digunakan sebagai prediktor agar tidak terjadi kebocoran target.
if (length(high_cor) == 0) {
cat("4. Tidak ditemukan korelasi antar prediktor yang melebihi 0.90, sehingga risiko singular matrix relatif kecil.\n")
} else {
cat("4. Variabel yang dihapus karena korelasi tinggi adalah:",
paste(high_cor, collapse = ", "), ".\n")
}
## 4. Tidak ditemukan korelasi antar prediktor yang melebihi 0.90, sehingga risiko singular matrix relatif kecil.
if (box_m_result$p.value > 0.05) {
cat("5. Asumsi kesamaan matriks kovarians terpenuhi karena p-value Box's M > 0.05.\n")
} else {
cat("5. Asumsi kesamaan matriks kovarians tidak terpenuhi sempurna karena p-value Box's M <= 0.05.\n")
cat(" Oleh karena itu, hasil LDA tetap dapat dibahas, tetapi interpretasi perlu dilakukan secara hati-hati.\n")
}
## 5. Asumsi kesamaan matriks kovarians tidak terpenuhi sempurna karena p-value Box's M <= 0.05.
## Oleh karena itu, hasil LDA tetap dapat dibahas, tetapi interpretasi perlu dilakukan secara hati-hati.
cat("6. Akurasi model LDA Ordinal pada data testing adalah", akurasi_final, "%.\n")
## 6. Akurasi model LDA Ordinal pada data testing adalah 61.23 %.