LOAD LIBRARY

packages <- c(
  "readr",
  "dplyr",
  "tidyr",
  "ggplot2",
  "MASS",
  "caret",
  "car",
  "biotools",
  "corrplot",
  "knitr",
  "kableExtra"
)
 
installed <- rownames(installed.packages())
for (pkg in packages) {
  if (!(pkg %in% installed)) {
    install.packages(pkg)
  }
}
 
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## Loading required package: lattice
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(biotools)
## ---
## biotools version 4.3
library(corrplot)
## corrplot 0.95 loaded
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

DESKRIPSI DATA

if (file.exists("WineQT.csv")) {
  data <- read_csv("WineQT.csv", show_col_types = FALSE)
} else {
  data <- read_csv(file.choose(), show_col_types = FALSE)
}
 
# Melihat data awal
head(data)
## # A tibble: 6 × 13
##   `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
##             <dbl>              <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.4               0.7           0                 1.9     0.076
## 2             7.8               0.88          0                 2.6     0.098
## 3             7.8               0.76          0.04              2.3     0.092
## 4            11.2               0.28          0.56              1.9     0.075
## 5             7.4               0.7           0                 1.9     0.076
## 6             7.4               0.66          0                 1.8     0.075
## # ℹ 8 more variables: `free sulfur dioxide` <dbl>,
## #   `total sulfur dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## #   alcohol <dbl>, quality <dbl>, Id <dbl>
str(data)
## spc_tbl_ [1,143 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ fixed acidity       : num [1:1143] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 6.7 ...
##  $ volatile acidity    : num [1:1143] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.58 ...
##  $ citric acid         : num [1:1143] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.08 ...
##  $ residual sugar      : num [1:1143] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 1.8 ...
##  $ chlorides           : num [1:1143] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.097 ...
##  $ free sulfur dioxide : num [1:1143] 11 25 15 17 11 13 15 15 9 15 ...
##  $ total sulfur dioxide: num [1:1143] 34 67 54 60 34 40 59 21 18 65 ...
##  $ density             : num [1:1143] 0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num [1:1143] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.28 ...
##  $ sulphates           : num [1:1143] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.54 ...
##  $ alcohol             : num [1:1143] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 9.2 ...
##  $ quality             : num [1:1143] 5 5 5 6 5 5 5 7 7 5 ...
##  $ Id                  : num [1:1143] 0 1 2 3 4 5 6 7 8 10 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `fixed acidity` = col_double(),
##   ..   `volatile acidity` = col_double(),
##   ..   `citric acid` = col_double(),
##   ..   `residual sugar` = col_double(),
##   ..   chlorides = col_double(),
##   ..   `free sulfur dioxide` = col_double(),
##   ..   `total sulfur dioxide` = col_double(),
##   ..   density = col_double(),
##   ..   pH = col_double(),
##   ..   sulphates = col_double(),
##   ..   alcohol = col_double(),
##   ..   quality = col_double(),
##   ..   Id = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(data)
##  fixed acidity    volatile acidity  citric acid     residual sugar  
##  Min.   : 4.600   Min.   :0.1200   Min.   :0.0000   Min.   : 0.900  
##  1st Qu.: 7.100   1st Qu.:0.3925   1st Qu.:0.0900   1st Qu.: 1.900  
##  Median : 7.900   Median :0.5200   Median :0.2500   Median : 2.200  
##  Mean   : 8.311   Mean   :0.5313   Mean   :0.2684   Mean   : 2.532  
##  3rd Qu.: 9.100   3rd Qu.:0.6400   3rd Qu.:0.4200   3rd Qu.: 2.600  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.0000   Max.   :15.500  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 21.00       1st Qu.:0.9956  
##  Median :0.07900   Median :13.00       Median : 37.00       Median :0.9967  
##  Mean   :0.08693   Mean   :15.62       Mean   : 45.91       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 61.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :68.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.205   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6577   Mean   :10.44   Mean   :5.657  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000  
##        Id      
##  Min.   :   0  
##  1st Qu.: 411  
##  Median : 794  
##  Mean   : 805  
##  3rd Qu.:1210  
##  Max.   :1597
# Mengubah nama kolom agar mudah dipanggil di R
# Contoh: fixed acidity menjadi fixed.acidity
names(data) <- make.names(names(data))
names(data)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"             
## [13] "Id"
# Jumlah baris dan kolom
dim(data)
## [1] 1143   13

IDENTIFIKASI DAN CLEANING DATA

# Pengecekan missing value
na_check <- data.frame(Nilai_Kosong = colSums(is.na(data)))
 
kable(na_check, caption = "Identifikasi Missing Value per Variabel") %>%
  kable_styling(bootstrap_options = "condensed", full_width = FALSE)
Identifikasi Missing Value per Variabel
Nilai_Kosong
fixed.acidity 0
volatile.acidity 0
citric.acid 0
residual.sugar 0
chlorides 0
free.sulfur.dioxide 0
total.sulfur.dioxide 0
density 0
pH 0
sulphates 0
alcohol 0
quality 0
Id 0
# Menghapus missing value jika ada
data <- na.omit(data)
 
# Menghapus kolom Id jika ada, karena Id bukan variabel prediktor
if ("Id" %in% names(data)) {
  data <- dplyr::select(data, -Id)
}
 
# Cek ulang missing value
colSums(is.na(data))
##        fixed.acidity     volatile.acidity          citric.acid 
##                    0                    0                    0 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##                    0                    0                    0 
## total.sulfur.dioxide              density                   pH 
##                    0                    0                    0 
##            sulphates              alcohol              quality 
##                    0                    0                    0

STATISTIKA DESKRIPTIF

summary(data)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.600   Min.   :0.1200   Min.   :0.0000   Min.   : 0.900  
##  1st Qu.: 7.100   1st Qu.:0.3925   1st Qu.:0.0900   1st Qu.: 1.900  
##  Median : 7.900   Median :0.5200   Median :0.2500   Median : 2.200  
##  Mean   : 8.311   Mean   :0.5313   Mean   :0.2684   Mean   : 2.532  
##  3rd Qu.: 9.100   3rd Qu.:0.6400   3rd Qu.:0.4200   3rd Qu.: 2.600  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.0000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 21.00       1st Qu.:0.9956  
##  Median :0.07900   Median :13.00       Median : 37.00       Median :0.9967  
##  Mean   :0.08693   Mean   :15.62       Mean   : 45.91       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 61.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :68.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.205   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6577   Mean   :10.44   Mean   :5.657  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
# Histogram variabel numerik utama
hist(data$quality,
     main = "Distribusi Quality",
     xlab = "Quality",
     col = "lightblue")

hist(data$alcohol,
     main = "Distribusi Alcohol",
     xlab = "Alcohol",
     col = "lightgreen")

hist(data$volatile.acidity,
     main = "Distribusi Volatile Acidity",
     xlab = "Volatile Acidity",
     col = "lightpink")

hist(data$sulphates,
     main = "Distribusi Sulphates",
     xlab = "Sulphates",
     col = "lightyellow")

# Visualisasi outlier untuk seluruh variabel numerik
data_numeric <- data %>%
  dplyr::select(where(is.numeric))
 
data_long <- data_numeric %>%
  pivot_longer(cols = everything(),
               names_to = "Variabel",
               values_to = "Nilai")
 
ggplot(data_long, aes(x = "", y = Nilai, fill = Variabel)) +
  geom_boxplot(outlier.color = "red", alpha = 0.7) +
  facet_wrap(~Variabel, scales = "free", ncol = 3) +
  theme_bw() +
  labs(title = "Distribusi Outlier pada Variabel Numerik",
       x = "",
       y = "Nilai") +
  theme(legend.position = "none")

PEMBENTUKAN VARIABEL ORDINAL

# Variabel quality asli memiliki urutan nilai, misalnya 3 < 4 < 5 < 6 < 7 < 8.
# Agar kelompok lebih seimbang dan mudah dianalisis dengan LDA, quality dibagi
# menjadi tiga kategori ordinal:
# Low    = quality <= 5
# Medium = quality == 6
# High   = quality >= 7
 
data$quality_ord <- cut(
  data$quality,
  breaks = c(-Inf, 5, 6, Inf),
  labels = c("Low", "Medium", "High"),
  ordered_result = TRUE
)
 
data$quality_ord <- factor(
  data$quality_ord,
  levels = c("Low", "Medium", "High"),
  ordered = TRUE
)
 
table(data$quality)
## 
##   3   4   5   6   7   8 
##   6  33 483 462 143  16
table(data$quality_ord)
## 
##    Low Medium   High 
##    522    462    159
prop.table(table(data$quality_ord))
## 
##       Low    Medium      High 
## 0.4566929 0.4041995 0.1391076
barplot(table(data$quality_ord),
        main = "Distribusi Kategori Quality Ordinal",
        xlab = "Kategori Quality",
        ylab = "Frekuensi",
        col = c("tomato", "gold", "seagreen"))

PEMILIHAN VARIABEL LDA

# LDA bekerja paling baik jika variabel independen numerik.
# Karena itu, variabel quality asli tidak digunakan sebagai prediktor.
 
variabel_x <- c(
  "fixed.acidity",
  "volatile.acidity",
  "citric.acid",
  "residual.sugar",
  "chlorides",
  "free.sulfur.dioxide",
  "total.sulfur.dioxide",
  "density",
  "pH",
  "sulphates",
  "alcohol"
)
 
# Menyesuaikan jika ada nama kolom yang berbeda
variabel_x <- variabel_x[variabel_x %in% names(data)]
 
data_lda <- data %>%
  dplyr::select(all_of(variabel_x), quality_ord)
 
str(data_lda)
## tibble [1,143 × 12] (S3: tbl_df/tbl/data.frame)
##  $ fixed.acidity       : num [1:1143] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 6.7 ...
##  $ volatile.acidity    : num [1:1143] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.58 ...
##  $ citric.acid         : num [1:1143] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.08 ...
##  $ residual.sugar      : num [1:1143] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 1.8 ...
##  $ chlorides           : num [1:1143] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.097 ...
##  $ free.sulfur.dioxide : num [1:1143] 11 25 15 17 11 13 15 15 9 15 ...
##  $ total.sulfur.dioxide: num [1:1143] 34 67 54 60 34 40 59 21 18 65 ...
##  $ density             : num [1:1143] 0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num [1:1143] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.28 ...
##  $ sulphates           : num [1:1143] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.54 ...
##  $ alcohol             : num [1:1143] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 9.2 ...
##  $ quality_ord         : Ord.factor w/ 3 levels "Low"<"Medium"<..: 1 1 1 2 1 1 1 3 3 1 ...
summary(data_lda)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.600   Min.   :0.1200   Min.   :0.0000   Min.   : 0.900  
##  1st Qu.: 7.100   1st Qu.:0.3925   1st Qu.:0.0900   1st Qu.: 1.900  
##  Median : 7.900   Median :0.5200   Median :0.2500   Median : 2.200  
##  Mean   : 8.311   Mean   :0.5313   Mean   :0.2684   Mean   : 2.532  
##  3rd Qu.: 9.100   3rd Qu.:0.6400   3rd Qu.:0.4200   3rd Qu.: 2.600  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.0000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 21.00       1st Qu.:0.9956  
##  Median :0.07900   Median :13.00       Median : 37.00       Median :0.9967  
##  Mean   :0.08693   Mean   :15.62       Mean   : 45.91       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 61.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :68.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol      quality_ord 
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Low   :522  
##  1st Qu.:3.205   1st Qu.:0.5500   1st Qu.: 9.50   Medium:462  
##  Median :3.310   Median :0.6200   Median :10.20   High  :159  
##  Mean   :3.311   Mean   :0.6577   Mean   :10.44               
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10               
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90

UJI KORELASI ANTAR VARIABEL INDEPENDEN

cor_mat <- cor(data_lda[, variabel_x])
 
round(cor_mat, 3)
##                      fixed.acidity volatile.acidity citric.acid residual.sugar
## fixed.acidity                1.000           -0.251       0.673          0.172
## volatile.acidity            -0.251            1.000      -0.544         -0.006
## citric.acid                  0.673           -0.544       1.000          0.176
## residual.sugar               0.172           -0.006       0.176          1.000
## chlorides                    0.108            0.056       0.245          0.071
## free.sulfur.dioxide         -0.165           -0.002      -0.058          0.165
## total.sulfur.dioxide        -0.111            0.078       0.037          0.191
## density                      0.682            0.017       0.375          0.380
## pH                          -0.685            0.221      -0.546         -0.117
## sulphates                    0.175           -0.276       0.331          0.017
## alcohol                     -0.075           -0.204       0.106          0.058
##                      chlorides free.sulfur.dioxide total.sulfur.dioxide density
## fixed.acidity            0.108              -0.165               -0.111   0.682
## volatile.acidity         0.056              -0.002                0.078   0.017
## citric.acid              0.245              -0.058                0.037   0.375
## residual.sugar           0.071               0.165                0.191   0.380
## chlorides                1.000               0.015                0.048   0.209
## free.sulfur.dioxide      0.015               1.000                0.661  -0.054
## total.sulfur.dioxide     0.048               0.661                1.000   0.050
## density                  0.209              -0.054                0.050   1.000
## pH                      -0.278               0.073               -0.059  -0.353
## sulphates                0.375               0.034                0.027   0.143
## alcohol                 -0.230              -0.047               -0.188  -0.495
##                          pH sulphates alcohol
## fixed.acidity        -0.685     0.175  -0.075
## volatile.acidity      0.221    -0.276  -0.204
## citric.acid          -0.546     0.331   0.106
## residual.sugar       -0.117     0.017   0.058
## chlorides            -0.278     0.375  -0.230
## free.sulfur.dioxide   0.073     0.034  -0.047
## total.sulfur.dioxide -0.059     0.027  -0.188
## density              -0.353     0.143  -0.495
## pH                    1.000    -0.185   0.225
## sulphates            -0.185     1.000   0.094
## alcohol               0.225     0.094   1.000
corrplot(cor_mat,
         method = "color",
         type = "upper",
         tl.cex = 0.7,
         addCoef.col = "black",
         number.cex = 0.6,
         col = colorRampPalette(c("#E41A1C", "white", "#377EB8"))(200))

# Mencari korelasi sangat tinggi.
# Jika korelasi mendekati 1, matriks dapat menjadi singular.
high_cor <- findCorrelation(cor_mat, cutoff = 0.90, names = TRUE)
high_cor
## character(0)
# Jika ada variabel dengan korelasi > 0.90, variabel tersebut dihapus.
if (length(high_cor) > 0) {
  data_lda <- data_lda %>%
    dplyr::select(-all_of(high_cor))
  variabel_x <- setdiff(variabel_x, high_cor)
}
 
variabel_x
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"

UJI MULTIKOLINEARITAS DENGAN VIF

vif_model <- lm(
  as.numeric(quality_ord) ~ .,
  data = data_lda
)
 
vif_values <- vif(vif_model)
vif_values
##        fixed.acidity     volatile.acidity          citric.acid 
##             7.780540             1.778704             3.222840 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##             1.743237             1.538470             1.906045 
## total.sulfur.dioxide              density                   pH 
##             2.103748             6.595115             3.393307 
##            sulphates              alcohol 
##             1.440741             3.184642
if (is.matrix(vif_values)) {
  vif_table <- as.data.frame(vif_values)
} else {
  vif_table <- data.frame(VIF = vif_values)
}
 
kable(vif_table,
      caption = "Hasil Pengujian Multikolinearitas (VIF)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Hasil Pengujian Multikolinearitas (VIF)
VIF
fixed.acidity 7.780540
volatile.acidity 1.778704
citric.acid 3.222841
residual.sugar 1.743237
chlorides 1.538470
free.sulfur.dioxide 1.906045
total.sulfur.dioxide 2.103747
density 6.595115
pH 3.393307
sulphates 1.440741
alcohol 3.184642

STANDARDISASI DATA

# Standardisasi dilakukan karena skala antar variabel berbeda.
 
data_scaled <- data_lda
data_scaled[, variabel_x] <- scale(data_scaled[, variabel_x])
 
summary(data_scaled)
##  fixed.acidity     volatile.acidity    citric.acid       residual.sugar    
##  Min.   :-2.1236   Min.   :-2.28988   Min.   :-1.36443   Min.   :-1.20373  
##  1st Qu.:-0.6930   1st Qu.:-0.77290   1st Qu.:-0.90685   1st Qu.:-0.46622  
##  Median :-0.2352   Median :-0.06312   Median :-0.09337   Median :-0.24496  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.4514   3rd Qu.: 0.60491   3rd Qu.: 0.77096   3rd Qu.: 0.05004  
##  Max.   : 4.3425   Max.   : 5.83779   Max.   : 3.71982   Max.   : 9.56389  
##    chlorides        free.sulfur.dioxide total.sulfur.dioxide    density        
##  Min.   :-1.58529   Min.   :-1.4258     Min.   :-1.2176      Min.   :-3.45983  
##  1st Qu.:-0.35823   1st Qu.:-0.8405     1st Qu.:-0.7600      1st Qu.:-0.60279  
##  Median :-0.16782   Median :-0.2552     Median :-0.2719      Median :-0.02619  
##  Mean   : 0.00000   Mean   : 0.0000     Mean   : 0.0000      Mean   : 0.00000  
##  3rd Qu.: 0.06489   3rd Qu.: 0.5253     3rd Qu.: 0.4602      3rd Qu.: 0.57899  
##  Max.   :11.08730   Max.   : 5.1104     Max.   : 7.4152      Max.   : 3.61524  
##        pH              sulphates          alcohol        quality_ord 
##  Min.   :-3.644836   Min.   :-1.9232   Min.   :-1.8870   Low   :522  
##  1st Qu.:-0.676702   1st Qu.:-0.6321   1st Qu.:-0.8706   Medium:462  
##  Median :-0.006478   Median :-0.2213   Median :-0.2237   High  :159  
##  Mean   : 0.000000   Mean   : 0.0000   Mean   : 0.0000               
##  3rd Qu.: 0.568000   3rd Qu.: 0.4243   3rd Qu.: 0.6079               
##  Max.   : 4.461681   Max.   : 7.8774   Max.   : 4.1193

SPLIT DATA TRAINING DAN TESTING

set.seed(123)
 
index_train <- createDataPartition(
  data_scaled$quality_ord,
  p = 0.80,
  list = FALSE
)
 
train_data <- data_scaled[index_train, ]
test_data <- data_scaled[-index_train, ]
 
table(train_data$quality_ord)
## 
##    Low Medium   High 
##    418    370    128
table(test_data$quality_ord)
## 
##    Low Medium   High 
##    104     92     31

UJI ASUMSI KESAMAAN MATRIKS KOVARIANS

# Hipotesis:
# H0: Matriks kovarians antar kelompok sama
# H1: Minimal ada satu matriks kovarians kelompok yang berbeda
#
# Keputusan:
# Jika p-value > 0.05, asumsi kesamaan kovarians terpenuhi.
# Jika p-value <= 0.05, asumsi tidak terpenuhi secara sempurna.
 
box_m_result <- boxM(
  train_data[, variabel_x],
  train_data$quality_ord
)
 
box_m_result
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  train_data[, variabel_x]
## Chi-Sq (approx.) = 499.99, df = 132, p-value < 2.2e-16

MODEL LDA ORDINAL

# quality_ord adalah label kategori ordinal:
# Low < Medium < High
 
model_lda <- lda(
  quality_ord ~ .,
  data = train_data
)
 
model_lda
## Call:
## lda(quality_ord ~ ., data = train_data)
## 
## Prior probabilities of groups:
##       Low    Medium      High 
## 0.4563319 0.4039301 0.1397380 
## 
## Group means:
##        fixed.acidity volatile.acidity citric.acid residual.sugar   chlorides
## Low     -0.110260133        0.3017243 -0.18045599    0.008487284  0.07719297
## Medium  -0.007131207       -0.1547489 -0.04169996   -0.083111822 -0.04248831
## High     0.371840376       -0.7233203  0.68515881    0.249395909 -0.23145890
##        free.sulfur.dioxide total.sulfur.dioxide     density           pH
## Low             0.09119116            0.2460172  0.15543871 -0.004034735
## Medium         -0.04013776           -0.1998813 -0.08395937  0.111695308
## High           -0.16979542           -0.2921946 -0.32236146 -0.242352957
##         sulphates    alcohol
## Low    -0.2843297 -0.4846884
## Medium  0.1343133  0.1959699
## High    0.5402496  1.0399846
## 
## Coefficients of linear discriminants:
##                               LD1        LD2
## fixed.acidity         0.320292666 -0.1491082
## volatile.acidity     -0.305528521  0.4966217
## citric.acid           0.007342081  1.0164622
## residual.sugar        0.157525665  0.4700904
## chlorides            -0.235641057 -0.3368592
## free.sulfur.dioxide   0.036132054 -0.3217828
## total.sulfur.dioxide -0.281068741  0.4419615
## density              -0.296585443 -0.6168292
## pH                   -0.043149649 -0.2910780
## sulphates             0.506586361 -0.1095078
## alcohol               0.711169055 -0.2805789
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9315 0.0685

PREDIKSI MODEL

pred_lda <- predict(model_lda, newdata = test_data)
 
# Kelas prediksi
pred_class <- pred_lda$class
head(pred_class)
## [1] Low    Low    Medium Medium Low    Low   
## Levels: Low Medium High
# Probabilitas posterior setiap kelas
head(pred_lda$posterior)
##         Low    Medium        High
## 1 0.7498436 0.2488625 0.001293881
## 2 0.5028424 0.4338747 0.063282938
## 3 0.1113364 0.6276220 0.261041616
## 4 0.4325203 0.4818492 0.085630524
## 5 0.5175663 0.4724009 0.010032765
## 6 0.7680478 0.2306181 0.001334101
# Nilai diskriminan
head(pred_lda$x)
##           LD1        LD2
## 1 -1.61049044 -1.3669911
## 2  0.01234132  0.6806346
## 3  1.34453123 -0.5028942
## 4  0.21222157  0.5714479
## 5 -0.60827008 -1.2077404
## 6 -1.62669539 -1.1499516

EVALUASI MODEL

conf_matrix <- confusionMatrix(
  pred_class,
  test_data$quality_ord
)
 
conf_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Medium High
##     Low     82     38    1
##     Medium  20     43   16
##     High     2     11   14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6123          
##                  95% CI : (0.5456, 0.6761)
##     No Information Rate : 0.4581          
##     P-Value [Acc > NIR] : 2.19e-06        
##                                           
##                   Kappa : 0.3523          
##                                           
##  Mcnemar's Test P-Value : 0.07699         
## 
## Statistics by Class:
## 
##                      Class: Low Class: Medium Class: High
## Sensitivity              0.7885        0.4674     0.45161
## Specificity              0.6829        0.7333     0.93367
## Pos Pred Value           0.6777        0.5443     0.51852
## Neg Pred Value           0.7925        0.6689     0.91500
## Prevalence               0.4581        0.4053     0.13656
## Detection Rate           0.3612        0.1894     0.06167
## Detection Prevalence     0.5330        0.3480     0.11894
## Balanced Accuracy        0.7357        0.6004     0.69264
# Matriks klasifikasi
kable(as.data.frame(conf_matrix$table),
      caption = "Matriks Klasifikasi LDA") %>%
  kable_styling(full_width = FALSE)
Matriks Klasifikasi LDA
Prediction Reference Freq
Low Low 82
Medium Low 20
High Low 2
Low Medium 38
Medium Medium 43
High Medium 11
Low High 1
Medium High 16
High High 14
# Akurasi model
akurasi_final <- round(conf_matrix$overall["Accuracy"] * 100, 2)
 
cat("TINGKAT AKURASI MODEL LDA ORDINAL ADALAH", akurasi_final, "%")
## TINGKAT AKURASI MODEL LDA ORDINAL ADALAH 61.23 %

TABEL HASIL PREDIKSI

hasil_prediksi <- data.frame(
  Aktual = test_data$quality_ord,
  Prediksi = pred_class,
  Prob_Low = pred_lda$posterior[, "Low"],
  Prob_Medium = pred_lda$posterior[, "Medium"],
  Prob_High = pred_lda$posterior[, "High"]
)
 
head(hasil_prediksi, 20)
##    Aktual Prediksi  Prob_Low Prob_Medium   Prob_High
## 1     Low      Low 0.7498436   0.2488625 0.001293881
## 2  Medium      Low 0.5028424   0.4338747 0.063282938
## 3     Low   Medium 0.1113364   0.6276220 0.261041616
## 4    High   Medium 0.4325203   0.4818492 0.085630524
## 5  Medium      Low 0.5175663   0.4724009 0.010032765
## 6     Low      Low 0.7680478   0.2306181 0.001334101
## 7     Low      Low 0.8146235   0.1790213 0.006355177
## 8     Low      Low 0.8035610   0.1936708 0.002768178
## 9  Medium      Low 0.6696601   0.3236804 0.006659506
## 10   High   Medium 0.4024342   0.5428509 0.054714897
## 11    Low      Low 0.4393912   0.3823168 0.178292020
## 12 Medium   Medium 0.1888625   0.7016261 0.109511415
## 13    Low      Low 0.8621843   0.1352404 0.002575302
## 14    Low      Low 0.6011813   0.3715021 0.027316593
## 15    Low      Low 0.8105400   0.1885610 0.000899027
## 16 Medium     High 0.0116227   0.1865561 0.801821240
## 17    Low      Low 0.5989230   0.3823572 0.018719824
## 18    Low      Low 0.8735175   0.1250941 0.001388347
## 19    Low      Low 0.7638400   0.2315798 0.004580199
## 20 Medium      Low 0.6059195   0.3811105 0.012969994
kable(head(hasil_prediksi, 20),
      caption = "Preview Hasil Prediksi Model LDA") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Preview Hasil Prediksi Model LDA
Aktual Prediksi Prob_Low Prob_Medium Prob_High
Low Low 0.7498436 0.2488625 0.0012939
Medium Low 0.5028424 0.4338747 0.0632829
Low Medium 0.1113364 0.6276220 0.2610416
High Medium 0.4325203 0.4818492 0.0856305
Medium Low 0.5175663 0.4724009 0.0100328
Low Low 0.7680478 0.2306181 0.0013341
Low Low 0.8146235 0.1790213 0.0063552
Low Low 0.8035610 0.1936708 0.0027682
Medium Low 0.6696601 0.3236804 0.0066595
High Medium 0.4024342 0.5428509 0.0547149
Low Low 0.4393912 0.3823168 0.1782920
Medium Medium 0.1888625 0.7016261 0.1095114
Low Low 0.8621843 0.1352404 0.0025753
Low Low 0.6011813 0.3715021 0.0273166
Low Low 0.8105400 0.1885610 0.0008990
Medium High 0.0116227 0.1865561 0.8018212
Low Low 0.5989230 0.3823572 0.0187198
Low Low 0.8735175 0.1250941 0.0013883
Low Low 0.7638400 0.2315798 0.0045802
Medium Low 0.6059195 0.3811105 0.0129700

VISUALISASI HASIL LDA

plot_data <- data.frame(
  LD1 = pred_lda$x[, 1],
  LD2 = pred_lda$x[, 2],
  Aktual = test_data$quality_ord,
  Prediksi = pred_class
)
 
ggplot(plot_data, aes(x = LD1, y = LD2, color = Aktual, shape = Prediksi)) +
  geom_point(alpha = 0.8, size = 2) +
  theme_bw() +
  labs(title = "Plot Hasil LDA Wine Quality",
       x = "LD1",
       y = "LD2",
       color = "Kelas Aktual",
       shape = "Kelas Prediksi")

KESIMPULAN

cat("1. Dataset Wine Quality cocok digunakan untuk LDA karena seluruh variabel independen berupa numerik.\n")
## 1. Dataset Wine Quality cocok digunakan untuk LDA karena seluruh variabel independen berupa numerik.
cat("2. Variabel quality diubah menjadi variabel ordinal: Low < Medium < High.\n")
## 2. Variabel quality diubah menjadi variabel ordinal: Low < Medium < High.
cat("3. Variabel quality asli tidak digunakan sebagai prediktor agar tidak terjadi kebocoran target.\n")
## 3. Variabel quality asli tidak digunakan sebagai prediktor agar tidak terjadi kebocoran target.
if (length(high_cor) == 0) {
  cat("4. Tidak ditemukan korelasi antar prediktor yang melebihi 0.90, sehingga risiko singular matrix relatif kecil.\n")
} else {
  cat("4. Variabel yang dihapus karena korelasi tinggi adalah:",
      paste(high_cor, collapse = ", "), ".\n")
}
## 4. Tidak ditemukan korelasi antar prediktor yang melebihi 0.90, sehingga risiko singular matrix relatif kecil.
if (box_m_result$p.value > 0.05) {
  cat("5. Asumsi kesamaan matriks kovarians terpenuhi karena p-value Box's M > 0.05.\n")
} else {
  cat("5. Asumsi kesamaan matriks kovarians tidak terpenuhi sempurna karena p-value Box's M <= 0.05.\n")
  cat("   Oleh karena itu, hasil LDA tetap dapat dibahas, tetapi interpretasi perlu dilakukan secara hati-hati.\n")
}
## 5. Asumsi kesamaan matriks kovarians tidak terpenuhi sempurna karena p-value Box's M <= 0.05.
##    Oleh karena itu, hasil LDA tetap dapat dibahas, tetapi interpretasi perlu dilakukan secara hati-hati.
cat("6. Akurasi model LDA Ordinal pada data testing adalah", akurasi_final, "%.\n")
## 6. Akurasi model LDA Ordinal pada data testing adalah 61.23 %.