Load Library

library(MASS)

## Warning: package 'MASS' was built under R version 4.4.3

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.4.3

## Loading required package: lattice

library(car)

## Warning: package 'car' was built under R version 4.4.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.4.3

library(DescTools)

## Warning: package 'DescTools' was built under R version 4.4.3

## 
## Attaching package: 'DescTools'

## The following object is masked from 'package:car':
## 
##     Recode

## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE

library(brant)

## Warning: package 'brant' was built under R version 4.4.3

library(ggplot2)
library(corrplot)

## Warning: package 'corrplot' was built under R version 4.4.3

## corrplot 0.95 loaded

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load Data

Dataset yang digunakan adalah Wine Quality Dataset yang berisi karakteristik kimia wine dan kualitasnya.

# Load data (ubah path sesuai file kamu)
df <- read.csv("C:/Users/ella/Downloads/WineQT.csv")

# Lihat data
head(df)

##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality Id
## 1       5  0
## 2       5  1
## 3       5  2
## 4       6  3
## 5       5  4
## 6       5  5

str(df)

## 'data.frame':    1143 obs. of  13 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 6.7 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.58 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.08 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 1.8 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.097 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 15 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 65 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.28 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.54 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 9.2 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
##  $ Id                  : int  0 1 2 3 4 5 6 7 8 10 ...

summary(df)

##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.600   Min.   :0.1200   Min.   :0.0000   Min.   : 0.900  
##  1st Qu.: 7.100   1st Qu.:0.3925   1st Qu.:0.0900   1st Qu.: 1.900  
##  Median : 7.900   Median :0.5200   Median :0.2500   Median : 2.200  
##  Mean   : 8.311   Mean   :0.5313   Mean   :0.2684   Mean   : 2.532  
##  3rd Qu.: 9.100   3rd Qu.:0.6400   3rd Qu.:0.4200   3rd Qu.: 2.600  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.0000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 21.00       1st Qu.:0.9956  
##  Median :0.07900   Median :13.00       Median : 37.00       Median :0.9967  
##  Mean   :0.08693   Mean   :15.62       Mean   : 45.91       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 61.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :68.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.205   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6577   Mean   :10.44   Mean   :5.657  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000  
##        Id      
##  Min.   :   0  
##  1st Qu.: 411  
##  Median : 794  
##  Mean   : 805  
##  3rd Qu.:1210  
##  Max.   :1597

Data Preparation

Tahap ini bertujuan untuk membersihkan data dan menyiapkan variabel target untuk masing-masing metode.

# Menghapus variabel yang tidak digunakan
df <- df[, !(names(df) %in% c("Id"))]

# Target untuk Ordinal Logistic Regression
df$quality_ord <- factor(df$quality,
                         levels = sort(unique(df$quality)),
                         ordered = TRUE)

# Target untuk LDA (kategori)
df$quality_cat <- cut(df$quality,
                      breaks = c(-Inf, 5, 6, Inf),
                      labels = c("Low", "Medium", "High"))

Visualisasi Data Exploratory Data Analysis (EDA)

Distribusi Frekuensi Kualitas Wine

ggplot(df, aes(x = factor(quality))) +
  geom_bar(fill = "skyblue") +
  labs(
    title = "Distribusi Kualitas Wine",
    x = "Quality",
    y = "Frekuensi"
  ) +
  theme_minimal()

Boxplot Alcohol vs Quality

ggplot(df, aes(x = factor(quality), y = alcohol)) +
  geom_boxplot(fill = "lightgreen") +
  labs(
    title = "Boxplot Alcohol terhadap Quality Wine",
    x = "Quality",
    y = "Alcohol"
  ) +
  theme_minimal()

Heatmap Korelasi

num_data <- df %>%
  select(where(is.numeric)) %>%
  select(-quality)

cor_matrix <- cor(num_data)

corrplot(cor_matrix,
         method = "color",
         type = "upper",
         tl.cex = 0.8,
         number.cex = 0.7)

Uji Multikolinearitas (VIF)

Digunakan untuk memastikan tidak terjadi korelasi tinggi antar variabel independen.

model_vif <- lm(quality ~ . -quality_ord -quality_cat, data = df)
vif(model_vif)

##        fixed.acidity     volatile.acidity          citric.acid 
##             7.780540             1.778704             3.222840 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##             1.743237             1.538470             1.906045 
## total.sulfur.dioxide              density                   pH 
##             2.103748             6.595115             3.393307 
##            sulphates              alcohol 
##             1.440741             3.184642

Split Data

Data dibagi menjadi data training (70%) dan testing (30%).

set.seed(123)

trainIndex <- createDataPartition(df$quality_cat, p = 0.7, list = FALSE)

train <- df[trainIndex, ]
test  <- df[-trainIndex, ]

Regresi Logistik Ordinal (OLR)

Model OLR

model_olr <- polr(quality_ord ~ . -quality -quality_cat,
                  data = train,
                  Hess = TRUE)

summary(model_olr)

## Call:
## polr(formula = quality_ord ~ . - quality - quality_cat, data = train, 
##     Hess = TRUE)
## 
## Coefficients:
##                          Value Std. Error  t value
## fixed.acidity          0.13687   0.073683   1.8576
## volatile.acidity      -3.04096   0.566333  -5.3696
## citric.acid           -0.62307   0.674707  -0.9235
## residual.sugar         0.08627   0.053142   1.6233
## chlorides             -4.72120   1.971655  -2.3945
## free.sulfur.dioxide    0.00865   0.009843   0.8788
## total.sulfur.dioxide  -0.01076   0.003425  -3.1408
## density              -75.21442   1.368309 -54.9689
## pH                    -0.57262   0.695126  -0.8238
## sulphates              3.35607   0.501163   6.6966
## alcohol                0.87640   0.085513  10.2487
## 
## Intercepts:
##     Value    Std. Error t value 
## 3|4 -72.6651   1.4020   -51.8285
## 4|5 -71.0353   1.4001   -50.7352
## 5|6 -67.0415   1.4081   -47.6116
## 6|7 -64.1589   1.4289   -44.9021
## 7|8 -61.4770   1.4589   -42.1399
## 
## Residual Deviance: 1517.518 
## AIC: 1549.518

Uji Signifikansi (Wald Test)

ctable <- coef(summary(model_olr))

p_values <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2

cbind(ctable, "p value" = p_values)

##                              Value  Std. Error     t value      p value
## fixed.acidity          0.136871040 0.073682596   1.8575762 6.322921e-02
## volatile.acidity      -3.040956341 0.566332627  -5.3695588 7.892950e-08
## citric.acid           -0.623065017 0.674707281  -0.9234598 3.557677e-01
## residual.sugar         0.086266396 0.053141607   1.6233306 1.045188e-01
## chlorides             -4.721197698 1.971654620  -2.3945359 1.664142e-02
## free.sulfur.dioxide    0.008650461 0.009843058   0.8788388 3.794887e-01
## total.sulfur.dioxide  -0.010757281 0.003425062  -3.1407550 1.685129e-03
## density              -75.214419110 1.368308644 -54.9688986 0.000000e+00
## pH                    -0.572618190 0.695126497  -0.8237611 4.100753e-01
## sulphates              3.356069970 0.501163162   6.6965616 2.133806e-11
## alcohol                0.876402469 0.085513395  10.2487156 1.199265e-24
## 3|4                  -72.665137162 1.402031024 -51.8284802 0.000000e+00
## 4|5                  -71.035339971 1.400120265 -50.7351702 0.000000e+00
## 5|6                  -67.041486362 1.408090193 -47.6116421 0.000000e+00
## 6|7                  -64.158898842 1.428862237 -44.9020887 0.000000e+00
## 7|8                  -61.476958688 1.458879333 -42.1398517 0.000000e+00

Odds Ration

exp(coef(model_olr))

##        fixed.acidity     volatile.acidity          citric.acid 
##         1.146680e+00         4.778916e-02         5.362982e-01 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##         1.090097e+00         8.904507e-03         1.008688e+00 
## total.sulfur.dioxide              density                   pH 
##         9.893004e-01         2.161687e-33         5.640467e-01 
##            sulphates              alcohol 
##         2.867627e+01         2.402242e+00

Uji Asumsi Proportional Odds

brant(model_olr)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## ---------------------------------------------------- 
## Test for     X2  df  probability 
## ---------------------------------------------------- 
## Omnibus          83.38   44  0
## fixed.acidity        19.66   4   0
## volatile.acidity 10.84   4   0.03
## citric.acid      17.24   4   0
## residual.sugar       6.27    4   0.18
## chlorides        8.68    4   0.07
## free.sulfur.dioxide  3.1 4   0.54
## total.sulfur.dioxide 4.1 4   0.39
## density          8.54    4   0.07
## pH           20.2    4   0
## sulphates        1.22    4   0.87
## alcohol          2.31    4   0.68
## ---------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

## Warning in brant(model_olr): 12 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.

Evaluasi Model OLR

pred_olr <- predict(model_olr, test)

confusionMatrix(pred_olr, test$quality_ord)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8
##          3   0   0   0   0   0   0
##          4   0   0   0   0   0   0
##          5   1   9 113  50   2   0
##          6   0   5  28  80  23   3
##          7   0   0   0   8  19   0
##          8   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6217          
##                  95% CI : (0.5679, 0.6734)
##     No Information Rate : 0.4135          
##     P-Value [Acc > NIR] : 8.044e-15       
##                                           
##                   Kappa : 0.3825          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.000000  0.00000   0.8014   0.5797  0.43182 0.000000
## Specificity          1.000000  1.00000   0.6900   0.7094  0.97306 1.000000
## Pos Pred Value            NaN      NaN   0.6457   0.5755  0.70370      NaN
## Neg Pred Value       0.997067  0.95894   0.8313   0.7129  0.92038 0.991202
## Prevalence           0.002933  0.04106   0.4135   0.4047  0.12903 0.008798
## Detection Rate       0.000000  0.00000   0.3314   0.2346  0.05572 0.000000
## Detection Prevalence 0.000000  0.00000   0.5132   0.4076  0.07918 0.000000
## Balanced Accuracy    0.500000  0.50000   0.7457   0.6445  0.70244 0.500000

Analisis Diskriminan (LDA)

Standarisasi Data

train_scaled <- train
test_scaled  <- test

cols <- names(train)[!names(train) %in% c("quality","quality_ord","quality_cat")]

train_scaled[, cols] <- scale(train[, cols])
test_scaled[, cols]  <- scale(test[, cols])

Model LDA

model_lda <- lda(quality_cat ~ . -quality -quality_ord, data = train_scaled)

model_lda

## Call:
## lda(quality_cat ~ . - quality - quality_ord, data = train_scaled)
## 
## Prior probabilities of groups:
##       Low    Medium      High 
## 0.4563591 0.4039900 0.1396509 
## 
## Group means:
##        fixed.acidity volatile.acidity citric.acid residual.sugar   chlorides
## Low      -0.10343644        0.3257195 -0.16227737     0.01140763  0.09468940
## Medium    0.01184555       -0.1125907 -0.03761502    -0.05171899 -0.01382942
## High      0.30374803       -0.7386962  0.63911414     0.11233716 -0.26942491
##        free.sulfur.dioxide total.sulfur.dioxide     density          pH
## Low             0.09054020            0.2368601  0.15958686 -0.01321902
## Medium         -0.02901657           -0.1519207 -0.05976594  0.07613099
## High           -0.21193163           -0.3345403 -0.34861274 -0.17703821
##         sulphates    alcohol
## Low    -0.2897043 -0.4752330
## Medium  0.1616162  0.2002472
## High    0.4791795  0.9737073
## 
## Coefficients of linear discriminants:
##                               LD1        LD2
## fixed.acidity         0.281258868 -0.3376328
## volatile.acidity     -0.358127165  0.3668065
## citric.acid          -0.014783857  1.2425645
## residual.sugar        0.117353899  0.3720766
## chlorides            -0.205723704 -0.3867054
## free.sulfur.dioxide   0.060419283 -0.2986471
## total.sulfur.dioxide -0.310539452  0.2936798
## density              -0.236132419 -0.5849021
## pH                    0.002834069 -0.2108362
## sulphates             0.483752209 -0.2395399
## alcohol               0.711858238 -0.3497331
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9371 0.0629

Evaluasi Model LDA

pred_lda <- predict(model_lda, test_scaled)

confusionMatrix(pred_lda$class, test_scaled$quality_cat)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Medium High
##     Low    119     53    4
##     Medium  34     68   17
##     High     3     17   26
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6246          
##                  95% CI : (0.5709, 0.6762)
##     No Information Rate : 0.4575          
##     P-Value [Acc > NIR] : 4.116e-10       
##                                           
##                   Kappa : 0.3786          
##                                           
##  Mcnemar's Test P-Value : 0.2316          
## 
## Statistics by Class:
## 
##                      Class: Low Class: Medium Class: High
## Sensitivity              0.7628        0.4928     0.55319
## Specificity              0.6919        0.7488     0.93197
## Pos Pred Value           0.6761        0.5714     0.56522
## Neg Pred Value           0.7758        0.6847     0.92881
## Prevalence               0.4575        0.4047     0.13783
## Detection Rate           0.3490        0.1994     0.07625
## Detection Prevalence     0.5161        0.3490     0.13490
## Balanced Accuracy        0.7274        0.6208     0.74258

lda_values <- predict(model_lda)$x

lda_df <- data.frame(
  LD1 = lda_values[,1],
  LD2 = lda_values[,2],
  Quality = train_scaled$quality_cat
)

ggplot(lda_df, aes(x = LD1,
                   y = LD2,
                   color = Quality)) +
  geom_point(size = 2, alpha = 0.7) +
  labs(title = "Visualisasi Hasil LDA")

Perbandingan Model

acc_olr <- confusionMatrix(pred_olr, test$quality_ord)$overall['Accuracy']
acc_lda <- confusionMatrix(pred_lda$class, test_scaled$quality_cat)$overall['Accuracy']

acc_olr

##  Accuracy 
## 0.6217009

acc_lda

##  Accuracy 
## 0.6246334

Analisis Kualitas Wine Menggunakan Metode Analisis Diskriminan dan Regresi Logistik Ordinal

Siti Noerhalizah (24031554073)

Stella Dagatha (24031554080)

Na’illah Shania Rizqillah (24031554225)

Kelas 2024A