library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.3
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
##
## Recode
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
library(brant)
## Warning: package 'brant' was built under R version 4.4.3
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Dataset yang digunakan adalah Wine Quality Dataset yang berisi karakteristik kimia wine dan kualitasnya.
# Load data (ubah path sesuai file kamu)
df <- read.csv("C:/Users/ella/Downloads/WineQT.csv")
# Lihat data
head(df)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality Id
## 1 5 0
## 2 5 1
## 3 5 2
## 4 6 3
## 5 5 4
## 6 5 5
str(df)
## 'data.frame': 1143 obs. of 13 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 6.7 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.58 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.08 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 1.8 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.097 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 15 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 65 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.28 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.54 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 9.2 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
## $ Id : int 0 1 2 3 4 5 6 7 8 10 ...
summary(df)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.600 Min. :0.1200 Min. :0.0000 Min. : 0.900
## 1st Qu.: 7.100 1st Qu.:0.3925 1st Qu.:0.0900 1st Qu.: 1.900
## Median : 7.900 Median :0.5200 Median :0.2500 Median : 2.200
## Mean : 8.311 Mean :0.5313 Mean :0.2684 Mean : 2.532
## 3rd Qu.: 9.100 3rd Qu.:0.6400 3rd Qu.:0.4200 3rd Qu.: 2.600
## Max. :15.900 Max. :1.5800 Max. :1.0000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 21.00 1st Qu.:0.9956
## Median :0.07900 Median :13.00 Median : 37.00 Median :0.9967
## Mean :0.08693 Mean :15.62 Mean : 45.91 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 61.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :68.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.205 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6577 Mean :10.44 Mean :5.657
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
## Id
## Min. : 0
## 1st Qu.: 411
## Median : 794
## Mean : 805
## 3rd Qu.:1210
## Max. :1597
Tahap ini bertujuan untuk membersihkan data dan menyiapkan variabel target untuk masing-masing metode.
# Menghapus variabel yang tidak digunakan
df <- df[, !(names(df) %in% c("Id"))]
# Target untuk Ordinal Logistic Regression
df$quality_ord <- factor(df$quality,
levels = sort(unique(df$quality)),
ordered = TRUE)
# Target untuk LDA (kategori)
df$quality_cat <- cut(df$quality,
breaks = c(-Inf, 5, 6, Inf),
labels = c("Low", "Medium", "High"))
ggplot(df, aes(x = factor(quality))) +
geom_bar(fill = "skyblue") +
labs(
title = "Distribusi Kualitas Wine",
x = "Quality",
y = "Frekuensi"
) +
theme_minimal()
ggplot(df, aes(x = factor(quality), y = alcohol)) +
geom_boxplot(fill = "lightgreen") +
labs(
title = "Boxplot Alcohol terhadap Quality Wine",
x = "Quality",
y = "Alcohol"
) +
theme_minimal()
num_data <- df %>%
select(where(is.numeric)) %>%
select(-quality)
cor_matrix <- cor(num_data)
corrplot(cor_matrix,
method = "color",
type = "upper",
tl.cex = 0.8,
number.cex = 0.7)
Digunakan untuk memastikan tidak terjadi korelasi tinggi antar variabel independen.
model_vif <- lm(quality ~ . -quality_ord -quality_cat, data = df)
vif(model_vif)
## fixed.acidity volatile.acidity citric.acid
## 7.780540 1.778704 3.222840
## residual.sugar chlorides free.sulfur.dioxide
## 1.743237 1.538470 1.906045
## total.sulfur.dioxide density pH
## 2.103748 6.595115 3.393307
## sulphates alcohol
## 1.440741 3.184642
Data dibagi menjadi data training (70%) dan testing (30%).
set.seed(123)
trainIndex <- createDataPartition(df$quality_cat, p = 0.7, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]
model_olr <- polr(quality_ord ~ . -quality -quality_cat,
data = train,
Hess = TRUE)
summary(model_olr)
## Call:
## polr(formula = quality_ord ~ . - quality - quality_cat, data = train,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## fixed.acidity 0.13687 0.073683 1.8576
## volatile.acidity -3.04096 0.566333 -5.3696
## citric.acid -0.62307 0.674707 -0.9235
## residual.sugar 0.08627 0.053142 1.6233
## chlorides -4.72120 1.971655 -2.3945
## free.sulfur.dioxide 0.00865 0.009843 0.8788
## total.sulfur.dioxide -0.01076 0.003425 -3.1408
## density -75.21442 1.368309 -54.9689
## pH -0.57262 0.695126 -0.8238
## sulphates 3.35607 0.501163 6.6966
## alcohol 0.87640 0.085513 10.2487
##
## Intercepts:
## Value Std. Error t value
## 3|4 -72.6651 1.4020 -51.8285
## 4|5 -71.0353 1.4001 -50.7352
## 5|6 -67.0415 1.4081 -47.6116
## 6|7 -64.1589 1.4289 -44.9021
## 7|8 -61.4770 1.4589 -42.1399
##
## Residual Deviance: 1517.518
## AIC: 1549.518
ctable <- coef(summary(model_olr))
p_values <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
cbind(ctable, "p value" = p_values)
## Value Std. Error t value p value
## fixed.acidity 0.136871040 0.073682596 1.8575762 6.322921e-02
## volatile.acidity -3.040956341 0.566332627 -5.3695588 7.892950e-08
## citric.acid -0.623065017 0.674707281 -0.9234598 3.557677e-01
## residual.sugar 0.086266396 0.053141607 1.6233306 1.045188e-01
## chlorides -4.721197698 1.971654620 -2.3945359 1.664142e-02
## free.sulfur.dioxide 0.008650461 0.009843058 0.8788388 3.794887e-01
## total.sulfur.dioxide -0.010757281 0.003425062 -3.1407550 1.685129e-03
## density -75.214419110 1.368308644 -54.9688986 0.000000e+00
## pH -0.572618190 0.695126497 -0.8237611 4.100753e-01
## sulphates 3.356069970 0.501163162 6.6965616 2.133806e-11
## alcohol 0.876402469 0.085513395 10.2487156 1.199265e-24
## 3|4 -72.665137162 1.402031024 -51.8284802 0.000000e+00
## 4|5 -71.035339971 1.400120265 -50.7351702 0.000000e+00
## 5|6 -67.041486362 1.408090193 -47.6116421 0.000000e+00
## 6|7 -64.158898842 1.428862237 -44.9020887 0.000000e+00
## 7|8 -61.476958688 1.458879333 -42.1398517 0.000000e+00
exp(coef(model_olr))
## fixed.acidity volatile.acidity citric.acid
## 1.146680e+00 4.778916e-02 5.362982e-01
## residual.sugar chlorides free.sulfur.dioxide
## 1.090097e+00 8.904507e-03 1.008688e+00
## total.sulfur.dioxide density pH
## 9.893004e-01 2.161687e-33 5.640467e-01
## sulphates alcohol
## 2.867627e+01 2.402242e+00
brant(model_olr)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## ----------------------------------------------------
## Test for X2 df probability
## ----------------------------------------------------
## Omnibus 83.38 44 0
## fixed.acidity 19.66 4 0
## volatile.acidity 10.84 4 0.03
## citric.acid 17.24 4 0
## residual.sugar 6.27 4 0.18
## chlorides 8.68 4 0.07
## free.sulfur.dioxide 3.1 4 0.54
## total.sulfur.dioxide 4.1 4 0.39
## density 8.54 4 0.07
## pH 20.2 4 0
## sulphates 1.22 4 0.87
## alcohol 2.31 4 0.68
## ----------------------------------------------------
##
## H0: Parallel Regression Assumption holds
## Warning in brant(model_olr): 12 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.
pred_olr <- predict(model_olr, test)
confusionMatrix(pred_olr, test$quality_ord)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 1 9 113 50 2 0
## 6 0 5 28 80 23 3
## 7 0 0 0 8 19 0
## 8 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6217
## 95% CI : (0.5679, 0.6734)
## No Information Rate : 0.4135
## P-Value [Acc > NIR] : 8.044e-15
##
## Kappa : 0.3825
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.000000 0.00000 0.8014 0.5797 0.43182 0.000000
## Specificity 1.000000 1.00000 0.6900 0.7094 0.97306 1.000000
## Pos Pred Value NaN NaN 0.6457 0.5755 0.70370 NaN
## Neg Pred Value 0.997067 0.95894 0.8313 0.7129 0.92038 0.991202
## Prevalence 0.002933 0.04106 0.4135 0.4047 0.12903 0.008798
## Detection Rate 0.000000 0.00000 0.3314 0.2346 0.05572 0.000000
## Detection Prevalence 0.000000 0.00000 0.5132 0.4076 0.07918 0.000000
## Balanced Accuracy 0.500000 0.50000 0.7457 0.6445 0.70244 0.500000
train_scaled <- train
test_scaled <- test
cols <- names(train)[!names(train) %in% c("quality","quality_ord","quality_cat")]
train_scaled[, cols] <- scale(train[, cols])
test_scaled[, cols] <- scale(test[, cols])
model_lda <- lda(quality_cat ~ . -quality -quality_ord, data = train_scaled)
model_lda
## Call:
## lda(quality_cat ~ . - quality - quality_ord, data = train_scaled)
##
## Prior probabilities of groups:
## Low Medium High
## 0.4563591 0.4039900 0.1396509
##
## Group means:
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## Low -0.10343644 0.3257195 -0.16227737 0.01140763 0.09468940
## Medium 0.01184555 -0.1125907 -0.03761502 -0.05171899 -0.01382942
## High 0.30374803 -0.7386962 0.63911414 0.11233716 -0.26942491
## free.sulfur.dioxide total.sulfur.dioxide density pH
## Low 0.09054020 0.2368601 0.15958686 -0.01321902
## Medium -0.02901657 -0.1519207 -0.05976594 0.07613099
## High -0.21193163 -0.3345403 -0.34861274 -0.17703821
## sulphates alcohol
## Low -0.2897043 -0.4752330
## Medium 0.1616162 0.2002472
## High 0.4791795 0.9737073
##
## Coefficients of linear discriminants:
## LD1 LD2
## fixed.acidity 0.281258868 -0.3376328
## volatile.acidity -0.358127165 0.3668065
## citric.acid -0.014783857 1.2425645
## residual.sugar 0.117353899 0.3720766
## chlorides -0.205723704 -0.3867054
## free.sulfur.dioxide 0.060419283 -0.2986471
## total.sulfur.dioxide -0.310539452 0.2936798
## density -0.236132419 -0.5849021
## pH 0.002834069 -0.2108362
## sulphates 0.483752209 -0.2395399
## alcohol 0.711858238 -0.3497331
##
## Proportion of trace:
## LD1 LD2
## 0.9371 0.0629
pred_lda <- predict(model_lda, test_scaled)
confusionMatrix(pred_lda$class, test_scaled$quality_cat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Medium High
## Low 119 53 4
## Medium 34 68 17
## High 3 17 26
##
## Overall Statistics
##
## Accuracy : 0.6246
## 95% CI : (0.5709, 0.6762)
## No Information Rate : 0.4575
## P-Value [Acc > NIR] : 4.116e-10
##
## Kappa : 0.3786
##
## Mcnemar's Test P-Value : 0.2316
##
## Statistics by Class:
##
## Class: Low Class: Medium Class: High
## Sensitivity 0.7628 0.4928 0.55319
## Specificity 0.6919 0.7488 0.93197
## Pos Pred Value 0.6761 0.5714 0.56522
## Neg Pred Value 0.7758 0.6847 0.92881
## Prevalence 0.4575 0.4047 0.13783
## Detection Rate 0.3490 0.1994 0.07625
## Detection Prevalence 0.5161 0.3490 0.13490
## Balanced Accuracy 0.7274 0.6208 0.74258
lda_values <- predict(model_lda)$x
lda_df <- data.frame(
LD1 = lda_values[,1],
LD2 = lda_values[,2],
Quality = train_scaled$quality_cat
)
ggplot(lda_df, aes(x = LD1,
y = LD2,
color = Quality)) +
geom_point(size = 2, alpha = 0.7) +
labs(title = "Visualisasi Hasil LDA")
acc_olr <- confusionMatrix(pred_olr, test$quality_ord)$overall['Accuracy']
acc_lda <- confusionMatrix(pred_lda$class, test_scaled$quality_cat)$overall['Accuracy']
acc_olr
## Accuracy
## 0.6217009
acc_lda
## Accuracy
## 0.6246334