library(readr)
library(dplyr)
library(knitr)
library(kableExtra)
library(glmnet)
library(corrplot)
library(caret)
library(tidyr)

# Fix the RNG seed so the random steps further down are reproducible.
set.seed(123)

# 1. Load the data
# The CSV is semicolon-delimited; readr's column-type message is suppressed.
data_wine <- read_delim(
  file = "D:/Semester 2/Machine Learning/winequality-red.csv",
  delim = ";",
  show_col_types = FALSE
)

# --- TABLE A: DATA DIMENSIONS ---
# Two-row summary: number of observations (rows) and variables (columns).
dimensi_df <- data.frame(
  Metric = c("Jumlah Baris (Observasi)", "Jumlah Kolom (Variabel)"),
  Value = c(nrow(data_wine), ncol(data_wine))
)

# Render as a styled table. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
kable(dimensi_df, caption = "Tabel A: Dimensi Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
## Tabel A: Dimensi Dataset
## Metric Value
## Jumlah Baris (Observasi) 1469
## Jumlah Kolom (Variabel) 12
# --- TABLE B: MISSING VALUE CHECK ---
# Count the NA entries in each column. colSums() returns a *named* vector;
# left as-is, data.frame() turns those names into row names and the rendered
# table shows every variable name twice. unname() keeps the names only in the
# `Variabel` column.
missing_df <- data.frame(
  Variabel = names(data_wine),
  Jumlah_Missing = unname(colSums(is.na(data_wine)))
)

kable(missing_df, caption = "Tabel B: Pengecekan Missing Value") %>%
  kable_styling(bootstrap_options = "condensed", full_width = FALSE)
## Tabel B: Pengecekan Missing Value
## Variabel Jumlah_Missing
## fixed acidity fixed acidity 0
## volatile acidity volatile acidity 0
## citric acid citric acid 0
## residual sugar residual sugar 0
## chlorides chlorides 0
## free sulfur dioxide free sulfur dioxide 0
## total sulfur dioxide total sulfur dioxide 0
## density density 0
## pH pH 0
## sulphates sulphates 0
## alcohol alcohol 0
## quality quality 0
# --- TABLE C: DESCRIPTIVE STATISTICS SUMMARY ---
# For every column compute the minimum, mean, maximum and standard deviation
# (mean and SD rounded to 2 decimals). across() names each result
# "<column>_<statistic>", e.g. "fixed acidity_Min".
summary_table <- data_wine %>%
  summarise(across(everything(), list(
    Min = min,
    Mean = function(v) round(mean(v), 2),
    Max = max,
    SD = function(v) round(sd(v), 2)
  ))) %>%
  # Reshape to one row per variable and one column per statistic. The pattern
  # splits on the LAST underscore only, so a column name that itself contains
  # an underscore is still parsed correctly (a plain names_sep = "_" would
  # split such a name into too many pieces).
  pivot_longer(
    everything(),
    names_to = c("Variabel", ".value"),
    names_pattern = "^(.*)_([^_]+)$"
  )

kable(summary_table, caption = "Tabel C: Statistik Deskriptif Fitur") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
## Tabel C: Statistik Deskriptif Fitur
## Variabel Min Mean Max SD
## fixed acidity 4.60000 8.44 15.90000 1.75
## volatile acidity 0.12000 0.51 1.24000 0.17
## citric acid 0.00000 0.29 1.00000 0.19
## residual sugar 0.90000 2.57 15.50000 1.45
## chlorides 0.01200 0.09 0.61100 0.05
## free sulfur dioxide 1.00000 15.93 72.00000 10.43
## total sulfur dioxide 6.00000 47.74 289.00000 33.57
## density 0.99007 1.00 1.00369 0.00
## pH 2.74000 3.30 3.90000 0.15
## sulphates 0.33000 0.66 2.00000 0.17
## alcohol 8.40000 10.41 14.90000 1.05
## quality 3.00000 5.66 8.00000 0.80
# --- CORRELATION MATRIX VISUALISATION ---
# Pairwise correlations between all columns of the data set.
M <- cor(data_wine)

# Upper-triangle colour heatmap with the coefficients printed in each cell;
# variables are reordered by hierarchical clustering and the diagonal is hidden.
corrplot(
  M,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45,
  diag = FALSE,
  title = "\n Gambar 1: Heatmap Korelasi",
  mar = c(0, 0, 1, 0)
)

# 2. Model preparation (x and y)
# Fail early with a clear message if the response column is absent.
stopifnot("quality" %in% names(data_wine))

# Response vector.
y <- data_wine$quality

# Predictor matrix: every column except the response. Columns are selected by
# name instead of `-which(...)`: with a negative index, an empty match would
# silently select zero columns rather than all of them.
x <- as.matrix(data_wine[, setdiff(names(data_wine), "quality")])

# Split the data (80% train, 20% test)
trainIndex <- createDataPartition(y, p = 0.8, list = FALSE)
x_train <- x[trainIndex, ]
x_test  <- x[-trainIndex, ]
y_train <- y[trainIndex]
y_test  <- y[-trainIndex]

# 3. Model training & cross-validation
# cv.glmnet() draws its folds at random on every call. To compare different
# alpha values fairly, all three fits share one pre-computed fold assignment
# (10 folds, matching cv.glmnet's default `nfolds`), so differences between
# the models are not caused by different fold splits.
cv_foldid <- sample(rep(seq_len(10), length.out = length(y_train)))

cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0, foldid = cv_foldid)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1, foldid = cv_foldid)
cv_en    <- cv.glmnet(x_train, y_train, alpha = 0.5, foldid = cv_foldid)

# --- CROSS-VALIDATION PLOTS (Point 6) ---
cat("### Gambar 2: Plot Cross-Validation untuk Penentuan Lambda Terbaik")
## ### Gambar 2: Plot Cross-Validation untuk Penentuan Lambda Terbaik

# Put the three CV curves side by side. par() returns the previous settings,
# which are kept so they can be restored afterwards.
old_par <- par(mfrow = c(1, 3))
plot(cv_ridge, main = "Ridge (Alpha 0)")
plot(cv_lasso, main = "Lasso (Alpha 1)")
plot(cv_en,    main = "Elastic Net (0.5)")

# Restore whatever layout was active before, rather than assuming it was 1x1.
par(old_par)

# 4. Evaluation function

#' Score a fitted model on a data set
#'
#' Predicts with `model` at the penalty `best_lambda` and summarises the
#' prediction error as RMSE and MAE.
#'
#' @param model Fitted model object; passed to `predict()` together with
#'   `s` and `newx`.
#' @param best_lambda Numeric penalty value, forwarded to `predict()` as `s`.
#' @param x_baru Predictor matrix to predict on (forwarded as `newx`).
#' @param y_asli Observed response values matching the rows of `x_baru`.
#' @param nama_model Label stored in the `Model` column.
#' @return A one-row data frame with columns `Model`, `Lambda_Best`
#'   (rounded to 5 decimals), `RMSE` and `MAE` (each rounded to 4 decimals).
hitung_evaluasi <- function(model, best_lambda, x_baru, y_asli, nama_model) {
  y_hat <- predict(model, s = best_lambda, newx = x_baru)

  # RMSE() and MAE() are not defined in this file; presumably they come from
  # caret, which is attached at the top of the script.
  galat_rmse <- round(RMSE(y_hat, y_asli), 4)
  galat_mae <- round(MAE(y_hat, y_asli), 4)

  data.frame(
    Model = nama_model,
    Lambda_Best = round(best_lambda, 5),
    RMSE = galat_rmse,
    MAE = galat_mae
  )
}

# --- TABLE D: COMPARISON OF EVALUATION METRICS (Points 7 & 8) ---
# One row per model: each is scored on the held-out test set at its own
# cross-validated `lambda.min`.
tabel_eval <- rbind(
  hitung_evaluasi(cv_ridge, cv_ridge$lambda.min, x_test, y_test, "Ridge"),
  hitung_evaluasi(cv_lasso, cv_lasso$lambda.min, x_test, y_test, "Lasso"),
  hitung_evaluasi(cv_en,    cv_en$lambda.min,    x_test, y_test, "Elastic Net")
)

# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
kable(tabel_eval, caption = "Tabel D: Perbandingan Metrik Evaluasi & Lambda") %>%
  kable_styling(bootstrap_options = c("striped", "bordered"), full_width = FALSE)
## Tabel D: Perbandingan Metrik Evaluasi & Lambda
## Model Lambda_Best RMSE MAE
## Ridge 0.04027 0.6555 0.5092
## Lasso 0.00558 0.6578 0.5107
## Elastic Net 0.00844 0.6578 0.5107
# --- TABLE E: FINAL COEFFICIENTS OF EACH MODEL ---
# One row per term (intercept plus predictors) and one column per model, each
# taken at that model's `lambda.min` and rounded to 4 decimals. The term names
# are read at the same `s` as the coefficient columns for consistency.
tabel_koef <- data.frame(
  Fitur = rownames(coef(cv_ridge, s = "lambda.min")),
  Ridge = round(as.vector(coef(cv_ridge, s = "lambda.min")), 4),
  Lasso = round(as.vector(coef(cv_lasso, s = "lambda.min")), 4),
  ElasticNet = round(as.vector(coef(cv_en, s = "lambda.min")), 4)
)

kable(tabel_koef, caption = "Tabel E: Perbandingan Koefisien Akhir") %>%
  kable_styling(bootstrap_options = "hover", full_width = FALSE)
## Tabel E: Perbandingan Koefisien Akhir
## Fitur Ridge Lasso ElasticNet
## (Intercept) 26.0095 3.9312 3.9908
## fixed acidity 0.0240 0.0000 0.0005
## volatile acidity -0.9747 -0.9939 -1.0012
## citric acid -0.1356 -0.0976 -0.1178
## residual sugar 0.0421 0.0308 0.0319
## chlorides -1.7447 -1.7223 -1.7512
## free sulfur dioxide 0.0029 0.0023 0.0026
## total sulfur dioxide -0.0030 -0.0028 -0.0029
## density -22.3933 0.0000 0.0000
## pH -0.3129 -0.4010 -0.4174
## sulphates 0.8763 0.8471 0.8541
## alcohol 0.2784 0.3067 0.3063
# --- TABLE F: FEATURE SELECTION BY LASSO ---
# A feature counts as selected when its lasso coefficient at `lambda.min` is
# non-zero. The test is done on the raw coefficients and rounding is applied
# only afterwards: filtering on values already rounded to 4 decimals would
# report a small but non-zero coefficient (|beta| < 0.00005) as dropped.
koef_lasso <- coef(cv_lasso, s = "lambda.min")

fitur_terpilih <- data.frame(
  Fitur = rownames(koef_lasso),
  Lasso = as.vector(koef_lasso)
) %>%
  filter(Lasso != 0 & Fitur != "(Intercept)") %>%
  mutate(Lasso = round(Lasso, 4))

# Highlight the coefficient column (white bold text on green).
kable(fitur_terpilih, caption = "Tabel F: Fitur yang Berhasil Diseleksi Lasso") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  column_spec(2, bold = TRUE, color = "white", background = "forestgreen")
## Tabel F: Fitur yang Berhasil Diseleksi Lasso
## Fitur Lasso
## volatile acidity -0.9939
## citric acid -0.0976
## residual sugar 0.0308
## chlorides -1.7223
## free sulfur dioxide 0.0023
## total sulfur dioxide -0.0028
## pH -0.4010
## sulphates 0.8471
## alcohol 0.3067