Outline - Eksplorasi Berdasarkan provinsi - Cek distribusi kelas peringkat - Modeling K-NN (K-fold cross validation) - Imbalance (SMOTE dan tidak SMOTE) - Perbandingan analisis SMOTE dan tidak
SMOTE dan tidak SMOTE SPLIT DATA (HOLD OUT dan K-FOLD)
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.2
## ✔ recipes 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.3
library(kknn)
## Warning: package 'kknn' was built under R version 4.4.3
df <- read_excel("C:/Users/nabil/Downloads/Pemodelan Klasifikasi/Data Tugas-2.xlsx")
str(df)
## tibble [6,648 × 7] (S3: tbl_df/tbl/data.frame)
## $ Nomor : num [1:6648] 1 2 3 4 5 6 7 8 9 10 ...
## $ Provinsi : chr [1:6648] "Prov. Jawa Timur" "Prov. Nusa Tenggara Timur" "Prov. Jawa Timur" "Prov. Sumatera Utara" ...
## $ Peringkat Akreditasi: chr [1:6648] "A" "B" "A" "A" ...
## $ Lit_2023 : num [1:6648] 20 80 91.1 57.8 55.6 ...
## $ Num_2023 : num [1:6648] 40 48.8 82.2 57.8 66.7 ...
## $ Lit_2024 : num [1:6648] 0 82.2 93.3 64.4 83.3 ...
## $ Num_2024 : num [1:6648] 0 62.8 93.3 62.2 66.7 ...
df <- df %>%
select(-Nomor)
glimpse(df)
## Rows: 6,648
## Columns: 6
## $ Provinsi <chr> "Prov. Jawa Timur", "Prov. Nusa Tenggara Timur"…
## $ `Peringkat Akreditasi` <chr> "A", "B", "A", "A", "B", "A", "C", "B", "A", "C…
## $ Lit_2023 <dbl> 20.00, 80.00, 91.11, 57.78, 55.56, 51.11, 56.52…
## $ Num_2023 <dbl> 40.00, 48.78, 82.22, 57.78, 66.67, 51.11, 61.90…
## $ Lit_2024 <dbl> 0.000, 82.222, 93.333, 64.444, 83.333, 71.795, …
## $ Num_2024 <dbl> 0.000, 62.791, 93.333, 62.222, 66.667, 55.556, …
Filter provinsi DKI, DIY, Bali, Banten
df <- df %>%
filter(Provinsi %in% c("Prov. D.K.I. Jakarta",
"Prov. D.I. Yogyakarta",
"Prov. Bali",
"Prov. Banten"))
# cek hasil
df %>% count(Provinsi)
## # A tibble: 4 × 2
## Provinsi n
## <chr> <int>
## 1 Prov. Bali 66
## 2 Prov. Banten 394
## 3 Prov. D.I. Yogyakarta 102
## 4 Prov. D.K.I. Jakarta 229
# hitung jumlah per provinsi
prov_count <- df %>%
count(Provinsi)
Jumlah sekolah per provinsi
# pie chart dengan label jumlah
ggplot(prov_count, aes(x = "", y = n, fill = Provinsi)) +
geom_bar(stat = "identity", width = 1, color = "white") +
coord_polar(theta = "y") +
geom_text(aes(label = n),
position = position_stack(vjust = 0.5),
color = "black",
size = 4) +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Jumlah Sekolah per Provinsi",
fill = "Provinsi") +
theme_void()
summary(df[, c("Lit_2023","Num_2023","Lit_2024","Num_2024")])
## Lit_2023 Num_2023 Lit_2024 Num_2024
## Min. : 0.00 Min. : 6.67 Min. : 0.00 Min. : 0.00
## 1st Qu.: 72.78 1st Qu.: 65.89 1st Qu.: 77.78 1st Qu.: 71.11
## Median : 91.11 Median : 83.33 Median : 93.33 Median : 87.81
## Mean : 81.80 Mean : 77.17 Mean : 83.46 Mean : 80.11
## 3rd Qu.: 97.78 3rd Qu.: 93.25 3rd Qu.: 97.78 3rd Qu.: 95.56
## Max. :100.00 Max. :100.00 Max. :100.00 Max. :100.00
df <- df %>%
mutate(across(where(is.character),as.factor))
Cek Missing Value
plot_intro(df,theme_config = theme_classic())
Ubah Kolom “Peringkat Akreditasi” jadi Y biar lebih gampang
df <- df %>%
rename(Y = `Peringkat Akreditasi`)
Barplot Frekuensi Peringkat Akreditasi
ggplot(df, aes(x = Y, fill = Y)) +
geom_bar() +
labs(title = "Frekuensi Peringkat Akreditasi",
x = "Peringkat Akreditasi",
y = "Frekuensi") +
scale_fill_brewer(palette = "Oranges") +
theme_classic()
# Hitung frekuensi + proporsi
freq_tab <- df %>%
count(Y) %>%
mutate(prop = n / sum(n) * 100)
# Plot
ggplot(freq_tab, aes(x = Y, y = n, fill = Y)) +
geom_col() +
geom_text(aes(label = paste0(n, " (", round(prop,1), "%)")),
vjust = -0.5, size = 4) +
labs(title = "Frekuensi Peringkat Akreditasi",
x = "Peringkat Akreditasi",
y = "Frekuensi") +
scale_fill_brewer(palette = "Oranges") +
theme_classic()
hapus observasi pada kelas “Tidak Terakreditasi” karena terlalu sedikit
TTA <- df %>%
filter(Y == "Tidak Terakreditasi")
TTA
## # A tibble: 2 × 6
## Provinsi Y Lit_2023 Num_2023 Lit_2024 Num_2024
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Prov. Banten Tidak Terakreditasi 66.7 66.7 0 100
## 2 Prov. Banten Tidak Terakreditasi 25 18.8 4.54 25
df <- df %>%
filter(Y != "Tidak Terakreditasi") %>%
droplevels()
# Hitung frekuensi + proporsi
freq_tab <- df %>%
count(Y) %>%
mutate(prop = n / sum(n) * 100)
# Plot
ggplot(freq_tab, aes(x = Y, y = n, fill = Y)) +
geom_col() +
geom_text(aes(label = paste0(n, " (", round(prop,1), "%)")),
vjust = -0.5, size = 4) +
labs(title = "Frekuensi Peringkat Akreditasi",
x = "Peringkat Akreditasi",
y = "Frekuensi") +
scale_fill_brewer(palette = "Oranges") +
theme_classic()
Boxplot sebaran nilai peubah terhadap kelas Y
plot_boxplot(data = df,
by = "Y",
geom_boxplot_args = list(fill="#03A9F4"),
ggtheme = theme_classic())
sama saja dengan di atas
df_long <- df %>%
pivot_longer(cols = c(Lit_2023, Num_2023, Lit_2024, Num_2024),
names_to = "Variabel",
values_to = "Skor")
ggplot(df_long, aes(x = Y, y = Skor, fill = Y)) +
geom_boxplot() +
facet_wrap(~ Variabel, scales = "free_y") +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Distribusi Nilai Literasi & Numerasi per Akreditasi",
x = "Akreditasi", y = "Skor") +
theme_minimal()
Kelas A B C nilai masing-masing peubah terlihat berbeda rentang nilainya yaitu A lebih tinggi dari B C, tapi A banyak outlier
Proporsi peringkat akreditasi per provinsi
df %>%
count(Provinsi, Y) %>%
ggplot(aes(x = Provinsi, y = n, fill = Y)) +
geom_col(position = "fill") +
labs(title = "Proporsi Peringkat Akreditasi per Provinsi",
x = "Provinsi", y = "Proporsi") +
theme_classic() +
scale_y_continuous(labels = scales::percent) +
theme_minimal() +
scale_fill_brewer(palette = "Oranges") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Frekuensi Peringkat Akreditasi per Provinsi
df %>%
count(Provinsi, Y) %>%
arrange(Provinsi, desc(n)) %>%
ggplot(aes(x = Provinsi, y = n, fill = Y)) +
geom_col(position = "stack") +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Distribusi Peringkat Akreditasi per Provinsi",
x = "Provinsi",
y = "Frekuensi") +
theme_minimal() + # background putih polos, no grid
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.background = element_blank(),
plot.background = element_blank(),
legend.background = element_blank())
Korelasi masing-masing variabel
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
df %>%
select(Lit_2023, Num_2023, Lit_2024, Num_2024) %>%
ggpairs()
library(GGally)
library(ggplot2)
df %>%
select(Lit_2023, Num_2023, Lit_2024, Num_2024) %>%
ggpairs(
aes(alpha = 0.6),
upper = list(continuous = wrap("cor", size = 4, stars = FALSE)), # hanya angka korelasi
lower = list(continuous = wrap("smooth",
color = "black", alpha = 0.6, se = FALSE)),
diag = list(continuous = wrap("densityDiag",
fill = "orange", alpha = 0.6))
) +
theme_classic()
df %>%
summarise(across(c(Lit_2023, Num_2023, Lit_2024, Num_2024), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(c(Lit_2023, Num_2023, Lit_2024, Num_2024), mean, na.rm =
## TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## # A tibble: 1 × 4
## Lit_2023 Num_2023 Lit_2024 Num_2024
## <dbl> <dbl> <dbl> <dbl>
## 1 81.9 77.3 83.7 80.2
df %>%
summarise(Lit_2023 = mean(Lit_2023, na.rm=TRUE),
Lit_2024 = mean(Lit_2024, na.rm=TRUE),
Num_2023 = mean(Num_2023, na.rm=TRUE),
Num_2024 = mean(Num_2024, na.rm=TRUE)) %>%
tidyr::pivot_longer(everything(), names_to = "Tahun", values_to = "Rata2") %>%
ggplot(aes(x = Tahun, y = Rata2, fill = Tahun)) +
geom_col() +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Rata-rata Literasi & Numerasi (2023 vs 2024)",
x = "", y = "Rata-rata") +
theme_minimal()
library(ggplot2)
library(dplyr)
library(tidyr)
df %>%
select(Lit_2023, Num_2023, Lit_2024, Num_2024) %>%
pivot_longer(cols = everything(),
names_to = "Variabel",
values_to = "Nilai") %>%
ggplot(aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot(alpha = 0.7) +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Boxplot Literasi & Numerasi (2023 vs 2024)",
x = "Variabel", y = "Nilai") +
theme_minimal()
# Misal untuk variabel Lit_2023
Q1 <- quantile(df$Lit_2023, 0.25, na.rm = TRUE)
Q3 <- quantile(df$Lit_2023, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
outlier_Lit2023 <- df %>%
filter(Lit_2023 < (Q1 - 1.5 * IQR) | Lit_2023 > (Q3 + 1.5 * IQR))
outlier_Lit2023
## # A tibble: 45 × 6
## Provinsi Y Lit_2023 Num_2023 Lit_2024 Num_2024
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Prov. Banten C 4.35 34.8 0 0
## 2 Prov. Banten C 20 36 20 38
## 3 Prov. Banten B 0 16.7 78.6 64.3
## 4 Prov. Banten C 33.3 33.3 50 44.4
## 5 Prov. Banten C 31.4 28.6 55.3 63.2
## 6 Prov. Banten C 30 35 26.3 15.8
## 7 Prov. Banten B 23.1 38.5 7.69 15.4
## 8 Prov. Banten B 33.3 35.6 49.0 62.2
## 9 Prov. Banten C 16.1 26.5 38.9 27.8
## 10 Prov. Banten C 31.1 35.6 24.4 26.7
## # ℹ 35 more rows
outliers <- df %>%
pivot_longer(cols = c(Lit_2023, Num_2023, Lit_2024, Num_2024),
names_to = "Variabel", values_to = "Nilai") %>%
group_by(Variabel) %>%
mutate(Q1 = quantile(Nilai, 0.25, na.rm = TRUE),
Q3 = quantile(Nilai, 0.75, na.rm = TRUE),
IQR = Q3 - Q1,
Outlier = ifelse(Nilai < Q1 - 1.5*IQR | Nilai > Q3 + 1.5*IQR, TRUE, FALSE)) %>%
filter(Outlier == TRUE)
outliers
## # A tibble: 173 × 8
## # Groups: Variabel [4]
## Provinsi Y Variabel Nilai Q1 Q3 IQR Outlier
## <fct> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 Prov. Banten C Lit_2023 4.35 73.3 97.8 24.5 TRUE
## 2 Prov. Banten C Lit_2024 0 77.8 97.8 20 TRUE
## 3 Prov. Banten C Num_2024 0 71.1 95.6 24.4 TRUE
## 4 Prov. Banten C Lit_2024 15.8 77.8 97.8 20 TRUE
## 5 Prov. Banten C Num_2024 21.1 71.1 95.6 24.4 TRUE
## 6 Prov. Banten C Lit_2023 20 73.3 97.8 24.5 TRUE
## 7 Prov. Banten C Lit_2024 20 77.8 97.8 20 TRUE
## 8 Prov. Banten C Lit_2024 22.2 77.8 97.8 20 TRUE
## 9 Prov. Banten B Lit_2023 0 73.3 97.8 24.5 TRUE
## 10 Prov. Banten B Num_2023 16.7 66.7 93.3 26.7 TRUE
## # ℹ 163 more rows
df_avg <- df %>%
rowwise() %>%
mutate(Rata_All = mean(c_across(c(Lit_2023, Num_2023, Lit_2024, Num_2024)), na.rm = TRUE)) %>%
ungroup()
# Buat rata-rata per observasi dari semua variabel
df_avg <- df %>%
mutate(Avg_Skor = rowMeans(select(., c(Lit_2023, Num_2023, Lit_2024, Num_2024)), na.rm = TRUE))
# Boxplot per Akreditasi
ggplot(df_avg, aes(x = Y, y = Avg_Skor, fill = Y)) +
geom_boxplot(alpha = 0.7) +
# Tambahkan titik mean per Akreditasi
stat_summary(fun = mean, geom = "point",
shape = 18, size = 4, color = "red") +
# Tambahkan label angka mean
stat_summary(fun = mean, geom = "text",
aes(label = round(..y.., 1)),
vjust = -1, color = "black") +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Distribusi Rata-Rata Skor Literasi & Numerasi per Akreditasi",
x = "Akreditasi", y = "Rata-Rata Skor") +
theme_minimal()
## Warning: The dot-dot notation (`..y..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(y)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(dplyr)
anomali <- df_avg %>%
group_by(Y) %>%
mutate(Q1 = quantile(Avg_Skor, 0.25, na.rm = TRUE),
Q3 = quantile(Avg_Skor, 0.75, na.rm = TRUE),
IQR = Q3 - Q1,
Lower = Q1 - 1.5 * IQR,
Upper = Q3 + 1.5 * IQR,
Outlier = ifelse(Avg_Skor < Lower | Avg_Skor > Upper, TRUE, FALSE)) %>%
filter(Outlier == TRUE)
anomali
## # A tibble: 29 × 13
## # Groups: Y [2]
## Provinsi Y Lit_2023 Num_2023 Lit_2024 Num_2024 Avg_Skor Q1 Q3 IQR
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Prov. B… A 80 75.6 62.2 53.3 67.8 86.8 96.7 9.82
## 2 Prov. B… A 55.6 53.3 88.9 80 69.4 86.8 96.7 9.82
## 3 Prov. B… A 66.7 66.7 75.6 75.6 71.1 86.8 96.7 9.82
## 4 Prov. B… A 77.3 50 90 60 69.3 86.8 96.7 9.82
## 5 Prov. B… A 84.4 73.3 64.4 64.4 71.7 86.8 96.7 9.82
## 6 Prov. B… A 66.7 68.9 75.6 68.9 70.0 86.8 96.7 9.82
## 7 Prov. B… A 62.2 60 68.9 60.9 63.0 86.8 96.7 9.82
## 8 Prov. B… A 68.9 60 62.2 91.1 70.6 86.8 96.7 9.82
## 9 Prov. B… A 44.4 72.2 95.2 61.9 68.5 86.8 96.7 9.82
## 10 Prov. D… A 75 60.7 75 67.9 69.6 86.8 96.7 9.82
## # ℹ 19 more rows
## # ℹ 3 more variables: Lower <dbl>, Upper <dbl>, Outlier <lgl>
library(dplyr)
library(tidyr)
library(ggplot2)
# Ubah ke long format
df_long <- df %>%
pivot_longer(cols = c(Lit_2023, Num_2023, Lit_2024, Num_2024),
names_to = "Variabel", values_to = "Nilai")
# Hitung rata-rata tiap variabel
mean_df <- df_long %>%
group_by(Variabel) %>%
summarise(Rata2 = mean(Nilai, na.rm = TRUE), .groups = "drop")
# Plot boxplot + titik rata-rata + label nilai
ggplot(df_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot(alpha = 0.7) +
geom_point(data = mean_df, aes(x = Variabel, y = Rata2),
color = "red", size = 3) +
geom_text(data = mean_df, aes(x = Variabel, y = Rata2,
label = round(Rata2, 1)),
vjust = -1, color = "red", fontface = "bold") +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Boxplot Literasi & Numerasi dengan Rata-rata",
x = "", y = "Nilai") +
theme_minimal()
library(ggplot2)
library(dplyr)
library(tidyr)
# pivot ke long format
df_long <- df %>%
pivot_longer(cols = c(Lit_2023, Num_2023, Lit_2024, Num_2024),
names_to = "Variabel",
values_to = "Skor")
# plot density
ggplot(df_long, aes(x = Skor, fill = Y, color = Y)) +
geom_density(alpha = 0.3) +
facet_wrap(~ Variabel, scales = "free") +
labs(title = "Distribusi Density Skor Literasi & Numerasi per Akreditasi",
x = "Skor", y = "Density") +
theme_minimal()
library(corrplot)
## corrplot 0.95 loaded
cor_mat <- df %>%
select(Lit_2023, Num_2023, Lit_2024, Num_2024) %>%
cor(use = "complete.obs")
corrplot(cor_mat, method = "color", type = "upper",
addCoef.col = "white", number.cex = 0.7,
tl.col = "black", tl.srt = 45)
untuk analisis klasifikasi kolom Provinsi tidak digunakan sebagai variabel prediktor
df<-df %>%
select(-Provinsi)
1.Split data → imbal vs stratified. 2.Cross-validation di training. 3.SMOTE hanya di train. 4.KNN dengan normalisasi. 5.Evaluasi model (Accuracy, Balanced Accuracy, F1, AUC, confusion matrix).
# === 0. Library ===
library(tidymodels)
library(themis) # untuk SMOTE / handling imbalance
## Warning: package 'themis' was built under R version 4.4.3
library(dplyr)
library(ggplot2)
set.seed(123)
# === 1. Holdout Split ===
# a) tanpa stratifikasi (imbalance bisa muncul)
split_imbal <- initial_split(df, prop = 0.8)
train_imbal <- training(split_imbal)
test_imbal <- testing(split_imbal)
# b) dengan stratifikasi (lebih seimbang)
split_strata <- initial_split(df, prop = 0.8, strata = Y)
train_strata <- training(split_strata)
test_strata <- testing(split_strata)
# Cek distribusi kelas
table(train_imbal$Y)
##
## A B C
## 430 161 40
table(test_imbal$Y)
##
## A B C
## 112 40 6
table(train_strata$Y)
##
## A B C
## 436 161 34
table(test_strata$Y)
##
## A B C
## 106 40 12
# === 2. Cross Validation pada training (pakai stratified split lebih disarankan) ===
folds <- vfold_cv(train_strata, v = 10, strata = Y)
# === 3. Recipe (preprocessing + balancing) ===
knn_recipe <- recipe(Y ~ ., data = train_strata) %>%
step_normalize(all_numeric_predictors()) %>% # normalisasi penting untuk KNN
step_smote(Y) # oversampling dengan SMOTE (train saja)
no_prep <- recipe(Y ~ ., data = train_strata)
# === 4. Model ===
knn_spec <- nearest_neighbor(neighbors = 7, weight_func = "rectangular", dist_power = 2) %>%
set_engine("kknn") %>%
set_mode("classification")
# === 5. Workflow ===
knn_wf <- workflow() %>%
add_model(knn_spec) %>%
add_recipe(knn_recipe)
knn_wff<- workflow() %>%
add_model(knn_spec) %>%
add_recipe(no_prep)
# === 6. Latih model dengan cross-validation SMOTE ===
knn_res <- fit_resamples(
knn_wf,
resamples = folds,
metrics = metric_set(accuracy, recall, precision, f_meas,bal_accuracy, roc_auc, f_meas)
)
collect_metrics(knn_res)
## # A tibble: 6 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy multiclass 0.727 10 0.0179 Preprocessor1_Model1
## 2 bal_accuracy macro 0.737 10 0.0257 Preprocessor1_Model1
## 3 f_meas macro 0.577 20 0.0217 Preprocessor1_Model1
## 4 precision macro 0.564 10 0.0275 Preprocessor1_Model1
## 5 recall macro 0.626 10 0.0430 Preprocessor1_Model1
## 6 roc_auc hand_till 0.794 10 0.0264 Preprocessor1_Model1
# === 6. Latih model dengan cross-validation TIDAK SMOTE ===
knn_ress <- fit_resamples(
knn_wff,
resamples = folds,
metrics = metric_set(accuracy, recall, precision, f_meas,bal_accuracy, roc_auc, f_meas)
)
collect_metrics(knn_ress)
## # A tibble: 6 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy multiclass 0.791 10 0.0114 Preprocessor1_Model1
## 2 bal_accuracy macro 0.678 10 0.0149 Preprocessor1_Model1
## 3 f_meas macro 0.538 20 0.0188 Preprocessor1_Model1
## 4 precision macro 0.603 10 0.0441 Preprocessor1_Model1
## 5 recall macro 0.524 10 0.0227 Preprocessor1_Model1
## 6 roc_auc hand_till 0.769 10 0.0224 Preprocessor1_Model1
# === 7. Final fit pada full training (SMOTE applied) ===
final_knn_smote <- fit(knn_wf, data = train_strata)
# === 7. Final fit pada full training ( NO SMOTE) ===
final_knn_nosmote <- fit(knn_wff, data = train_strata)
# === 8. Evaluasi di test set (SMOTE) ===
preds_smote <- predict(final_knn_smote, test_strata, type = "prob") %>%
bind_cols(predict(final_knn_smote, test_strata)) %>%
bind_cols(test_strata %>% select(Y))
# === 8. Evaluasi di test set (NO SMOTE) ===
preds_nosmote <- predict(final_knn_nosmote, test_strata, type = "prob") %>%
bind_cols(predict(final_knn_nosmote, test_strata)) %>%
bind_cols(test_strata %>% select(Y))
# con matrix SMOTE
confussion_matrix_smote <- preds_smote %>%
conf_mat(truth=Y,estimate=.pred_class)
autoplot(confussion_matrix_smote,type = "heatmap")+
scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
preds_smote %>%
conf_mat(truth=Y,estimate=.pred_class) %>%
summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.747
## 2 kap multiclass 0.500
## 3 sens macro 0.677
## 4 spec macro 0.857
## 5 ppv macro 0.615
## 6 npv macro 0.841
## 7 mcc multiclass 0.503
## 8 j_index macro 0.534
## 9 bal_accuracy macro 0.767
## 10 detection_prevalence macro 0.333
## 11 precision macro 0.615
## 12 recall macro 0.677
## 13 f_meas macro 0.636
#metrik evaluasi NO SMOTE
confussion_matrix_nosmote<- preds_nosmote %>%
conf_mat(truth=Y,estimate=.pred_class)
autoplot(confussion_matrix_nosmote,type = "heatmap")+
scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
preds_nosmote %>%
conf_mat(truth=Y,estimate=.pred_class) %>%
summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.804
## 2 kap multiclass 0.549
## 3 sens macro 0.587
## 4 spec macro 0.849
## 5 ppv macro 0.706
## 6 npv macro 0.897
## 7 mcc multiclass 0.561
## 8 j_index macro 0.436
## 9 bal_accuracy macro 0.718
## 10 detection_prevalence macro 0.333
## 11 precision macro 0.706
## 12 recall macro 0.587
## 13 f_meas macro 0.619
Prediksi data tidak terakreditasi
TTA
## # A tibble: 2 × 6
## Provinsi Y Lit_2023 Num_2023 Lit_2024 Num_2024
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Prov. Banten Tidak Terakreditasi 66.7 66.7 0 100
## 2 Prov. Banten Tidak Terakreditasi 25 18.8 4.54 25
TTA_new <- TTA %>% select(-Y)
# Prediksi ke data baru
pred_TTA <- predict(final_knn_smote, new_data = TTA_new, type = "prob") %>%
bind_cols(predict(final_knn_smote, new_data = TTA_new)) %>%
bind_cols(TTA_new)
pred_TTA
## # A tibble: 2 × 9
## .pred_A .pred_B .pred_C .pred_class Provinsi Lit_2023 Num_2023 Lit_2024
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl>
## 1 0 0.714 0.286 B Prov. Banten 66.7 66.7 0
## 2 0 0 1 C Prov. Banten 25 18.8 4.54
## # ℹ 1 more variable: Num_2024 <dbl>
prediksi <- predict(final_knn_smote, new_data = TTA_new)
# Gabungkan dengan data baru
hasil_pred <- TTA %>%
bind_cols(prediksi %>% rename(prediksi_knn_Hold_SMOTE = .pred_class))
hasil_pred
## # A tibble: 2 × 7
## Provinsi Y Lit_2023 Num_2023 Lit_2024 Num_2024 prediksi_knn_Hold_SM…¹
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 Prov. Banten Tidak… 66.7 66.7 0 100 B
## 2 Prov. Banten Tidak… 25 18.8 4.54 25 C
## # ℹ abbreviated name: ¹prediksi_knn_Hold_SMOTE
# --- Prediksi data baru ---
prediksi <- predict(final_knn_nosmote, new_data = TTA_new)
# Gabungkan dengan data baru
hasil_pred <- TTA %>%
bind_cols(prediksi %>% rename(prediksi_knn_Hold_noSMOTE = .pred_class))
hasil_pred
## # A tibble: 2 × 7
## Provinsi Y Lit_2023 Num_2023 Lit_2024 Num_2024 prediksi_knn_Hold_no…¹
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 Prov. Banten Tidak… 66.7 66.7 0 100 B
## 2 Prov. Banten Tidak… 25 18.8 4.54 25 B
## # ℹ abbreviated name: ¹prediksi_knn_Hold_noSMOTE
# === Visualisasi Distribusi Kelas ===
# 1. Train Asli (belum di-SMOTE)
p_train <- train_strata %>%
count(Y) %>%
ggplot(aes(x = Y, y = n, fill = Y)) +
geom_col() +
labs(title = "Distribusi Y - Train Asli", x = "Kelas", y = "Jumlah") +
scale_fill_brewer(palette = "Oranges") +
theme_minimal()
# 2. Test Set (selalu asli, tidak di-SMOTE)
p_test <- test_strata %>%
count(Y) %>%
ggplot(aes(x = Y, y = n, fill = Y)) +
geom_col() +
labs(title = "Distribusi Y - Test Set", x = "Kelas", y = "Jumlah") +
scale_fill_brewer(palette = "Oranges") +
theme_minimal()
# 3. Train hasil SMOTE
smote_recipe <- recipe(Y ~ ., data = train_strata) %>%
step_smote(Y)
smote_prep <- prep(smote_recipe)
train_smote <- juice(smote_prep)
p_smote <- train_smote %>%
count(Y) %>%
ggplot(aes(x = Y, y = n, fill = Y)) +
geom_col() +
labs(title = "Distribusi Y - Train Hasil SMOTE", x = "Kelas", y = "Jumlah") +
scale_fill_brewer(palette = "Oranges") +
theme_minimal()
# 4. Gabungkan plot
library(patchwork)
p_train + p_test + p_smote
# === Scatter Plot Train Asli vs SMOTE ===
# Gabungkan data asli dan hasil SMOTE
plot_data <-
train_strata %>% mutate(type = "Asli")
ggplot(plot_data, aes(x = Lit_2023, y = Num_2023, color = Y, shape = type)) +
geom_point(alpha = 0.5, size = 1.5) +
labs(
title = "Visualisasi Data Asli",
subtitle = "Distribusi data sebelum balancing",
x = "Literasi 2023",
y = "Numerasi 2023"
) +
theme_minimal() +
scale_color_brewer(palette = "Dark2")
# === Scatter Plot Train Asli vs SMOTE ===
# Gabungkan data asli dan hasil SMOTE
plot_data <- bind_rows(
train_strata %>% mutate(type = "Asli"),
train_smote %>% mutate(type = "SMOTE")
)
ggplot(plot_data, aes(x = Lit_2023, y = Num_2023, color = Y, shape = type)) +
geom_point(alpha = 0.5, size = 1.5) +
labs(
title = "Visualisasi Data Asli vs Hasil SMOTE",
subtitle = "Perbandingan sebaran kelas setelah balancing",
x = "Literasi 2023",
y = "Numerasi 2023"
) +
theme_minimal() +
scale_color_brewer(palette = "Dark2")
library(tidymodels)
library(themis)
library(dplyr)
library(ggplot2)
library(patchwork)
set.seed(123)
# === 1. Split dengan stratifikasi ===
split_strata <- initial_split(df, prop = 0.8, strata = Y)
train_strata <- training(split_strata)
test_strata <- testing(split_strata)
# === 2. SMOTE pada train ===
smote_recipe <- recipe(Y ~ ., data = train_strata) %>%
step_smote(Y)
smote_prep <- prep(smote_recipe)
train_smote <- juice(smote_prep)
# === 3. Bar plot distribusi kelas ===
p_train <- train_strata %>%
count(Y) %>%
ggplot(aes(x = Y, y = n, fill = Y)) +
geom_col() +
labs(title = "Distribusi Y - Train Asli", x = "Kelas", y = "Jumlah") +
scale_fill_brewer(palette = "Oranges") +
theme_minimal()
p_test <- test_strata %>%
count(Y) %>%
ggplot(aes(x = Y, y = n, fill = Y)) +
geom_col() +
labs(title = "Distribusi Y - Test Set", x = "Kelas", y = "Jumlah") +
scale_fill_brewer(palette = "Oranges") +
theme_minimal()
p_smote <- train_smote %>%
count(Y) %>%
ggplot(aes(x = Y, y = n, fill = Y)) +
geom_col() +
labs(title = "Distribusi Y - Train Hasil SMOTE", x = "Kelas", y = "Jumlah") +
scale_fill_brewer(palette = "Oranges") +
theme_minimal()
bars <- p_train + p_test + p_smote
# === 4. Scatter plot Train Asli vs SMOTE ===
plot_data <- bind_rows(
train_strata %>% mutate(type = "Asli"),
train_smote %>% mutate(type = "SMOTE")
)
p_scatter <- ggplot(plot_data, aes(x = Lit_2023, y = Num_2023,
color = Y, shape = type)) +
geom_point(alpha = 0.6, size = 2) +
labs(
title = "Sebaran Data Asli vs Hasil SMOTE",
subtitle = "Literasi 2023 vs Numerasi 2023",
x = "Literasi 2023",
y = "Numerasi 2023"
) +
theme_minimal() +
scale_color_brewer(palette = "Dark2")
# === 5. Gabungkan semua plot ===
final_plot <- bars / p_scatter
final_plot
library(yardstick)
# === 6. Final fit pada full training (dengan SMOTE) ===
final_knn <- fit(knn_wf, data = train_strata)
# === 7. Prediksi pada test set (tetap original) ===
preds <- predict(final_knn, test_strata, type = "prob") %>%
bind_cols(predict(final_knn, test_strata)) %>%
bind_cols(test_strata %>% select(Y))
# === 8. Confusion matrix ===
confmat <- conf_mat(preds, truth = Y, estimate = .pred_class)
confmat
## Truth
## Prediction A B C
## A 92 14 1
## B 13 21 0
## C 3 7 7
# === 9. Visualisasi confusion matrix (heatmap) ===
p_confmat <- autoplot(confmat, type = "heatmap") +
scale_fill_gradient(low = "white", high = "orange") +
labs(title = "Confusion Matrix - KNN pada Test Set")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
# === 10. Gabungkan dengan plot distribusi & scatter ===
final_plot_all <- (bars / p_scatter) / p_confmat
final_plot_all
library(themis)
no_prep <- recipe(Y~.,data=df)
set.seed(123) # pastikan sama setiap running
folds <- vfold_cv(df, v = 10, strata = Y)
# -----------------------------
# DEFINISIKAN MODEL KNN
# -----------------------------
knn_tune <- nearest_neighbor(neighbors=tune(),weight_func="rectangular",dist_power = 2) %>%
set_engine("kknn") %>%
set_mode("classification")
# -----------------------------
# MENENTUKAN NILAI HYPERPARAMETER YANG DIGUNAKAN
# -----------------------------
knn_grid <- grid_regular(neighbors(range = c(2,50)),levels= 50)
fitted_knn <- function(x){
mod <- extract_fit_engine(x)
fitted(mod)
}
smote_rec <- recipe(Y ~ ., data = df) %>%
step_smote(Y)
wfst0 <- workflow_set(preproc = list(smote_rec=smote_rec),models = list(knn=knn_tune))
wfst1 <- workflow_set(
preproc = list(no_prep = no_prep,
smote_rec = smote_rec),
models = list(knn = knn_tune)
)
set.seed(123) # seed konsisten juga untuk SMOTE + tuning
knn_cv <- wfst0 %>%
workflow_map(fn = "tune_grid",
verbose = TRUE,
resamples = folds,
grid = knn_grid,
metrics = metric_set(accuracy),
control = control_resamples(save_pred = TRUE))
## i 1 of 1 tuning: smote_rec_knn
## ✔ 1 of 1 tuning: smote_rec_knn (11.6s)
collect_metrics(knn_cv)
## # A tibble: 49 × 9
## wflow_id .config preproc model .metric .estimator mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 smote_rec_knn Preproces… recipe near… accura… multiclass 0.709 10 0.0102
## 2 smote_rec_knn Preproces… recipe near… accura… multiclass 0.724 10 0.0119
## 3 smote_rec_knn Preproces… recipe near… accura… multiclass 0.724 10 0.0125
## 4 smote_rec_knn Preproces… recipe near… accura… multiclass 0.738 10 0.0122
## 5 smote_rec_knn Preproces… recipe near… accura… multiclass 0.735 10 0.0131
## 6 smote_rec_knn Preproces… recipe near… accura… multiclass 0.747 10 0.0133
## 7 smote_rec_knn Preproces… recipe near… accura… multiclass 0.749 10 0.0134
## 8 smote_rec_knn Preproces… recipe near… accura… multiclass 0.743 10 0.0103
## 9 smote_rec_knn Preproces… recipe near… accura… multiclass 0.741 10 0.0102
## 10 smote_rec_knn Preproces… recipe near… accura… multiclass 0.741 10 0.00962
## # ℹ 39 more rows
# ambil hasil tuning
knn_res <- knn_cv %>%
workflowsets::extract_workflow_set_result(id = "smote_rec_knn")
# pilih parameter terbaik berdasarkan metric (misal accuracy)
best_params <- select_best(knn_res, metric = "accuracy")
best_params
## # A tibble: 1 × 2
## neighbors .config
## <int> <chr>
## 1 35 Preprocessor1_Model34
confussion_matrix_cv <- knn_res %>%
conf_mat_resampled(parameters = best_params, tidy = FALSE)
autoplot(confussion_matrix_cv, type = "heatmap") +
scale_fill_viridis_c(direction = -1, option = "inferno", alpha = 0.6)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
confussion_matrix_cv %>%
summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.758
## 2 kap multiclass 0.501
## 3 sens macro 0.691
## 4 spec macro 0.856
## 5 ppv macro 0.611
## 6 npv macro 0.841
## 7 mcc multiclass 0.503
## 8 j_index macro 0.547
## 9 bal_accuracy macro 0.773
## 10 detection_prevalence macro 0.333
## 11 precision macro 0.611
## 12 recall macro 0.691
## 13 f_meas macro 0.636
# pilih parameter terbaik langsung dari knn_res
best_params <- knn_res %>%
select_best(metric = "accuracy")
best_params
## # A tibble: 1 × 2
## neighbors .config
## <int> <chr>
## 1 35 Preprocessor1_Model34
best_params_1se <- knn_res %>%
select_by_one_std_err(metric = "accuracy", desc(neighbors))
best_params_1se
## # A tibble: 1 × 2
## neighbors .config
## <int> <chr>
## 1 50 Preprocessor1_Model49
# --- Ambil hasil tuning ---
neighbors_result <- knn_res %>%
collect_metrics() %>%
filter(.metric == "accuracy")
# --- Ambil best & 1SE ---
best_neighbors <- knn_res %>%
select_best(metric = "accuracy")
best_neighbors_1se <- knn_res %>%
select_by_one_std_err(metric = "accuracy", desc(neighbors))
# --- Plot ---
neighbors_result %>%
ggplot(aes(x = neighbors, y = mean)) +
geom_errorbar(aes(ymin = mean - std_err,
ymax = mean + std_err),
width = 0.2, alpha = 0.6) +
geom_point(size = 2, color = "black") +
geom_vline(aes(xintercept = best_neighbors$neighbors,
color = "Highest"),
linetype = "dashed", linewidth = 0.8) +
geom_vline(aes(xintercept = best_neighbors_1se$neighbors,
color = "1-SE-Rule"),
linetype = "dashed", linewidth = 0.8) +
ylab("Accuracy") +
xlab("Jumlah Neighbors (k)") +
scale_x_continuous(n.breaks = 12) +
scale_color_manual(values = c("Highest" = "#03A9F4",
"1-SE-Rule" = "#f44e03"),
name = "Selection") +
theme_bw() +
theme(legend.position = "top")
neighbors_result %>%
# mutate(neighbors = factor(neighbors)) %>%
ggplot(aes(x=neighbors, y=mean)) +
geom_errorbar(aes(ymin=(mean-std_err),
ymax=(mean+std_err)))+
geom_point()+
geom_vline(aes(xintercept = best_neighbors$neighbors,color="Highest"),
linetype="dashed",linewidth=0.8)+
geom_vline(aes(xintercept = best_neighbors_1se$neighbors,
color="1-SE-Rule"
),
linetype="dashed",linewidth=0.8)+
ylab("Accuracy") +
scale_x_continuous(n.breaks = 12)+
scale_color_manual(values = c("#03A9F4","#f44e03"),
breaks = c("Highest","1-SE-Rule"),
name = "Selection"
)+
theme_bw()+
theme(legend.position = "top")
# --- Finalisasi workflow dengan parameter terbaik ---
final_wf <- workflow() %>%
add_model(knn_tune %>% finalize_model(best_params)) %>%
add_recipe(smote_rec)
# Fit ulang ke seluruh data training
final_fit <- final_wf %>% fit(data = df)
Prediksi data tidak terakreditasi
TTA
## # A tibble: 2 × 6
## Provinsi Y Lit_2023 Num_2023 Lit_2024 Num_2024
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Prov. Banten Tidak Terakreditasi 66.7 66.7 0 100
## 2 Prov. Banten Tidak Terakreditasi 25 18.8 4.54 25
TTA_new <- TTA %>% select(-Y)
# --- Prediksi data baru ---
prediksi <- predict(final_fit, new_data = TTA_new)
# Gabungkan dengan data baru
hasil_pred <- TTA %>%
bind_cols(prediksi %>% rename(prediksi_knn_cv_SMOTE = .pred_class))
hasil_pred
## # A tibble: 2 × 7
## Provinsi Y Lit_2023 Num_2023 Lit_2024 Num_2024 prediksi_knn_cv_SMOTE
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 Prov. Banten Tidak … 66.7 66.7 0 100 C
## 2 Prov. Banten Tidak … 25 18.8 4.54 25 C