Outline - Eksplorasi Berdasarkan provinsi - Cek distribusi kelas peringkat - Modeling K-NN (K-fold cross validation) - Imbalance (SMOTE dan tidak SMOTE) - Perbandingan analisis SMOTE dan tidak

SMOTE dan tidak SMOTE SPLIT DATA (HOLD OUT dan K-FOLD)

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.0.2     ✔ tibble    3.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.2
## ✔ recipes      1.1.0     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

library(DataExplorer)

## Warning: package 'DataExplorer' was built under R version 4.4.3

library(kknn)

## Warning: package 'kknn' was built under R version 4.4.3

Import Data

df <- read_excel("C:/Users/nabil/Downloads/Pemodelan Klasifikasi/Data Tugas-2.xlsx")
str(df)

## tibble [6,648 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Nomor               : num [1:6648] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Provinsi            : chr [1:6648] "Prov. Jawa Timur" "Prov. Nusa Tenggara Timur" "Prov. Jawa Timur" "Prov. Sumatera Utara" ...
##  $ Peringkat Akreditasi: chr [1:6648] "A" "B" "A" "A" ...
##  $ Lit_2023            : num [1:6648] 20 80 91.1 57.8 55.6 ...
##  $ Num_2023            : num [1:6648] 40 48.8 82.2 57.8 66.7 ...
##  $ Lit_2024            : num [1:6648] 0 82.2 93.3 64.4 83.3 ...
##  $ Num_2024            : num [1:6648] 0 62.8 93.3 62.2 66.7 ...

df <- df %>% 
  select(-Nomor)
glimpse(df)

## Rows: 6,648
## Columns: 6
## $ Provinsi               <chr> "Prov. Jawa Timur", "Prov. Nusa Tenggara Timur"…
## $ `Peringkat Akreditasi` <chr> "A", "B", "A", "A", "B", "A", "C", "B", "A", "C…
## $ Lit_2023               <dbl> 20.00, 80.00, 91.11, 57.78, 55.56, 51.11, 56.52…
## $ Num_2023               <dbl> 40.00, 48.78, 82.22, 57.78, 66.67, 51.11, 61.90…
## $ Lit_2024               <dbl> 0.000, 82.222, 93.333, 64.444, 83.333, 71.795, …
## $ Num_2024               <dbl> 0.000, 62.791, 93.333, 62.222, 66.667, 55.556, …

Filter provinsi DKI, DIY, Bali, Banten

df <- df %>%
  filter(Provinsi %in% c("Prov. D.K.I. Jakarta",
                         "Prov. D.I. Yogyakarta",
                         "Prov. Bali",
                         "Prov. Banten"))

# cek hasil
df %>% count(Provinsi)

## # A tibble: 4 × 2
##   Provinsi                  n
##   <chr>                 <int>
## 1 Prov. Bali               66
## 2 Prov. Banten            394
## 3 Prov. D.I. Yogyakarta   102
## 4 Prov. D.K.I. Jakarta    229

# hitung jumlah per provinsi
prov_count <- df %>%
  count(Provinsi)

Eksplorasi Data

Jumlah sekolah per provinsi

# pie chart dengan label jumlah
ggplot(prov_count, aes(x = "", y = n, fill = Provinsi)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(aes(label = n),
            position = position_stack(vjust = 0.5),
            color = "black",
            size = 4) +
   scale_fill_brewer(palette = "Oranges") +
  labs(title = "Jumlah Sekolah per Provinsi",
       fill = "Provinsi") +
  theme_void()

summary(df[, c("Lit_2023","Num_2023","Lit_2024","Num_2024")])

##     Lit_2023         Num_2023         Lit_2024         Num_2024     
##  Min.   :  0.00   Min.   :  6.67   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 72.78   1st Qu.: 65.89   1st Qu.: 77.78   1st Qu.: 71.11  
##  Median : 91.11   Median : 83.33   Median : 93.33   Median : 87.81  
##  Mean   : 81.80   Mean   : 77.17   Mean   : 83.46   Mean   : 80.11  
##  3rd Qu.: 97.78   3rd Qu.: 93.25   3rd Qu.: 97.78   3rd Qu.: 95.56  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00

df <- df %>% 
      mutate(across(where(is.character),as.factor))

Cek Missing Value

plot_intro(df,theme_config = theme_classic())

Ubah Kolom “Peringkat Akreditasi” jadi Y biar lebih gampang

df <- df %>%
  rename(Y = `Peringkat Akreditasi`)

Barplot Frekuensi Peringkat Akreditasi

ggplot(df, aes(x = Y, fill = Y)) +
  geom_bar() +
  labs(title = "Frekuensi Peringkat Akreditasi",
       x = "Peringkat Akreditasi",
       y = "Frekuensi") +
  scale_fill_brewer(palette = "Oranges") +
  theme_classic()

# Hitung frekuensi + proporsi
freq_tab <- df %>%
  count(Y) %>%
  mutate(prop = n / sum(n) * 100)

# Plot
ggplot(freq_tab, aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  geom_text(aes(label = paste0(n, " (", round(prop,1), "%)")),
            vjust = -0.5, size = 4) +
  labs(title = "Frekuensi Peringkat Akreditasi",
       x = "Peringkat Akreditasi",
       y = "Frekuensi") +
  scale_fill_brewer(palette = "Oranges") +
  theme_classic()

hapus observasi pada kelas “Tidak Terakreditasi” karena terlalu sedikit

TTA <- df %>%
  filter(Y == "Tidak Terakreditasi")
TTA

## # A tibble: 2 × 6
##   Provinsi     Y                   Lit_2023 Num_2023 Lit_2024 Num_2024
##   <fct>        <fct>                  <dbl>    <dbl>    <dbl>    <dbl>
## 1 Prov. Banten Tidak Terakreditasi     66.7     66.7     0         100
## 2 Prov. Banten Tidak Terakreditasi     25       18.8     4.54       25

df <- df %>%
  filter(Y != "Tidak Terakreditasi") %>%
  droplevels()

# Hitung frekuensi + proporsi
freq_tab <- df %>%
  count(Y) %>%
  mutate(prop = n / sum(n) * 100)

# Plot
ggplot(freq_tab, aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  geom_text(aes(label = paste0(n, " (", round(prop,1), "%)")),
            vjust = -0.5, size = 4) +
  labs(title = "Frekuensi Peringkat Akreditasi",
       x = "Peringkat Akreditasi",
       y = "Frekuensi") +
  scale_fill_brewer(palette = "Oranges") +
  theme_classic()

Boxplot sebaran nilai peubah terhadap kelas Y

plot_boxplot(data = df,
             by = "Y",
             geom_boxplot_args = list(fill="#03A9F4"),
             ggtheme = theme_classic())

sama saja dengan di atas

df_long <- df %>%
  pivot_longer(cols = c(Lit_2023, Num_2023, Lit_2024, Num_2024),
               names_to = "Variabel",
               values_to = "Skor")

ggplot(df_long, aes(x = Y, y = Skor, fill = Y)) +
  geom_boxplot() +
  facet_wrap(~ Variabel, scales = "free_y") +
   scale_fill_brewer(palette = "Oranges") +
  labs(title = "Distribusi Nilai Literasi & Numerasi per Akreditasi",
       x = "Akreditasi", y = "Skor") +
  theme_minimal()

Kelas A B C nilai masing-masing peubah terlihat berbeda rentang nilainya yaitu A lebih tinggi dari B C, tapi A banyak outlier

Proporsi peringkat akreditasi per provinsi

df %>%
  count(Provinsi, Y) %>%
  ggplot(aes(x = Provinsi, y = n, fill = Y)) +
  geom_col(position = "fill") +
  labs(title = "Proporsi Peringkat Akreditasi per Provinsi",
       x = "Provinsi", y = "Proporsi") +
    theme_classic() + 
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  scale_fill_brewer(palette = "Oranges") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Frekuensi Peringkat Akreditasi per Provinsi

df %>%
  count(Provinsi, Y) %>%
  arrange(Provinsi, desc(n)) %>%
  ggplot(aes(x = Provinsi, y = n, fill = Y)) +
  geom_col(position = "stack") +
  scale_fill_brewer(palette = "Oranges") +
  labs(title = "Distribusi Peringkat Akreditasi per Provinsi",
       x = "Provinsi",
       y = "Frekuensi") +
  theme_minimal() +  # background putih polos, no grid
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        panel.background = element_blank(),
        plot.background = element_blank(),
        legend.background = element_blank())

Korelasi masing-masing variabel

library(GGally)

## Warning: package 'GGally' was built under R version 4.4.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

df %>%
  select(Lit_2023, Num_2023, Lit_2024, Num_2024) %>%
  ggpairs()

library(GGally)
library(ggplot2)

df %>%
  select(Lit_2023, Num_2023, Lit_2024, Num_2024) %>%
  ggpairs(
    aes(alpha = 0.6),
    upper = list(continuous = wrap("cor", size = 4, stars = FALSE)),  # hanya angka korelasi
    lower = list(continuous = wrap("smooth", 
                                   color = "black", alpha = 0.6, se = FALSE)), 
    diag  = list(continuous = wrap("densityDiag", 
                                   fill = "orange", alpha = 0.6))
  ) +
  theme_classic()

df %>%
  summarise(across(c(Lit_2023, Num_2023, Lit_2024, Num_2024), mean, na.rm = TRUE))

## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(c(Lit_2023, Num_2023, Lit_2024, Num_2024), mean, na.rm =
##   TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

## # A tibble: 1 × 4
##   Lit_2023 Num_2023 Lit_2024 Num_2024
##      <dbl>    <dbl>    <dbl>    <dbl>
## 1     81.9     77.3     83.7     80.2

df %>%
  summarise(Lit_2023 = mean(Lit_2023, na.rm=TRUE),
            Lit_2024 = mean(Lit_2024, na.rm=TRUE),
            Num_2023 = mean(Num_2023, na.rm=TRUE),
            Num_2024 = mean(Num_2024, na.rm=TRUE)) %>%
  tidyr::pivot_longer(everything(), names_to = "Tahun", values_to = "Rata2") %>%
  ggplot(aes(x = Tahun, y = Rata2, fill = Tahun)) +
  geom_col() +
   scale_fill_brewer(palette = "Oranges") +
  labs(title = "Rata-rata Literasi & Numerasi (2023 vs 2024)",
       x = "", y = "Rata-rata") +
  theme_minimal()

library(ggplot2)
library(dplyr)
library(tidyr)

df %>%
  select(Lit_2023, Num_2023, Lit_2024, Num_2024) %>%
  pivot_longer(cols = everything(), 
               names_to = "Variabel", 
               values_to = "Nilai") %>%
  ggplot(aes(x = Variabel, y = Nilai, fill = Variabel)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_brewer(palette = "Oranges") +
  labs(title = "Boxplot Literasi & Numerasi (2023 vs 2024)",
       x = "Variabel", y = "Nilai") +
  theme_minimal()

# Misal untuk variabel Lit_2023
Q1 <- quantile(df$Lit_2023, 0.25, na.rm = TRUE)
Q3 <- quantile(df$Lit_2023, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1

outlier_Lit2023 <- df %>%
  filter(Lit_2023 < (Q1 - 1.5 * IQR) | Lit_2023 > (Q3 + 1.5 * IQR))

outlier_Lit2023

## # A tibble: 45 × 6
##    Provinsi     Y     Lit_2023 Num_2023 Lit_2024 Num_2024
##    <fct>        <fct>    <dbl>    <dbl>    <dbl>    <dbl>
##  1 Prov. Banten C         4.35     34.8     0         0  
##  2 Prov. Banten C        20        36      20        38  
##  3 Prov. Banten B         0        16.7    78.6      64.3
##  4 Prov. Banten C        33.3      33.3    50        44.4
##  5 Prov. Banten C        31.4      28.6    55.3      63.2
##  6 Prov. Banten C        30        35      26.3      15.8
##  7 Prov. Banten B        23.1      38.5     7.69     15.4
##  8 Prov. Banten B        33.3      35.6    49.0      62.2
##  9 Prov. Banten C        16.1      26.5    38.9      27.8
## 10 Prov. Banten C        31.1      35.6    24.4      26.7
## # ℹ 35 more rows

outliers <- df %>%
  pivot_longer(cols = c(Lit_2023, Num_2023, Lit_2024, Num_2024),
               names_to = "Variabel", values_to = "Nilai") %>%
  group_by(Variabel) %>%
  mutate(Q1 = quantile(Nilai, 0.25, na.rm = TRUE),
         Q3 = quantile(Nilai, 0.75, na.rm = TRUE),
         IQR = Q3 - Q1,
         Outlier = ifelse(Nilai < Q1 - 1.5*IQR | Nilai > Q3 + 1.5*IQR, TRUE, FALSE)) %>%
  filter(Outlier == TRUE)

outliers

## # A tibble: 173 × 8
## # Groups:   Variabel [4]
##    Provinsi     Y     Variabel Nilai    Q1    Q3   IQR Outlier
##    <fct>        <fct> <chr>    <dbl> <dbl> <dbl> <dbl> <lgl>  
##  1 Prov. Banten C     Lit_2023  4.35  73.3  97.8  24.5 TRUE   
##  2 Prov. Banten C     Lit_2024  0     77.8  97.8  20   TRUE   
##  3 Prov. Banten C     Num_2024  0     71.1  95.6  24.4 TRUE   
##  4 Prov. Banten C     Lit_2024 15.8   77.8  97.8  20   TRUE   
##  5 Prov. Banten C     Num_2024 21.1   71.1  95.6  24.4 TRUE   
##  6 Prov. Banten C     Lit_2023 20     73.3  97.8  24.5 TRUE   
##  7 Prov. Banten C     Lit_2024 20     77.8  97.8  20   TRUE   
##  8 Prov. Banten C     Lit_2024 22.2   77.8  97.8  20   TRUE   
##  9 Prov. Banten B     Lit_2023  0     73.3  97.8  24.5 TRUE   
## 10 Prov. Banten B     Num_2023 16.7   66.7  93.3  26.7 TRUE   
## # ℹ 163 more rows

df_avg <- df %>%
  rowwise() %>%
  mutate(Rata_All = mean(c_across(c(Lit_2023, Num_2023, Lit_2024, Num_2024)), na.rm = TRUE)) %>%
  ungroup()

# Buat rata-rata per observasi dari semua variabel
df_avg <- df %>%
  mutate(Avg_Skor = rowMeans(select(., c(Lit_2023, Num_2023, Lit_2024, Num_2024)), na.rm = TRUE))

# Boxplot per Akreditasi
ggplot(df_avg, aes(x = Y, y = Avg_Skor, fill = Y)) +
  geom_boxplot(alpha = 0.7) +
  # Tambahkan titik mean per Akreditasi
  stat_summary(fun = mean, geom = "point", 
               shape = 18, size = 4, color = "red") +
  # Tambahkan label angka mean
  stat_summary(fun = mean, geom = "text", 
               aes(label = round(..y.., 1)), 
               vjust = -1, color = "black") +
  scale_fill_brewer(palette = "Oranges") +
  labs(title = "Distribusi Rata-Rata Skor Literasi & Numerasi per Akreditasi",
       x = "Akreditasi", y = "Rata-Rata Skor") +
  theme_minimal()

## Warning: The dot-dot notation (`..y..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(y)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(dplyr)

anomali <- df_avg %>%
  group_by(Y) %>%
  mutate(Q1 = quantile(Avg_Skor, 0.25, na.rm = TRUE),
         Q3 = quantile(Avg_Skor, 0.75, na.rm = TRUE),
         IQR = Q3 - Q1,
         Lower = Q1 - 1.5 * IQR,
         Upper = Q3 + 1.5 * IQR,
         Outlier = ifelse(Avg_Skor < Lower | Avg_Skor > Upper, TRUE, FALSE)) %>%
  filter(Outlier == TRUE)

anomali

## # A tibble: 29 × 13
## # Groups:   Y [2]
##    Provinsi Y     Lit_2023 Num_2023 Lit_2024 Num_2024 Avg_Skor    Q1    Q3   IQR
##    <fct>    <fct>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl> <dbl> <dbl> <dbl>
##  1 Prov. B… A         80       75.6     62.2     53.3     67.8  86.8  96.7  9.82
##  2 Prov. B… A         55.6     53.3     88.9     80       69.4  86.8  96.7  9.82
##  3 Prov. B… A         66.7     66.7     75.6     75.6     71.1  86.8  96.7  9.82
##  4 Prov. B… A         77.3     50       90       60       69.3  86.8  96.7  9.82
##  5 Prov. B… A         84.4     73.3     64.4     64.4     71.7  86.8  96.7  9.82
##  6 Prov. B… A         66.7     68.9     75.6     68.9     70.0  86.8  96.7  9.82
##  7 Prov. B… A         62.2     60       68.9     60.9     63.0  86.8  96.7  9.82
##  8 Prov. B… A         68.9     60       62.2     91.1     70.6  86.8  96.7  9.82
##  9 Prov. B… A         44.4     72.2     95.2     61.9     68.5  86.8  96.7  9.82
## 10 Prov. D… A         75       60.7     75       67.9     69.6  86.8  96.7  9.82
## # ℹ 19 more rows
## # ℹ 3 more variables: Lower <dbl>, Upper <dbl>, Outlier <lgl>

library(dplyr)
library(tidyr)
library(ggplot2)

# Ubah ke long format
df_long <- df %>%
  pivot_longer(cols = c(Lit_2023, Num_2023, Lit_2024, Num_2024),
               names_to = "Variabel", values_to = "Nilai")

# Hitung rata-rata tiap variabel
mean_df <- df_long %>%
  group_by(Variabel) %>%
  summarise(Rata2 = mean(Nilai, na.rm = TRUE), .groups = "drop")

# Plot boxplot + titik rata-rata + label nilai
ggplot(df_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
  geom_boxplot(alpha = 0.7) +
  geom_point(data = mean_df, aes(x = Variabel, y = Rata2),
             color = "red", size = 3) +
  geom_text(data = mean_df, aes(x = Variabel, y = Rata2,
                                label = round(Rata2, 1)),
            vjust = -1, color = "red", fontface = "bold") +
  scale_fill_brewer(palette = "Oranges") +
  labs(title = "Boxplot Literasi & Numerasi dengan Rata-rata",
       x = "", y = "Nilai") +
  theme_minimal()

library(ggplot2)
library(dplyr)
library(tidyr)

# pivot ke long format
df_long <- df %>%
  pivot_longer(cols = c(Lit_2023, Num_2023, Lit_2024, Num_2024),
               names_to = "Variabel",
               values_to = "Skor")

# plot density
ggplot(df_long, aes(x = Skor, fill = Y, color = Y)) +
  geom_density(alpha = 0.3) +
  facet_wrap(~ Variabel, scales = "free") +
  labs(title = "Distribusi Density Skor Literasi & Numerasi per Akreditasi",
       x = "Skor", y = "Density") +
  theme_minimal()

library(corrplot)

## corrplot 0.95 loaded

cor_mat <- df %>%
  select(Lit_2023, Num_2023, Lit_2024, Num_2024) %>%
  cor(use = "complete.obs")

corrplot(cor_mat, method = "color", type = "upper", 
         addCoef.col = "white", number.cex = 0.7, 
         tl.col = "black", tl.srt = 45)

KNN

untuk analisis klasifikasi kolom Provinsi tidak digunakan sebagai variabel prediktor

df<-df %>% 
select(-Provinsi)

Holdout - Imbalance dan Tanpa Imbalance

1.Split data → imbal vs stratified. 2.Cross-validation di training. 3.SMOTE hanya di train. 4.KNN dengan normalisasi. 5.Evaluasi model (Accuracy, Balanced Accuracy, F1, AUC, confusion matrix).

# === 0. Library ===
library(tidymodels)
library(themis)       # untuk SMOTE / handling imbalance

## Warning: package 'themis' was built under R version 4.4.3

library(dplyr)
library(ggplot2)

set.seed(123)

# === 1. Holdout Split ===
# a) tanpa stratifikasi (imbalance bisa muncul)
split_imbal <- initial_split(df, prop = 0.8) 
train_imbal <- training(split_imbal)
test_imbal  <- testing(split_imbal)

# b) dengan stratifikasi (lebih seimbang)
split_strata <- initial_split(df, prop = 0.8, strata = Y)
train_strata <- training(split_strata)
test_strata  <- testing(split_strata)

# Cek distribusi kelas
table(train_imbal$Y)

## 
##   A   B   C 
## 430 161  40

table(test_imbal$Y)

## 
##   A   B   C 
## 112  40   6

table(train_strata$Y)

## 
##   A   B   C 
## 436 161  34

table(test_strata$Y)

## 
##   A   B   C 
## 106  40  12

# === 2. Cross Validation pada training (pakai stratified split lebih disarankan) ===
folds <- vfold_cv(train_strata, v = 10, strata = Y)

# === 3. Recipe (preprocessing + balancing) ===
knn_recipe <- recipe(Y ~ ., data = train_strata) %>%
  step_normalize(all_numeric_predictors()) %>%   # normalisasi penting untuk KNN
  step_smote(Y)                                  # oversampling dengan SMOTE (train saja)

no_prep <- recipe(Y ~ ., data = train_strata)

# === 4. Model ===
knn_spec <- nearest_neighbor(neighbors = 7, weight_func = "rectangular", dist_power = 2) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# === 5. Workflow ===
knn_wf <- workflow() %>%
  add_model(knn_spec) %>%
  add_recipe(knn_recipe)

knn_wff<- workflow() %>%
  add_model(knn_spec) %>%
  add_recipe(no_prep)

# === 6. Latih model dengan cross-validation SMOTE ===
knn_res <- fit_resamples(
  knn_wf,
  resamples = folds,
  metrics = metric_set(accuracy, recall, precision, f_meas,bal_accuracy, roc_auc, f_meas)
)

collect_metrics(knn_res)

## # A tibble: 6 × 6
##   .metric      .estimator  mean     n std_err .config             
##   <chr>        <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy     multiclass 0.727    10  0.0179 Preprocessor1_Model1
## 2 bal_accuracy macro      0.737    10  0.0257 Preprocessor1_Model1
## 3 f_meas       macro      0.577    20  0.0217 Preprocessor1_Model1
## 4 precision    macro      0.564    10  0.0275 Preprocessor1_Model1
## 5 recall       macro      0.626    10  0.0430 Preprocessor1_Model1
## 6 roc_auc      hand_till  0.794    10  0.0264 Preprocessor1_Model1

# === 6. Latih model dengan cross-validation TIDAK SMOTE ===
knn_ress <- fit_resamples(
  knn_wff,
  resamples = folds,
  metrics = metric_set(accuracy, recall, precision, f_meas,bal_accuracy, roc_auc, f_meas)
)

collect_metrics(knn_ress)

## # A tibble: 6 × 6
##   .metric      .estimator  mean     n std_err .config             
##   <chr>        <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy     multiclass 0.791    10  0.0114 Preprocessor1_Model1
## 2 bal_accuracy macro      0.678    10  0.0149 Preprocessor1_Model1
## 3 f_meas       macro      0.538    20  0.0188 Preprocessor1_Model1
## 4 precision    macro      0.603    10  0.0441 Preprocessor1_Model1
## 5 recall       macro      0.524    10  0.0227 Preprocessor1_Model1
## 6 roc_auc      hand_till  0.769    10  0.0224 Preprocessor1_Model1

# === 7. Final fit pada full training (SMOTE applied) ===
final_knn_smote <- fit(knn_wf, data = train_strata)

# === 7. Final fit pada full training ( NO SMOTE) ===
final_knn_nosmote <- fit(knn_wff, data = train_strata)

# === 8. Evaluasi di test set (SMOTE) ===
preds_smote <- predict(final_knn_smote, test_strata, type = "prob") %>%
  bind_cols(predict(final_knn_smote, test_strata)) %>%
  bind_cols(test_strata %>% select(Y))

# === 8. Evaluasi di test set (NO SMOTE) ===
preds_nosmote <- predict(final_knn_nosmote, test_strata, type = "prob") %>%
  bind_cols(predict(final_knn_nosmote, test_strata)) %>%
  bind_cols(test_strata %>% select(Y))

# con matrix SMOTE
confussion_matrix_smote <- preds_smote %>%
                      conf_mat(truth=Y,estimate=.pred_class)
autoplot(confussion_matrix_smote,type = "heatmap")+
  scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

preds_smote %>%
  conf_mat(truth=Y,estimate=.pred_class) %>% 
  summary()

## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             multiclass     0.747
##  2 kap                  multiclass     0.500
##  3 sens                 macro          0.677
##  4 spec                 macro          0.857
##  5 ppv                  macro          0.615
##  6 npv                  macro          0.841
##  7 mcc                  multiclass     0.503
##  8 j_index              macro          0.534
##  9 bal_accuracy         macro          0.767
## 10 detection_prevalence macro          0.333
## 11 precision            macro          0.615
## 12 recall               macro          0.677
## 13 f_meas               macro          0.636

#metrik evaluasi NO SMOTE
confussion_matrix_nosmote<- preds_nosmote %>%
                      conf_mat(truth=Y,estimate=.pred_class)
autoplot(confussion_matrix_nosmote,type = "heatmap")+
  scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

preds_nosmote %>%
  conf_mat(truth=Y,estimate=.pred_class) %>% 
  summary()

## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             multiclass     0.804
##  2 kap                  multiclass     0.549
##  3 sens                 macro          0.587
##  4 spec                 macro          0.849
##  5 ppv                  macro          0.706
##  6 npv                  macro          0.897
##  7 mcc                  multiclass     0.561
##  8 j_index              macro          0.436
##  9 bal_accuracy         macro          0.718
## 10 detection_prevalence macro          0.333
## 11 precision            macro          0.706
## 12 recall               macro          0.587
## 13 f_meas               macro          0.619

Prediksi Data Baru

Prediksi data tidak terakreditasi

TTA

## # A tibble: 2 × 6
##   Provinsi     Y                   Lit_2023 Num_2023 Lit_2024 Num_2024
##   <fct>        <fct>                  <dbl>    <dbl>    <dbl>    <dbl>
## 1 Prov. Banten Tidak Terakreditasi     66.7     66.7     0         100
## 2 Prov. Banten Tidak Terakreditasi     25       18.8     4.54       25

TTA_new <- TTA %>% select(-Y)

# Prediksi ke data baru


pred_TTA <- predict(final_knn_smote, new_data = TTA_new, type = "prob") %>%
  bind_cols(predict(final_knn_smote, new_data = TTA_new)) %>%
  bind_cols(TTA_new)

pred_TTA

## # A tibble: 2 × 9
##   .pred_A .pred_B .pred_C .pred_class Provinsi     Lit_2023 Num_2023 Lit_2024
##     <dbl>   <dbl>   <dbl> <fct>       <fct>           <dbl>    <dbl>    <dbl>
## 1       0   0.714   0.286 B           Prov. Banten     66.7     66.7     0   
## 2       0   0       1     C           Prov. Banten     25       18.8     4.54
## # ℹ 1 more variable: Num_2024 <dbl>

prediksi <- predict(final_knn_smote, new_data = TTA_new)

# Gabungkan dengan data baru
hasil_pred <- TTA %>%
  bind_cols(prediksi %>% rename(prediksi_knn_Hold_SMOTE = .pred_class))

hasil_pred

## # A tibble: 2 × 7
##   Provinsi     Y      Lit_2023 Num_2023 Lit_2024 Num_2024 prediksi_knn_Hold_SM…¹
##   <fct>        <fct>     <dbl>    <dbl>    <dbl>    <dbl> <fct>                 
## 1 Prov. Banten Tidak…     66.7     66.7     0         100 B                     
## 2 Prov. Banten Tidak…     25       18.8     4.54       25 C                     
## # ℹ abbreviated name: ¹prediksi_knn_Hold_SMOTE

# --- Prediksi data baru ---
prediksi <- predict(final_knn_nosmote, new_data = TTA_new)

# Gabungkan dengan data baru
hasil_pred <- TTA %>%
  bind_cols(prediksi %>% rename(prediksi_knn_Hold_noSMOTE = .pred_class))

hasil_pred

## # A tibble: 2 × 7
##   Provinsi     Y      Lit_2023 Num_2023 Lit_2024 Num_2024 prediksi_knn_Hold_no…¹
##   <fct>        <fct>     <dbl>    <dbl>    <dbl>    <dbl> <fct>                 
## 1 Prov. Banten Tidak…     66.7     66.7     0         100 B                     
## 2 Prov. Banten Tidak…     25       18.8     4.54       25 B                     
## # ℹ abbreviated name: ¹prediksi_knn_Hold_noSMOTE

# === Visualisasi Distribusi Kelas ===

# 1. Train Asli (belum di-SMOTE)
p_train <- train_strata %>%
  count(Y) %>%
  ggplot(aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  labs(title = "Distribusi Y - Train Asli", x = "Kelas", y = "Jumlah") +
  scale_fill_brewer(palette = "Oranges") +
  theme_minimal()

# 2. Test Set (selalu asli, tidak di-SMOTE)
p_test <- test_strata %>%
  count(Y) %>%
  ggplot(aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  labs(title = "Distribusi Y - Test Set", x = "Kelas", y = "Jumlah") +
  scale_fill_brewer(palette = "Oranges") +
  theme_minimal()

# 3. Train hasil SMOTE
smote_recipe <- recipe(Y ~ ., data = train_strata) %>%
  step_smote(Y)

smote_prep <- prep(smote_recipe)
train_smote <- juice(smote_prep)

p_smote <- train_smote %>%
  count(Y) %>%
  ggplot(aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  labs(title = "Distribusi Y - Train Hasil SMOTE", x = "Kelas", y = "Jumlah") +
  scale_fill_brewer(palette = "Oranges") +
  theme_minimal()

# 4. Gabungkan plot
library(patchwork)
p_train + p_test + p_smote

# === Scatter Plot Train Asli vs SMOTE ===

# Gabungkan data asli dan hasil SMOTE
plot_data <-
  train_strata %>% mutate(type = "Asli")

ggplot(plot_data, aes(x = Lit_2023, y = Num_2023, color = Y, shape = type)) +
  geom_point(alpha = 0.5, size = 1.5) +
  labs(
    title = "Visualisasi Data Asli",
    subtitle = "Distribusi data sebelum balancing",
    x = "Literasi 2023",
    y = "Numerasi 2023"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Dark2")

# === Scatter Plot Train Asli vs SMOTE ===

# Gabungkan data asli dan hasil SMOTE
plot_data <- bind_rows(
  train_strata %>% mutate(type = "Asli"),
  train_smote %>% mutate(type = "SMOTE")
)

ggplot(plot_data, aes(x = Lit_2023, y = Num_2023, color = Y, shape = type)) +
  geom_point(alpha = 0.5, size = 1.5) +
  labs(
    title = "Visualisasi Data Asli vs Hasil SMOTE",
    subtitle = "Perbandingan sebaran kelas setelah balancing",
    x = "Literasi 2023",
    y = "Numerasi 2023"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Dark2")

library(tidymodels)
library(themis)
library(dplyr)
library(ggplot2)
library(patchwork)

set.seed(123)

# === 1. Split dengan stratifikasi ===
split_strata <- initial_split(df, prop = 0.8, strata = Y)
train_strata <- training(split_strata)
test_strata  <- testing(split_strata)

# === 2. SMOTE pada train ===
smote_recipe <- recipe(Y ~ ., data = train_strata) %>%
  step_smote(Y)

smote_prep <- prep(smote_recipe)
train_smote <- juice(smote_prep)

# === 3. Bar plot distribusi kelas ===
p_train <- train_strata %>%
  count(Y) %>%
  ggplot(aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  labs(title = "Distribusi Y - Train Asli", x = "Kelas", y = "Jumlah") +
  scale_fill_brewer(palette = "Oranges") +
  theme_minimal()

p_test <- test_strata %>%
  count(Y) %>%
  ggplot(aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  labs(title = "Distribusi Y - Test Set", x = "Kelas", y = "Jumlah") +
  scale_fill_brewer(palette = "Oranges") +
  theme_minimal()

p_smote <- train_smote %>%
  count(Y) %>%
  ggplot(aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  labs(title = "Distribusi Y - Train Hasil SMOTE", x = "Kelas", y = "Jumlah") +
  scale_fill_brewer(palette = "Oranges") +
  theme_minimal()

bars <- p_train + p_test + p_smote

# === 4. Scatter plot Train Asli vs SMOTE ===
plot_data <- bind_rows(
  train_strata %>% mutate(type = "Asli"),
  train_smote %>% mutate(type = "SMOTE")
)

p_scatter <- ggplot(plot_data, aes(x = Lit_2023, y = Num_2023,
                                   color = Y, shape = type)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(
    title = "Sebaran Data Asli vs Hasil SMOTE",
    subtitle = "Literasi 2023 vs Numerasi 2023",
    x = "Literasi 2023",
    y = "Numerasi 2023"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Dark2")

# === 5. Gabungkan semua plot ===
final_plot <- bars / p_scatter
final_plot

library(yardstick)

# === 6. Final fit pada full training (dengan SMOTE) ===
final_knn <- fit(knn_wf, data = train_strata)

# === 7. Prediksi pada test set (tetap original) ===
preds <- predict(final_knn, test_strata, type = "prob") %>%
  bind_cols(predict(final_knn, test_strata)) %>%
  bind_cols(test_strata %>% select(Y))

# === 8. Confusion matrix ===
confmat <- conf_mat(preds, truth = Y, estimate = .pred_class)
confmat

##           Truth
## Prediction  A  B  C
##          A 92 14  1
##          B 13 21  0
##          C  3  7  7

# === 9. Visualisasi confusion matrix (heatmap) ===
p_confmat <- autoplot(confmat, type = "heatmap") +
  scale_fill_gradient(low = "white", high = "orange") +
  labs(title = "Confusion Matrix - KNN pada Test Set")

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

# === 10. Gabungkan dengan plot distribusi & scatter ===
final_plot_all <- (bars / p_scatter) / p_confmat
final_plot_all

K-Fold Cross Validation

library(themis)

no_prep <- recipe(Y~.,data=df)

set.seed(123)  # pastikan sama setiap running
folds <- vfold_cv(df, v = 10, strata = Y)

# -----------------------------
# DEFINISIKAN MODEL KNN
# -----------------------------
knn_tune <- nearest_neighbor(neighbors=tune(),weight_func="rectangular",dist_power = 2) %>% 
  set_engine("kknn") %>%
  set_mode("classification")

# -----------------------------
# MENENTUKAN NILAI HYPERPARAMETER YANG DIGUNAKAN
# -----------------------------
knn_grid <- grid_regular(neighbors(range = c(2,50)),levels= 50)

fitted_knn <- function(x){
  mod <- extract_fit_engine(x)
  fitted(mod)
}

smote_rec <- recipe(Y ~ ., data = df) %>%
  step_smote(Y)

wfst0 <- workflow_set(preproc = list(smote_rec=smote_rec),models = list(knn=knn_tune))

wfst1 <- workflow_set(
  preproc = list(no_prep = no_prep,
                 smote_rec = smote_rec),
  models  = list(knn = knn_tune)
)

set.seed(123)  # seed konsisten juga untuk SMOTE + tuning
knn_cv <- wfst0 %>% 
  workflow_map(fn = "tune_grid",
               verbose = TRUE,
               resamples = folds,
               grid = knn_grid,
               metrics = metric_set(accuracy),
               control = control_resamples(save_pred = TRUE))

## i 1 of 1 tuning:     smote_rec_knn

## ✔ 1 of 1 tuning:     smote_rec_knn (11.6s)

collect_metrics(knn_cv)

## # A tibble: 49 × 9
##    wflow_id      .config    preproc model .metric .estimator  mean     n std_err
##    <chr>         <chr>      <chr>   <chr> <chr>   <chr>      <dbl> <int>   <dbl>
##  1 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.709    10 0.0102 
##  2 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.724    10 0.0119 
##  3 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.724    10 0.0125 
##  4 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.738    10 0.0122 
##  5 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.735    10 0.0131 
##  6 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.747    10 0.0133 
##  7 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.749    10 0.0134 
##  8 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.743    10 0.0103 
##  9 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.741    10 0.0102 
## 10 smote_rec_knn Preproces… recipe  near… accura… multiclass 0.741    10 0.00962
## # ℹ 39 more rows

# ambil hasil tuning
knn_res <- knn_cv %>%
  workflowsets::extract_workflow_set_result(id = "smote_rec_knn")

# pilih parameter terbaik berdasarkan metric (misal accuracy)
best_params <- select_best(knn_res, metric = "accuracy")
best_params

## # A tibble: 1 × 2
##   neighbors .config              
##       <int> <chr>                
## 1        35 Preprocessor1_Model34

confussion_matrix_cv <- knn_res %>%
  conf_mat_resampled(parameters = best_params, tidy = FALSE)

autoplot(confussion_matrix_cv, type = "heatmap") +
  scale_fill_viridis_c(direction = -1, option = "inferno", alpha = 0.6)

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

confussion_matrix_cv %>% 
  summary()

## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             multiclass     0.758
##  2 kap                  multiclass     0.501
##  3 sens                 macro          0.691
##  4 spec                 macro          0.856
##  5 ppv                  macro          0.611
##  6 npv                  macro          0.841
##  7 mcc                  multiclass     0.503
##  8 j_index              macro          0.547
##  9 bal_accuracy         macro          0.773
## 10 detection_prevalence macro          0.333
## 11 precision            macro          0.611
## 12 recall               macro          0.691
## 13 f_meas               macro          0.636

# pilih parameter terbaik langsung dari knn_res
best_params <- knn_res %>%
  select_best(metric = "accuracy")

best_params

## # A tibble: 1 × 2
##   neighbors .config              
##       <int> <chr>                
## 1        35 Preprocessor1_Model34

best_params_1se <- knn_res %>%
  select_by_one_std_err(metric = "accuracy", desc(neighbors))

best_params_1se

## # A tibble: 1 × 2
##   neighbors .config              
##       <int> <chr>                
## 1        50 Preprocessor1_Model49

# --- Ambil hasil tuning ---
neighbors_result <- knn_res %>% 
  collect_metrics() %>%
  filter(.metric == "accuracy")

# --- Ambil best & 1SE ---
best_neighbors <- knn_res %>% 
  select_best(metric = "accuracy")

best_neighbors_1se <- knn_res %>% 
  select_by_one_std_err(metric = "accuracy", desc(neighbors))

# --- Plot ---
neighbors_result %>% 
  ggplot(aes(x = neighbors, y = mean)) +
  geom_errorbar(aes(ymin = mean - std_err,
                    ymax = mean + std_err),
                width = 0.2, alpha = 0.6) +
  geom_point(size = 2, color = "black") +
  geom_vline(aes(xintercept = best_neighbors$neighbors,
                 color = "Highest"),
             linetype = "dashed", linewidth = 0.8) +
  geom_vline(aes(xintercept = best_neighbors_1se$neighbors,
                 color = "1-SE-Rule"),
             linetype = "dashed", linewidth = 0.8) +
  ylab("Accuracy") +
  xlab("Jumlah Neighbors (k)") +
  scale_x_continuous(n.breaks = 12) +
  scale_color_manual(values = c("Highest" = "#03A9F4",
                                "1-SE-Rule" = "#f44e03"),
                     name = "Selection") +
  theme_bw() +
  theme(legend.position = "top")

neighbors_result %>% 
#  mutate(neighbors = factor(neighbors)) %>%
  ggplot(aes(x=neighbors, y=mean)) +
  geom_errorbar(aes(ymin=(mean-std_err),
                      ymax=(mean+std_err)))+
  geom_point()+
  geom_vline(aes(xintercept = best_neighbors$neighbors,color="Highest"),
            
             linetype="dashed",linewidth=0.8)+
  geom_vline(aes(xintercept = best_neighbors_1se$neighbors,
                 color="1-SE-Rule"
                 ),
             linetype="dashed",linewidth=0.8)+
  ylab("Accuracy") +
  scale_x_continuous(n.breaks = 12)+
  scale_color_manual(values = c("#03A9F4","#f44e03"),
                     breaks = c("Highest","1-SE-Rule"),
                     name = "Selection"
                     )+
  theme_bw()+
  theme(legend.position = "top")

# --- Finalisasi workflow dengan parameter terbaik ---
final_wf <- workflow() %>%
  add_model(knn_tune %>% finalize_model(best_params)) %>%
  add_recipe(smote_rec)

# Fit ulang ke seluruh data training
final_fit <- final_wf %>% fit(data = df)

Prediksi Data Baru

Prediksi data tidak terakreditasi

TTA

## # A tibble: 2 × 6
##   Provinsi     Y                   Lit_2023 Num_2023 Lit_2024 Num_2024
##   <fct>        <fct>                  <dbl>    <dbl>    <dbl>    <dbl>
## 1 Prov. Banten Tidak Terakreditasi     66.7     66.7     0         100
## 2 Prov. Banten Tidak Terakreditasi     25       18.8     4.54       25

TTA_new <- TTA %>% select(-Y)

# --- Prediksi data baru ---
prediksi <- predict(final_fit, new_data = TTA_new)

# Gabungkan dengan data baru
hasil_pred <- TTA %>%
  bind_cols(prediksi %>% rename(prediksi_knn_cv_SMOTE = .pred_class))

hasil_pred

## # A tibble: 2 × 7
##   Provinsi     Y       Lit_2023 Num_2023 Lit_2024 Num_2024 prediksi_knn_cv_SMOTE
##   <fct>        <fct>      <dbl>    <dbl>    <dbl>    <dbl> <fct>                
## 1 Prov. Banten Tidak …     66.7     66.7     0         100 C                    
## 2 Prov. Banten Tidak …     25       18.8     4.54       25 C

Tugas 2 Pemklas

Nabila Syukri

2025-09-24

Import Data

Eksplorasi Data

KNN

Holdout - Imbalance dan Tanpa Imbalance

Prediksi Data Baru

K-Fold Cross Validation

Prediksi Data Baru