Classification of School Accreditation Ratings Using k-Nearest Neighbors

Author

Windi Pangesti

Published

September 18, 2025

This analysis aims to predict school accreditation ratings in South Sumatra and West Sumatra using k-Nearest Neighbors (k-NN), and to assess how accurately the 2023–2024 literacy and numeracy scores predict accreditation.

1 Install and Load Packages

Code
#install.packages("VGAM")
#install.packages("broom.helpers")
#install.packages("kknn")
Code
library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.5.1
Warning: package 'dplyr' was built under R version 4.5.1
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
✔ broom        1.0.8     ✔ rsample      1.3.0
✔ dials        1.4.0     ✔ tune         1.3.0
✔ infer        1.0.8     ✔ workflows    1.2.0
✔ modeldata    1.4.0     ✔ workflowsets 1.1.1
✔ parsnip      1.3.2     ✔ yardstick    1.3.2
✔ recipes      1.3.1     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
Code
library(DataExplorer)
Warning: package 'DataExplorer' was built under R version 4.5.1
Code
library(VGAM)
Warning: package 'VGAM' was built under R version 4.5.1
Loading required package: stats4
Loading required package: splines

Attaching package: 'VGAM'

The following object is masked from 'package:workflows':

    update_formula
Code
library(broom.helpers)
library(readxl)
library(themis)
Warning: package 'themis' was built under R version 4.5.1
Code
library(corrplot)
corrplot 0.95 loaded
Code
library(ggplot2)
library(GGally)
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
Code
library(kknn)
Warning: package 'kknn' was built under R version 4.5.1

2 Import Data

Code
data <- read_excel("Data Tugas-2.xlsx")
# View(data)   # interactive inspection only; not evaluated when rendering
Code
# Keep only schools in South Sumatra and West Sumatra
data2 <- data %>% 
  filter(provinsi %in% c("Prov. Sumatera Selatan", "Prov. Sumatera Barat"))

data2
Code
glimpse(data2)
Rows: 372
Columns: 7
$ Nomor                <dbl> 32, 38, 40, 47, 55, 86, 100, 110, 112, 133, 154, …
$ provinsi             <chr> "Prov. Sumatera Barat", "Prov. Sumatera Barat", "…
$ Peringkat_Akreditasi <chr> "B", "A", "B", "A", "A", "B", "C", "A", "A", "B",…
$ Lit_2023             <dbl> 51.11, 91.11, 12.90, 77.78, 100.00, 43.75, 50.00,…
$ Num_2023             <dbl> 44.44, 93.33, 19.35, 66.67, 96.97, 50.00, 56.25, …
$ Lit_2024             <dbl> 61.364, 93.333, 31.579, 60.000, 100.000, 50.000, …
$ Num_2024             <dbl> 60.000, 86.667, 10.526, 55.556, 96.552, 50.000, 1…
Code
# Total number of observations
nrow(data2)
[1] 372
Code
# Number of schools per province
data2 %>%
  count(provinsi)
Code
# Drop the row number and province columns
data2 <- data2 %>%
  select(-Nomor, -provinsi)
Code
# Convert character columns (the accreditation rating) to factors
data2 <- data2 %>% 
  mutate(across(where(is.character), as.factor))

3 Data Splitting

Code
set.seed(123)
# Stratified 80/20 split on the accreditation rating
split  <- initial_split(data2, 
                        strata = Peringkat_Akreditasi, 
                        prop = 0.8)
train  <- training(split)
test   <- testing(split)
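
As a quick sanity check (not shown in the original output), the stratified split should leave roughly the same class mix in both partitions; a minimal sketch:

Code
# Class counts and proportions per partition after the stratified split
bind_rows(train = count(train, Peringkat_Akreditasi),
          test  = count(test, Peringkat_Akreditasi),
          .id = "partition") %>%
  group_by(partition) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()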

4 Exploratory Data Analysis

4.1 Dataset Overview

Code
# Data structure
plot_intro(train,theme_config = theme_classic())

The data consist of 20% discrete and 80% continuous variables, with no missing values.

Code
# Summary of the numeric predictors
summary(train[, c("Lit_2023","Num_2023","Lit_2024","Num_2024")])
    Lit_2023         Num_2023         Lit_2024         Num_2024     
 Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
 1st Qu.: 40.63   1st Qu.: 44.44   1st Qu.: 48.89   1st Qu.: 50.00  
 Median : 64.44   Median : 60.00   Median : 71.43   Median : 68.18  
 Mean   : 61.70   Mean   : 59.62   Mean   : 66.13   Mean   : 66.02  
 3rd Qu.: 84.44   3rd Qu.: 77.78   3rd Qu.: 86.67   3rd Qu.: 84.61  
 Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00  
Code
# Show duplicated rows
duplicated_rows <- train %>% filter(duplicated(.))
duplicated_rows

4.2 Target Variable Visualization

Code
# Distribution of accreditation ratings
ggplot(train, aes(x = Peringkat_Akreditasi, fill = Peringkat_Akreditasi)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), 
            vjust = -0.5, size = 4) +
  theme_classic() +
  labs(
    x = "Peringkat Akreditasi",
    y = "Jumlah Sekolah"
  )

Schools with a C accreditation make up the smallest share (22.6%), far below the A and B groups.

Code
# Aggregate counts and build percentage labels
df_pie <- train %>%
  count(Peringkat_Akreditasi, name = "n") %>%
  mutate(prop = n / sum(n),
         pct  = percent(prop, accuracy = 0.1))

# Pie chart (percentage labels only)
ggplot(df_pie, aes(x = "", y = prop, fill = Peringkat_Akreditasi)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5), size = 4) +
  theme_void() +
  labs(title = "Distribusi Peringkat Akreditasi", fill = "Peringkat")

The visualization above further confirms that the target (Y) has an imbalanced class distribution. It is therefore worth considering methods for handling this class imbalance before modeling.
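
To put a number on this imbalance, an illustrative check (not part of the original analysis) of the majority-to-minority class ratio on the training data:

Code
# Ratio of the largest class to the smallest class in the training data
train %>%
  count(Peringkat_Akreditasi) %>%
  summarise(imbalance_ratio = max(n) / min(n))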

4.3 Distribution of Numeric Features

Code
train %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  facet_wrap(~ variable, scales = "free") +
  geom_density(fill = "lightgreen", alpha = 0.5, na.rm = TRUE) +
  labs(
    title = "Distribusi Fitur Numerik",
    x = "Nilai",
    y = "Kepadatan"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    strip.text.x = element_text(size = 6)
  )

4.4 Boxplots of Numeric Features

Code
train %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = "", y = value)) +
  geom_boxplot(fill = "lightgreen", alpha = 0.5, outlier.alpha = 0.4) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(title = "Boxplot Fitur Numerik", x = NULL, y = "Nilai") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        strip.text.x = element_text(size = 6))

4.5 Boxplots of Literacy & Numeracy Scores by Accreditation Rating

Code
plot_boxplot(data = train,
             by = "Peringkat_Akreditasi",
             geom_boxplot_args = list(fill="#FFB6B9"),
             ggtheme = theme_classic())

Schools accredited A have the highest median literacy and numeracy scores, while C-accredited schools show a wider spread of values. Outliers appear most often among A-accredited schools.

4.6 Correlation of Predictor Variables

Code
cor(train[, c("Lit_2023","Num_2023","Lit_2024","Num_2024")])
          Lit_2023  Num_2023  Lit_2024  Num_2024
Lit_2023 1.0000000 0.8491384 0.7486465 0.7087073
Num_2023 0.8491384 1.0000000 0.7599339 0.6766126
Lit_2024 0.7486465 0.7599339 1.0000000 0.8275638
Num_2024 0.7087073 0.6766126 0.8275638 1.0000000
Code
# Correlation heatmap
corrplot(cor(train[, c("Lit_2023","Num_2023","Lit_2024","Num_2024")]),
         method="color", type="upper", addCoef.col="black")

4.7 Feature Distributions vs. Target

Code
num_cols <- train %>% select(where(is.numeric)) %>% names()

train %>%
  select(all_of(num_cols), Peringkat_Akreditasi) %>%         
  pivot_longer(cols = all_of(num_cols),                     
               names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value, fill = Peringkat_Akreditasi)) +
  facet_wrap(~ variable, scales = "free") +                 
  geom_density(alpha = 0.5) +
  scale_fill_brewer(palette = "Set1") +                     
  labs(
    title = "Distribusi Fitur Berdasarkan Peringkat Akreditasi",
    x = "Nilai",
    y = "Kepadatan",
    fill = "Peringkat"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "top")

Overall, the four features (Lit_2023, Lit_2024, Num_2023, Num_2024) show a clean gradient: class A is concentrated at high scores, B in the middle range, and C at low scores, so these features are informative for separating the ratings. The separation is clearer for 2024 than for 2023, especially for literacy, although overlap remains around 45–60 (B vs. C) and 70–80 (A vs. B), which is a likely source of misclassification for k-NN.

4.8 Pair Plot

Code
ggpairs(train, columns = c("Lit_2023","Num_2023","Lit_2024","Num_2024"),
        aes(color = Peringkat_Akreditasi))

Literacy and numeracy scores are strongly positively correlated, especially among A-accredited schools, while C-accredited schools show weaker correlations.

5 Preprocessing

5.1 Data Normalization

Code
# Separate the label from the predictors (the label is already a factor)
label_train <- train$Peringkat_Akreditasi

predictor_train <- train %>% select(-Peringkat_Akreditasi)
Code
# Normalize (center and scale) the training predictors
x_train_scaled_mat <- scale(predictor_train) 
center_vec <- attr(x_train_scaled_mat, "scaled:center")
scale_vec  <- attr(x_train_scaled_mat, "scaled:scale")

# Assemble the scaled TRAIN data set for k-NN
df_knn_train <- cbind(as.data.frame(x_train_scaled_mat),
                      Peringkat_Akreditasi = label_train)

# Save the scaling parameters to reuse on the TEST set
scaler <- list(center = center_vec, scale = scale_vec, cols = colnames(predictor_train))
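
As an aside (not used in the rest of this analysis), the same centering and scaling could instead live inside the recipe via step_normalize(), so that tidymodels reapplies the training means and standard deviations to new data automatically; a minimal sketch under that assumption:

Code
# Alternative sketch: let the recipe normalize the predictors before SMOTE
rec_norm <- recipe(Peringkat_Akreditasi ~ ., data = train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_smote(Peringkat_Akreditasi, over_ratio = 1, seed = 2045)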

5.2 Imbalance Handling

Code
# SMOTE recipe (handling class imbalance)
rec_smote <- recipe(Peringkat_Akreditasi ~ ., data = df_knn_train) %>%
  step_zv(all_predictors()) %>%
  step_smote(Peringkat_Akreditasi, over_ratio = 1, seed = 2045)
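
For reference, SMOTE creates synthetic minority-class observations by interpolating between an existing minority point x_i and one of its nearest minority-class neighbors x_nn:

$$x_{\text{new}} = x_i + u \,(x_{\text{nn}} - x_i), \qquad u \sim \text{Uniform}(0, 1)$$

Because the interpolation happens in feature space, the step is applied here to predictors that have already been normalized.
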
Code
# Quick check of the post-SMOTE class distribution
prep(rec_smote) %>% juice() %>% count(Peringkat_Akreditasi)

6 Modeling

6.1 K-Fold Cross-Validation

Code
set.seed(123)

# Cross-validation folds
folds <- vfold_cv(
  df_knn_train,
  v = 10,                              
  strata = Peringkat_Akreditasi       
)
Code
# Visualize how observations are assigned to each fold (first 50 rows)
tidy(folds) %>% 
  filter(Row < 51) %>% 
  ggplot(aes(x = Fold, y = Row, fill = Data)) +
  geom_tile() +
  theme_classic()

Code
# Class proportions in each assessment fold
df_knn_train %>%
  mutate(Row = row_number()) %>%
  left_join(tidy(folds), by = "Row") %>%
  count(Peringkat_Akreditasi, Fold, Data) %>%
  group_by(Fold, Data) %>%
  mutate(pct = n / sum(n)) %>%
  filter(Data == "Assessment") %>%
  ggplot(aes(x = Fold, y = pct, fill = Peringkat_Akreditasi)) +
  geom_col(position = "fill") +
  labs(title = "Proporsi Kelas di Tiap Assessment Fold",
       x = "Fold", y = "Proporsi", fill = "Kelas") +
  theme_classic()

Code
# Class composition table per fold (analysis & assessment sets)
komposisi_folds <- 
  df_knn_train %>%
    mutate(Row = row_number()) %>%
    select(Row, Peringkat_Akreditasi) %>%
    left_join(tidy(folds), by = "Row") %>%
    count(Fold, Data, Peringkat_Akreditasi, name = "n") %>%
    group_by(Fold, Data) %>%          
    mutate(percent = 100 * n / sum(n)) %>%
    arrange(Fold, Data, Peringkat_Akreditasi) %>%
    ungroup()

komposisi_folds

6.2 Defining the k-NN Model and Hyperparameter Tuning

Code
# Model spec: tune k, the distance power, and the kernel weight function
knn_spec <- nearest_neighbor(
  neighbors   = tune(),
  dist_power  = tune(),
  weight_func = tune()
) %>% set_engine("kknn") %>% set_mode("classification")
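
For context, the kknn engine measures closeness with the Minkowski distance, whose exponent is the dist_power parameter tuned above (p = 1 gives Manhattan distance, p = 2 Euclidean), while weight_func selects the kernel used to weight the neighbors' votes:

$$d_p(x_i, x_j) = \left( \sum_{m=1}^{M} \lvert x_{im} - x_{jm} \rvert^{p} \right)^{1/p}$$
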
Code
# Workflow: recipe (with SMOTE) + k-NN model
wf_smote <- workflow() %>% 
  add_recipe(rec_smote) %>% 
  add_model(knn_spec)
Code
# Space-filling tuning grid
grid_knn <- grid_space_filling(
  neighbors(range = c(2L, 60L)),
  dist_power(range = c(1, 2)),
  weight_func(values = c("rectangular","triangular","epanechnikov","gaussian","optimal")),
  size = 60
)

# Keep only odd values of k
grid_knn <- dplyr::filter(grid_knn, neighbors %% 2 == 1)
Code
# Run the tuning grid with 10-fold CV
set.seed(123)
knn_tune_cv <- tune_grid(
  wf_smote,
  resamples = folds,
  grid = grid_knn,
  metrics = metric_set(
    accuracy, sens, spec,bal_accuracy, f_meas),
  control = control_grid(save_pred = TRUE, verbose = FALSE)
)
Code
# Best configuration by accuracy
best_neighbors_acc <- select_best(knn_tune_cv, metric = "accuracy")

# Best configuration by balanced accuracy
best_neighbors_ba <- select_best(knn_tune_cv, metric = "bal_accuracy")

# 1-SE rule: the largest k (simplest model) whose accuracy is within one standard error of the best
best_neighbors_1se <- select_by_one_std_err(
  knn_tune_cv,
  metric = "accuracy",
  desc(neighbors)
)
Code
best_neighbors_acc
Code
best_neighbors_ba
Code
best_neighbors_1se
Code
# All metrics from the tuning results
neighbors_result <- collect_metrics(knn_tune_cv)
Code
# Summarize per k (one point per k: best accuracy across the other hyperparameters)
acc_best_by_k <- neighbors_result %>%
  dplyr::filter(.metric == "accuracy", .estimator == "multiclass") %>%
  dplyr::group_by(neighbors) %>%
  dplyr::slice_max(mean, with_ties = FALSE) %>%
  dplyr::ungroup()

# Plot: three reference lines, three colors, three legend breaks (labels must match exactly!)
ggplot(acc_best_by_k, aes(x = neighbors, y = mean)) +
  geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = 0) +
  geom_point() +
  geom_vline(aes(xintercept = best_neighbors_acc$neighbors,
                 color = "Highest accuracy"),
             linetype = "dashed", linewidth = 0.8) +
  geom_vline(aes(xintercept = best_neighbors_ba$neighbors,
                 color = "Highest balanced accuracy"),
             linetype = "dashed", linewidth = 0.8) +
  geom_vline(aes(xintercept = best_neighbors_1se$neighbors,
                 color = "1-SE-Rule"),
             linetype = "dashed", linewidth = 0.8) +
  ylab("Accuracy (CV)") +
  scale_x_continuous(n.breaks = 12) +
  scale_color_manual(
    values = c(
      "Highest accuracy" = "#03A9F4",
      "Highest balanced accuracy" = "#9C27B0",
      "1-SE-Rule" = "#f44e03"
    ),
    breaks = c("Highest accuracy", "Highest balanced accuracy", "1-SE-Rule"),
    name   = "Selection"
  ) +
  theme_bw() +
  theme(legend.position = "top")

Code
# Pick the best configuration from the tuning results
best_params <- select_best(knn_tune_cv, metric = "accuracy")

# Finalize the WORKFLOW (recipe + model) with all of the best hyperparameters
final_wf   <- finalize_workflow(wf_smote, best_params)

# Fit on the full normalized training set
knn_fit_tr <- fit(final_wf, data = df_knn_train)
knn_fit_tr
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_zv()
• step_smote()

── Model ───────────────────────────────────────────────────────────────────────

Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(27L,     data, 5), distance = ~1.03389830508475, kernel = ~"rectangular")

Type of response variable: nominal
Minimal misclassification: 0.4666667
Best kernel: rectangular
Best k: 27
Code
# Gather all metrics from the tuning results
metrics_all <- collect_metrics(knn_tune_cv)

# Sort all combinations by accuracy, highest first
acc_all <- metrics_all %>%
  filter(.metric == "accuracy", .estimator == "multiclass") %>%
  arrange(desc(mean))

# Show the top-10 combinations by accuracy (with their full configurations)
top10_acc <- acc_all %>%
  transmute(
    k = neighbors,
    p = dist_power,
    weight = weight_func,
    accuracy = round(mean, 4),
    se = round(std_err, 4),
    .config
  ) %>%
  slice_head(n = 10)

top10_acc
Code
# Other CV metrics for the configuration that is best by accuracy
best_acc_row <- acc_all %>% slice_head(n = 1)  # row with the highest accuracy

best_cfg_metrics <- metrics_all %>%
  semi_join(best_acc_row %>% select(neighbors, dist_power, weight_func),
            by = c("neighbors","dist_power","weight_func")) %>%
  select(.metric, mean, std_err) %>%
  arrange(desc(.metric)) %>%
  mutate(across(c(mean, std_err), ~round(.x, 4)))

best_cfg_metrics
Code
# 1) Collect every CV accuracy result (one row per hyperparameter combination)
acc_tbl <- collect_metrics(knn_tune_cv) %>%
  filter(.metric == "accuracy", .estimator == "multiclass") %>%
  transmute(
    k        = neighbors,
    p        = dist_power,
    weight   = weight_func,                # included because weight_func is tuned
    accuracy = round(mean, 4),
    se       = round(std_err, 4),
    lower    = pmax(0, round(mean - 1.96*std_err, 4)),  # 95% CI (lower bound)
    upper    = pmin(1, round(mean + 1.96*std_err, 4)),  # 95% CI (upper bound)
    .config
  ) %>%
  arrange(desc(accuracy), se)

# 2) Configuration with the HIGHEST accuracy (best scheme)
best_acc_row <- acc_tbl %>% slice(1)
best_acc_row

6.3 Model Evaluation on the Test Data

Code
predictor_test <- test %>% dplyr::select(-Peringkat_Akreditasi)

# Scale the TEST predictors with the TRAINING center/scale to avoid data leakage
x_test_scaled <- as.data.frame(
  scale(predictor_test, center = scaler$center, scale = scaler$scale)
)

df_knn_test <- dplyr::bind_cols(
  x_test_scaled,
  tibble::tibble(
    Peringkat_Akreditasi = factor(
      test$Peringkat_Akreditasi,
      levels = levels(df_knn_train$Peringkat_Akreditasi)
    )
  )
)
Code
# Predictions & metrics on the TEST set
pred_class <- predict(knn_fit_tr, new_data = df_knn_test)
pred_prob  <- predict(knn_fit_tr, new_data = df_knn_test, type = "prob")

test_res <- dplyr::bind_cols(df_knn_test, pred_class, pred_prob)

# Summary metrics
metrics_test <- yardstick::metric_set(
  accuracy, bal_accuracy, sens, spec, precision, recall, f_meas, kap
)
metrics_test(
  test_res, truth = Peringkat_Akreditasi, estimate = .pred_class
)
Code
# Confusion matrix + heatmap
cm_test <- conf_mat(test_res, truth = Peringkat_Akreditasi, estimate = .pred_class)
cm_test
          Truth
Prediction  A  B  C
         A 24  8  1
         B  2  7  3
         C  3 14 13
Code
autoplot(cm_test, type = "heatmap") +
  scale_fill_viridis_c(direction = -1, option = "inferno", alpha = 0.7)
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.

Code
# Baseline accuracy: always predict the majority class (computed on the full data2)
baseline_acc <- data2 %>%
  count(Peringkat_Akreditasi) %>%
  summarise(baseline = max(n) / sum(n)) %>% pull(baseline)
baseline_acc
[1] 0.3870968
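
For comparison, a quick check derived from the confusion matrix above (not part of the original output): the test accuracy implied by its diagonal is (24 + 7 + 13) / 75 ≈ 0.587, clearly above the majority-class baseline of about 0.387.

Code
# Overall test accuracy recomputed from the confusion matrix
sum(diag(cm_test$table)) / sum(cm_test$table)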