K-Nearest Neighboard

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/

library(DataExplorer)

## Warning: package 'DataExplorer' was built under R version 4.4.3

##   Nomor                  provinsi Peringkat.Akreditasi Lit_2023 Num_2023
## 1     2 Prov. Nusa Tenggara Timur                    B    80.00    48.78
## 2     9 Prov. Nusa Tenggara Timur                    A    86.67    71.11
## 3    10 Prov. Nusa Tenggara Timur                    C    24.44    48.89
## 4    12 Prov. Nusa Tenggara Timur                    C    62.22    77.78
## 5    14 Prov. Nusa Tenggara Timur                    A    66.67    55.56
## 6    46 Prov. Nusa Tenggara Timur                    B    60.00    46.67
##   Lit_2024 Num_2024
## 1   82.222   62.791
## 2   88.889   86.667
## 3   17.778   28.889
## 4   72.727   72.727
## 5   66.667   62.222
## 6   58.140   39.535

glimpse(akre)

## Rows: 540
## Columns: 7
## $ Nomor                <int> 2, 9, 10, 12, 14, 46, 54, 57, 60, 72, 76, 130, 15…
## $ provinsi             <chr> "Prov. Nusa Tenggara Timur", "Prov. Nusa Tenggara…
## $ Peringkat.Akreditasi <chr> "B", "A", "C", "C", "A", "B", "B", "B", "B", "C",…
## $ Lit_2023             <dbl> 80.00, 86.67, 24.44, 62.22, 66.67, 60.00, 75.56, …
## $ Num_2023             <dbl> 48.78, 71.11, 48.89, 77.78, 55.56, 46.67, 66.67, …
## $ Lit_2024             <dbl> 82.222, 88.889, 17.778, 72.727, 66.667, 58.140, 8…
## $ Num_2024             <dbl> 62.791, 86.667, 28.889, 72.727, 62.222, 39.535, 7…

akre <- akre %>% 
            select(-Nomor)

glimpse(akre)

## Rows: 540
## Columns: 6
## $ provinsi             <chr> "Prov. Nusa Tenggara Timur", "Prov. Nusa Tenggara…
## $ Peringkat.Akreditasi <chr> "B", "A", "C", "C", "A", "B", "B", "B", "B", "C",…
## $ Lit_2023             <dbl> 80.00, 86.67, 24.44, 62.22, 66.67, 60.00, 75.56, …
## $ Num_2023             <dbl> 48.78, 71.11, 48.89, 77.78, 55.56, 46.67, 66.67, …
## $ Lit_2024             <dbl> 82.222, 88.889, 17.778, 72.727, 66.667, 58.140, 8…
## $ Num_2024             <dbl> 62.791, 86.667, 28.889, 72.727, 62.222, 39.535, 7…

#mengubah kolom target ke factor
akre <- akre %>%
  mutate(`Peringkat.Akreditasi` = as.factor(`Peringkat.Akreditasi`))

library(dplyr)
library(forcats)

akre <- akre %>%
  filter(`Peringkat.Akreditasi` != "Tidak Terakreditasi") %>%  # buang barisnya
  mutate(`Peringkat.Akreditasi` = fct_drop(`Peringkat.Akreditasi`)) # hapus level sisa

View(akre)

###EKSPLORASI DATA

plot_intro(akre,theme_config = theme_classic())

plot_boxplot(data = akre,
             by = "Peringkat.Akreditasi",
             geom_boxplot_args = list(fill="#03A9F4"),
             ggtheme = theme_classic())

plot_bar(data = akre,
             by = "Peringkat.Akreditasi",
         ggtheme = theme_classic())

###PEMODELAN KLASIFIKASI

#Membagi data dengan Hold Sample (Train & Test)
set.seed(123)
holdout_split <- initial_split(akre,
                               #sampel acak berdasarkan kelompok
                               strata = `Peringkat.Akreditasi`,
                               # proporsi untuk training data
                               prop = 0.8)
train_data <- training(holdout_split)
test_data <- testing(holdout_split)

tidy(holdout_split) %>% 
  count(Data) %>% 
  mutate(percent=n*100/sum(n))

## # A tibble: 2 × 3
##   Data           n percent
##   <chr>      <int>   <dbl>
## 1 Analysis     430    79.9
## 2 Assessment   108    20.1

akre %>% 
  mutate(Row=seq(nrow(akre))) %>% 
  select(`Peringkat.Akreditasi`,Row) %>% 
  left_join(y = tidy(holdout_split),by = "Row") %>% 
  count(`Peringkat.Akreditasi`,Data) %>% 
  group_by(`Peringkat.Akreditasi`) %>% 
  mutate(percent=n*100/sum(n),n=NULL) %>% 
  pivot_wider(id_cols = `Peringkat.Akreditasi`,names_from = Data,values_from = percent)

## # A tibble: 3 × 3
## # Groups:   Peringkat.Akreditasi [3]
##   Peringkat.Akreditasi Analysis Assessment
##   <fct>                   <dbl>      <dbl>
## 1 A                        79.9       20.1
## 2 B                        80         20  
## 3 C                        79.9       20.1

#Membagi data dengan K-Fold Cross Validation
set.seed(345)
folds <- vfold_cv(akre, v = 10,strata = `Peringkat.Akreditasi` )

#Contoh visualisasi 100 amatan pertama di setiap folds
tidy(folds) %>% 
  filter(Row<101) %>% 
ggplot(aes(x = Fold, y = Row,fill = Data)) +
    geom_tile()+
  theme_classic()

akre %>% 
  mutate(Row=seq(nrow(akre))) %>% 
  select(`Peringkat.Akreditasi`,Row) %>% 
  left_join(tidy(folds),by="Row")%>% 
  count(`Peringkat.Akreditasi`,Fold,Data) %>% 
  group_by(`Peringkat.Akreditasi`,Fold) %>% 
  mutate(percent=n*100/sum(n)) %>% 
  arrange(Fold,Data)

## # A tibble: 60 × 5
## # Groups:   Peringkat.Akreditasi, Fold [30]
##    Peringkat.Akreditasi Fold   Data           n percent
##    <fct>                <chr>  <chr>      <int>   <dbl>
##  1 A                    Fold01 Analysis     134    89.9
##  2 B                    Fold01 Analysis     193    89.8
##  3 C                    Fold01 Analysis     156    89.7
##  4 A                    Fold01 Assessment    15    10.1
##  5 B                    Fold01 Assessment    22    10.2
##  6 C                    Fold01 Assessment    18    10.3
##  7 A                    Fold02 Analysis     134    89.9
##  8 B                    Fold02 Analysis     193    89.8
##  9 C                    Fold02 Analysis     156    89.7
## 10 A                    Fold02 Assessment    15    10.1
## # ℹ 50 more rows

###Mendefinisikan Model KNN

knn <- nearest_neighbor(neighbors=10,weight_func="rectangular",dist_power = 2) %>% 
  set_engine("kknn") %>%
  set_mode("classification")
knn

## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 10
##   weight_func = rectangular
##   dist_power = 2
## 
## Computational engine: kknn

knn %>% translate()

## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 10
##   weight_func = rectangular
##   dist_power = 2
## 
## Computational engine: kknn 
## 
## Model fit template:
## kknn::train.kknn(formula = missing_arg(), data = missing_arg(), 
##     ks = min_rows(10, data, 5), kernel = "rectangular", distance = 2)

###Evaluasi Model

#dengan Hold Out Sample (Train test)
knn_hold <- knn %>% 
  fit(Peringkat.Akreditasi~ Lit_2023 + Num_2023 +Lit_2024 + Num_2024,data=train_data)

extract_fit_engine(knn_hold)

## 
## Call:
## kknn::train.kknn(formula = Peringkat.Akreditasi ~ Lit_2023 +     Num_2023 + Lit_2024 + Num_2024, data = data, ks = min_rows(10,     data, 5), distance = ~2, kernel = ~"rectangular")
## 
## Type of response variable: nominal
## Minimal misclassification: 0.4930233
## Best kernel: rectangular
## Best k: 10

pred_knn <- knn_hold %>% 
              predict(new_data = test_data) %>% 
              bind_cols(test_data %>% select(`Peringkat.Akreditasi`))
              
pred_knn %>% 
  slice_head(n=10)

## # A tibble: 10 × 2
##    .pred_class Peringkat.Akreditasi
##    <fct>       <fct>               
##  1 A           A                   
##  2 B           C                   
##  3 A           A                   
##  4 A           A                   
##  5 A           C                   
##  6 A           A                   
##  7 C           C                   
##  8 A           B                   
##  9 B           C                   
## 10 B           B

#Kemudian hasil prediksi tersebut akan kita evaluasi dengan menggunakan Confussion Matrix
confussion_matrix <- pred_knn %>%
                      conf_mat(truth=`Peringkat.Akreditasi`,estimate=.pred_class)
autoplot(confussion_matrix,type = "heatmap")+
  scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

#Dari Confussion Matrix tersebut kita bisa mendapatkan beberapa metrik dengan fungsi summary
pred_knn %>%
  conf_mat(truth=`Peringkat.Akreditasi`,estimate=.pred_class) %>% 
  summary()

## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             multiclass     0.5  
##  2 kap                  multiclass     0.240
##  3 sens                 macro          0.504
##  4 spec                 macro          0.744
##  5 ppv                  macro          0.505
##  6 npv                  macro          0.744
##  7 mcc                  multiclass     0.241
##  8 j_index              macro          0.248
##  9 bal_accuracy         macro          0.624
## 10 detection_prevalence macro          0.333
## 11 precision            macro          0.505
## 12 recall               macro          0.504
## 13 f_meas               macro          0.502

#Kemudian kita evaluasi dapat juga langsung mendapatkan metrik tertentu dengan menggunakan accuracy.
pred_knn %>% 
  accuracy(truth=`Peringkat.Akreditasi`,estimate=.pred_class)

## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass       0.5

##dengan K-fold Cross Validation (v=10)
no_prep <- recipe(`Peringkat.Akreditasi`~Lit_2023 + Num_2023 +Lit_2024 + Num_2024,data=akre)

wfst0 <- workflow_set(preproc = list(no_prep=no_prep),models = list(knn=knn))

knn_cv <- wfst0 %>% 
            workflow_map(fn = "fit_resamples",
             verbose = TRUE,
             seed = 2045,
             resamples = folds,
             metrics=metric_set(accuracy),
            control=control_resamples(save_pred = TRUE)
             )

## i 1 of 1 resampling: no_prep_knn

## ✔ 1 of 1 resampling: no_prep_knn (801ms)

collect_metrics(knn_cv)

## # A tibble: 1 × 9
##   wflow_id    .config       preproc model .metric .estimator  mean     n std_err
##   <chr>       <chr>         <chr>   <chr> <chr>   <chr>      <dbl> <int>   <dbl>
## 1 no_prep_knn Preprocessor… recipe  near… accura… multiclass 0.514    10  0.0168

confussion_matrix_cv <- knn_cv %>% 
  workflowsets::extract_workflow_set_result(id = "no_prep_knn") %>% 
  conf_mat_resampled(tidy = FALSE)

autoplot(confussion_matrix_cv,type = "heatmap")+
  scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

confussion_matrix_cv %>% 
  summary()

## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             multiclass     0.515
##  2 kap                  multiclass     0.260
##  3 sens                 macro          0.517
##  4 spec                 macro          0.750
##  5 ppv                  macro          0.526
##  6 npv                  macro          0.749
##  7 mcc                  multiclass     0.261
##  8 j_index              macro          0.267
##  9 bal_accuracy         macro          0.634
## 10 detection_prevalence macro          0.333
## 11 precision            macro          0.526
## 12 recall               macro          0.517
## 13 f_meas               macro          0.520

###Hyperparameter Tuning untuk KNN

knn_tune <- nearest_neighbor(neighbors=tune(),weight_func="rectangular",dist_power = 2) %>% 
  set_engine("kknn") %>%
  set_mode("classification")

knn_tune

## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = tune()
##   weight_func = rectangular
##   dist_power = 2
## 
## Computational engine: kknn

knn_grid <- tibble::tibble(neighbors = seq(5, 60, by = 2))  # ganjil/ genap sesuai selera

fitted_knn <- function(x){
  mod <- extract_fit_engine(x)
  fitted(mod)
}

##Kemudian kita mulai proses tuning dengan menggunakan bantuan tune_grid
wfst1 <- workflow_set(preproc = list(no_prep=no_prep),models = list(knn=knn_tune))

knn_tune_cv <- wfst1 %>% 
            workflow_map(fn = "tune_grid",
             verbose = TRUE,
             seed = 2045,
             resamples = folds,
             grid = knn_grid,
             metrics=metric_set(accuracy),
            control=control_resamples(save_pred = TRUE)
             )

## i 1 of 1 tuning:     no_prep_knn

## ✔ 1 of 1 tuning:     no_prep_knn (3.9s)

best_neighbors <- knn_tune_cv %>%
      extract_workflow_set_result("no_prep_knn") %>% 
      select_best(metric="accuracy")

best_neighbors_1se <- knn_tune_cv %>%
      extract_workflow_set_result("no_prep_knn") %>% 
      select_by_one_std_err(metric="accuracy",desc(neighbors))

neighbors_result <- knn_tune_cv %>%
                      extract_workflow_set_result("no_prep_knn") %>% 
                      collect_metrics()
neighbors_result

## # A tibble: 28 × 7
##    neighbors .metric  .estimator  mean     n std_err .config              
##        <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
##  1         5 accuracy multiclass 0.505    10  0.0156 Preprocessor1_Model01
##  2         7 accuracy multiclass 0.529    10  0.0132 Preprocessor1_Model02
##  3         9 accuracy multiclass 0.529    10  0.0183 Preprocessor1_Model03
##  4        11 accuracy multiclass 0.524    10  0.0128 Preprocessor1_Model04
##  5        13 accuracy multiclass 0.511    10  0.0119 Preprocessor1_Model05
##  6        15 accuracy multiclass 0.509    10  0.0119 Preprocessor1_Model06
##  7        17 accuracy multiclass 0.516    10  0.0120 Preprocessor1_Model07
##  8        19 accuracy multiclass 0.531    10  0.0164 Preprocessor1_Model08
##  9        21 accuracy multiclass 0.528    10  0.0177 Preprocessor1_Model09
## 10        23 accuracy multiclass 0.530    10  0.0191 Preprocessor1_Model10
## # ℹ 18 more rows

neighbors_result %>% 
#  mutate(neighbors = factor(neighbors)) %>%
  ggplot(aes(x=neighbors, y=mean)) +
  geom_errorbar(aes(ymin=(mean-std_err),
                      ymax=(mean+std_err)))+
  geom_point()+
  geom_vline(aes(xintercept = best_neighbors$neighbors,color="Highest"),
            
             linetype="dashed",linewidth=0.8)+
  geom_vline(aes(xintercept = best_neighbors_1se$neighbors,
                 color="1-SE-Rule"
                 ),
             linetype="dashed",linewidth=0.8)+
  ylab("Accuracy") +
  scale_x_continuous(n.breaks = 12)+
  scale_color_manual(values = c("#03A9F4","#f44e03"),
                     breaks = c("Highest","1-SE-Rule"),
                     name = "Selection"
                     )+
  theme_bw()+
  theme(legend.position = "top")

#berdasarkan nilai akurasi tertinggi
best_neighbors

## # A tibble: 1 × 2
##   neighbors .config              
##       <dbl> <chr>                
## 1        43 Preprocessor1_Model20

#berdasarkan one standard error rule (1-SE-Rule).
best_neighbors_1se

## # A tibble: 1 × 2
##   neighbors .config              
##       <dbl> <chr>                
## 1        59 Preprocessor1_Model28

#terapkan ke model dengan fungsi finalize_model
knn_opt <- knn_tune %>% 
  finalize_model(parameters = best_neighbors)
knn_opt

## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 43
##   weight_func = rectangular
##   dist_power = 2
## 
## Computational engine: kknn

#terapkan ke model dengan fungsi finalize_model
knn_opt1 <- knn_tune %>% 
  finalize_model(parameters = best_neighbors_1se)
knn_opt1

## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 59
##   weight_func = rectangular
##   dist_power = 2
## 
## Computational engine: kknn

###Mendefinisikan Model KNN

knn <- nearest_neighbor(neighbors= 59,weight_func="rectangular",dist_power = 2) %>% 
  set_engine("kknn") %>%
  set_mode("classification")
knn

## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 59
##   weight_func = rectangular
##   dist_power = 2
## 
## Computational engine: kknn

knn %>% translate()

## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 59
##   weight_func = rectangular
##   dist_power = 2
## 
## Computational engine: kknn 
## 
## Model fit template:
## kknn::train.kknn(formula = missing_arg(), data = missing_arg(), 
##     ks = min_rows(59, data, 5), kernel = "rectangular", distance = 2)

###Evaluasi Model

#dengan Hold Out Sample (Train test)
knn_hold <- knn %>% 
  fit(Peringkat.Akreditasi~ Lit_2023 + Num_2023 +Lit_2024 + Num_2024,data=train_data)

extract_fit_engine(knn_hold)

## 
## Call:
## kknn::train.kknn(formula = Peringkat.Akreditasi ~ Lit_2023 +     Num_2023 + Lit_2024 + Num_2024, data = data, ks = min_rows(59,     data, 5), distance = ~2, kernel = ~"rectangular")
## 
## Type of response variable: nominal
## Minimal misclassification: 0.4604651
## Best kernel: rectangular
## Best k: 59

pred_knn <- knn_hold %>% 
              predict(new_data = test_data) %>% 
              bind_cols(test_data %>% select(`Peringkat.Akreditasi`))
              
pred_knn %>% 
  slice_head(n=10)

## # A tibble: 10 × 2
##    .pred_class Peringkat.Akreditasi
##    <fct>       <fct>               
##  1 A           A                   
##  2 B           C                   
##  3 B           A                   
##  4 A           A                   
##  5 A           C                   
##  6 A           A                   
##  7 C           C                   
##  8 A           B                   
##  9 B           C                   
## 10 B           B

#Kemudian hasil prediksi tersebut akan kita evaluasi dengan menggunakan Confussion Matrix
confussion_matrix <- pred_knn %>%
                      conf_mat(truth=`Peringkat.Akreditasi`,estimate=.pred_class)
autoplot(confussion_matrix,type = "heatmap")+
  scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

confussion_matrix %>% 
  summary()

## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             multiclass     0.602
##  2 kap                  multiclass     0.393
##  3 sens                 macro          0.601
##  4 spec                 macro          0.795
##  5 ppv                  macro          0.605
##  6 npv                  macro          0.799
##  7 mcc                  multiclass     0.397
##  8 j_index              macro          0.396
##  9 bal_accuracy         macro          0.698
## 10 detection_prevalence macro          0.333
## 11 precision            macro          0.605
## 12 recall               macro          0.601
## 13 f_meas               macro          0.597

K-Nearest Neighboard

2025-09-23