library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.3
## Nomor provinsi Peringkat.Akreditasi Lit_2023 Num_2023
## 1 2 Prov. Nusa Tenggara Timur B 80.00 48.78
## 2 9 Prov. Nusa Tenggara Timur A 86.67 71.11
## 3 10 Prov. Nusa Tenggara Timur C 24.44 48.89
## 4 12 Prov. Nusa Tenggara Timur C 62.22 77.78
## 5 14 Prov. Nusa Tenggara Timur A 66.67 55.56
## 6 46 Prov. Nusa Tenggara Timur B 60.00 46.67
## Lit_2024 Num_2024
## 1 82.222 62.791
## 2 88.889 86.667
## 3 17.778 28.889
## 4 72.727 72.727
## 5 66.667 62.222
## 6 58.140 39.535
glimpse(akre)
## Rows: 540
## Columns: 7
## $ Nomor <int> 2, 9, 10, 12, 14, 46, 54, 57, 60, 72, 76, 130, 15…
## $ provinsi <chr> "Prov. Nusa Tenggara Timur", "Prov. Nusa Tenggara…
## $ Peringkat.Akreditasi <chr> "B", "A", "C", "C", "A", "B", "B", "B", "B", "C",…
## $ Lit_2023 <dbl> 80.00, 86.67, 24.44, 62.22, 66.67, 60.00, 75.56, …
## $ Num_2023 <dbl> 48.78, 71.11, 48.89, 77.78, 55.56, 46.67, 66.67, …
## $ Lit_2024 <dbl> 82.222, 88.889, 17.778, 72.727, 66.667, 58.140, 8…
## $ Num_2024 <dbl> 62.791, 86.667, 28.889, 72.727, 62.222, 39.535, 7…
akre <- akre %>%
select(-Nomor)
glimpse(akre)
## Rows: 540
## Columns: 6
## $ provinsi <chr> "Prov. Nusa Tenggara Timur", "Prov. Nusa Tenggara…
## $ Peringkat.Akreditasi <chr> "B", "A", "C", "C", "A", "B", "B", "B", "B", "C",…
## $ Lit_2023 <dbl> 80.00, 86.67, 24.44, 62.22, 66.67, 60.00, 75.56, …
## $ Num_2023 <dbl> 48.78, 71.11, 48.89, 77.78, 55.56, 46.67, 66.67, …
## $ Lit_2024 <dbl> 82.222, 88.889, 17.778, 72.727, 66.667, 58.140, 8…
## $ Num_2024 <dbl> 62.791, 86.667, 28.889, 72.727, 62.222, 39.535, 7…
#mengubah kolom target ke factor
akre <- akre %>%
mutate(`Peringkat.Akreditasi` = as.factor(`Peringkat.Akreditasi`))
library(dplyr)
library(forcats)
akre <- akre %>%
filter(`Peringkat.Akreditasi` != "Tidak Terakreditasi") %>% # buang barisnya
mutate(`Peringkat.Akreditasi` = fct_drop(`Peringkat.Akreditasi`)) # hapus level sisa
View(akre)
###EKSPLORASI DATA
plot_intro(akre,theme_config = theme_classic())
plot_boxplot(data = akre,
by = "Peringkat.Akreditasi",
geom_boxplot_args = list(fill="#03A9F4"),
ggtheme = theme_classic())
plot_bar(data = akre,
by = "Peringkat.Akreditasi",
ggtheme = theme_classic())
###PEMODELAN KLASIFIKASI
#Membagi data dengan Hold Sample (Train & Test)
set.seed(123)
holdout_split <- initial_split(akre,
#sampel acak berdasarkan kelompok
strata = `Peringkat.Akreditasi`,
# proporsi untuk training data
prop = 0.8)
train_data <- training(holdout_split)
test_data <- testing(holdout_split)
tidy(holdout_split) %>%
count(Data) %>%
mutate(percent=n*100/sum(n))
## # A tibble: 2 × 3
## Data n percent
## <chr> <int> <dbl>
## 1 Analysis 430 79.9
## 2 Assessment 108 20.1
akre %>%
mutate(Row=seq(nrow(akre))) %>%
select(`Peringkat.Akreditasi`,Row) %>%
left_join(y = tidy(holdout_split),by = "Row") %>%
count(`Peringkat.Akreditasi`,Data) %>%
group_by(`Peringkat.Akreditasi`) %>%
mutate(percent=n*100/sum(n),n=NULL) %>%
pivot_wider(id_cols = `Peringkat.Akreditasi`,names_from = Data,values_from = percent)
## # A tibble: 3 × 3
## # Groups: Peringkat.Akreditasi [3]
## Peringkat.Akreditasi Analysis Assessment
## <fct> <dbl> <dbl>
## 1 A 79.9 20.1
## 2 B 80 20
## 3 C 79.9 20.1
#Membagi data dengan K-Fold Cross Validation
set.seed(345)
folds <- vfold_cv(akre, v = 10,strata = `Peringkat.Akreditasi` )
#Contoh visualisasi 100 amatan pertama di setiap folds
tidy(folds) %>%
filter(Row<101) %>%
ggplot(aes(x = Fold, y = Row,fill = Data)) +
geom_tile()+
theme_classic()
akre %>%
mutate(Row=seq(nrow(akre))) %>%
select(`Peringkat.Akreditasi`,Row) %>%
left_join(tidy(folds),by="Row")%>%
count(`Peringkat.Akreditasi`,Fold,Data) %>%
group_by(`Peringkat.Akreditasi`,Fold) %>%
mutate(percent=n*100/sum(n)) %>%
arrange(Fold,Data)
## # A tibble: 60 × 5
## # Groups: Peringkat.Akreditasi, Fold [30]
## Peringkat.Akreditasi Fold Data n percent
## <fct> <chr> <chr> <int> <dbl>
## 1 A Fold01 Analysis 134 89.9
## 2 B Fold01 Analysis 193 89.8
## 3 C Fold01 Analysis 156 89.7
## 4 A Fold01 Assessment 15 10.1
## 5 B Fold01 Assessment 22 10.2
## 6 C Fold01 Assessment 18 10.3
## 7 A Fold02 Analysis 134 89.9
## 8 B Fold02 Analysis 193 89.8
## 9 C Fold02 Analysis 156 89.7
## 10 A Fold02 Assessment 15 10.1
## # ℹ 50 more rows
###Mendefinisikan Model KNN
knn <- nearest_neighbor(neighbors=10,weight_func="rectangular",dist_power = 2) %>%
set_engine("kknn") %>%
set_mode("classification")
knn
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 10
## weight_func = rectangular
## dist_power = 2
##
## Computational engine: kknn
knn %>% translate()
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 10
## weight_func = rectangular
## dist_power = 2
##
## Computational engine: kknn
##
## Model fit template:
## kknn::train.kknn(formula = missing_arg(), data = missing_arg(),
## ks = min_rows(10, data, 5), kernel = "rectangular", distance = 2)
###Evaluasi Model
#dengan Hold Out Sample (Train test)
knn_hold <- knn %>%
fit(Peringkat.Akreditasi~ Lit_2023 + Num_2023 +Lit_2024 + Num_2024,data=train_data)
extract_fit_engine(knn_hold)
##
## Call:
## kknn::train.kknn(formula = Peringkat.Akreditasi ~ Lit_2023 + Num_2023 + Lit_2024 + Num_2024, data = data, ks = min_rows(10, data, 5), distance = ~2, kernel = ~"rectangular")
##
## Type of response variable: nominal
## Minimal misclassification: 0.4930233
## Best kernel: rectangular
## Best k: 10
pred_knn <- knn_hold %>%
predict(new_data = test_data) %>%
bind_cols(test_data %>% select(`Peringkat.Akreditasi`))
pred_knn %>%
slice_head(n=10)
## # A tibble: 10 × 2
## .pred_class Peringkat.Akreditasi
## <fct> <fct>
## 1 A A
## 2 B C
## 3 A A
## 4 A A
## 5 A C
## 6 A A
## 7 C C
## 8 A B
## 9 B C
## 10 B B
#Kemudian hasil prediksi tersebut akan kita evaluasi dengan menggunakan Confussion Matrix
confussion_matrix <- pred_knn %>%
conf_mat(truth=`Peringkat.Akreditasi`,estimate=.pred_class)
autoplot(confussion_matrix,type = "heatmap")+
scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
#Dari Confussion Matrix tersebut kita bisa mendapatkan beberapa metrik dengan fungsi summary
pred_knn %>%
conf_mat(truth=`Peringkat.Akreditasi`,estimate=.pred_class) %>%
summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.5
## 2 kap multiclass 0.240
## 3 sens macro 0.504
## 4 spec macro 0.744
## 5 ppv macro 0.505
## 6 npv macro 0.744
## 7 mcc multiclass 0.241
## 8 j_index macro 0.248
## 9 bal_accuracy macro 0.624
## 10 detection_prevalence macro 0.333
## 11 precision macro 0.505
## 12 recall macro 0.504
## 13 f_meas macro 0.502
#Kemudian kita evaluasi dapat juga langsung mendapatkan metrik tertentu dengan menggunakan accuracy.
pred_knn %>%
accuracy(truth=`Peringkat.Akreditasi`,estimate=.pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.5
##dengan K-fold Cross Validation (v=10)
no_prep <- recipe(`Peringkat.Akreditasi`~Lit_2023 + Num_2023 +Lit_2024 + Num_2024,data=akre)
wfst0 <- workflow_set(preproc = list(no_prep=no_prep),models = list(knn=knn))
knn_cv <- wfst0 %>%
workflow_map(fn = "fit_resamples",
verbose = TRUE,
seed = 2045,
resamples = folds,
metrics=metric_set(accuracy),
control=control_resamples(save_pred = TRUE)
)
## i 1 of 1 resampling: no_prep_knn
## ✔ 1 of 1 resampling: no_prep_knn (801ms)
collect_metrics(knn_cv)
## # A tibble: 1 × 9
## wflow_id .config preproc model .metric .estimator mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 no_prep_knn Preprocessor… recipe near… accura… multiclass 0.514 10 0.0168
confussion_matrix_cv <- knn_cv %>%
workflowsets::extract_workflow_set_result(id = "no_prep_knn") %>%
conf_mat_resampled(tidy = FALSE)
autoplot(confussion_matrix_cv,type = "heatmap")+
scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
confussion_matrix_cv %>%
summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.515
## 2 kap multiclass 0.260
## 3 sens macro 0.517
## 4 spec macro 0.750
## 5 ppv macro 0.526
## 6 npv macro 0.749
## 7 mcc multiclass 0.261
## 8 j_index macro 0.267
## 9 bal_accuracy macro 0.634
## 10 detection_prevalence macro 0.333
## 11 precision macro 0.526
## 12 recall macro 0.517
## 13 f_meas macro 0.520
###Hyperparameter Tuning untuk KNN
knn_tune <- nearest_neighbor(neighbors=tune(),weight_func="rectangular",dist_power = 2) %>%
set_engine("kknn") %>%
set_mode("classification")
knn_tune
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = tune()
## weight_func = rectangular
## dist_power = 2
##
## Computational engine: kknn
knn_grid <- tibble::tibble(neighbors = seq(5, 60, by = 2)) # ganjil/ genap sesuai selera
fitted_knn <- function(x){
mod <- extract_fit_engine(x)
fitted(mod)
}
##Kemudian kita mulai proses tuning dengan menggunakan bantuan tune_grid
wfst1 <- workflow_set(preproc = list(no_prep=no_prep),models = list(knn=knn_tune))
knn_tune_cv <- wfst1 %>%
workflow_map(fn = "tune_grid",
verbose = TRUE,
seed = 2045,
resamples = folds,
grid = knn_grid,
metrics=metric_set(accuracy),
control=control_resamples(save_pred = TRUE)
)
## i 1 of 1 tuning: no_prep_knn
## ✔ 1 of 1 tuning: no_prep_knn (3.9s)
best_neighbors <- knn_tune_cv %>%
extract_workflow_set_result("no_prep_knn") %>%
select_best(metric="accuracy")
best_neighbors_1se <- knn_tune_cv %>%
extract_workflow_set_result("no_prep_knn") %>%
select_by_one_std_err(metric="accuracy",desc(neighbors))
neighbors_result <- knn_tune_cv %>%
extract_workflow_set_result("no_prep_knn") %>%
collect_metrics()
neighbors_result
## # A tibble: 28 × 7
## neighbors .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 5 accuracy multiclass 0.505 10 0.0156 Preprocessor1_Model01
## 2 7 accuracy multiclass 0.529 10 0.0132 Preprocessor1_Model02
## 3 9 accuracy multiclass 0.529 10 0.0183 Preprocessor1_Model03
## 4 11 accuracy multiclass 0.524 10 0.0128 Preprocessor1_Model04
## 5 13 accuracy multiclass 0.511 10 0.0119 Preprocessor1_Model05
## 6 15 accuracy multiclass 0.509 10 0.0119 Preprocessor1_Model06
## 7 17 accuracy multiclass 0.516 10 0.0120 Preprocessor1_Model07
## 8 19 accuracy multiclass 0.531 10 0.0164 Preprocessor1_Model08
## 9 21 accuracy multiclass 0.528 10 0.0177 Preprocessor1_Model09
## 10 23 accuracy multiclass 0.530 10 0.0191 Preprocessor1_Model10
## # ℹ 18 more rows
neighbors_result %>%
# mutate(neighbors = factor(neighbors)) %>%
ggplot(aes(x=neighbors, y=mean)) +
geom_errorbar(aes(ymin=(mean-std_err),
ymax=(mean+std_err)))+
geom_point()+
geom_vline(aes(xintercept = best_neighbors$neighbors,color="Highest"),
linetype="dashed",linewidth=0.8)+
geom_vline(aes(xintercept = best_neighbors_1se$neighbors,
color="1-SE-Rule"
),
linetype="dashed",linewidth=0.8)+
ylab("Accuracy") +
scale_x_continuous(n.breaks = 12)+
scale_color_manual(values = c("#03A9F4","#f44e03"),
breaks = c("Highest","1-SE-Rule"),
name = "Selection"
)+
theme_bw()+
theme(legend.position = "top")
#berdasarkan nilai akurasi tertinggi
best_neighbors
## # A tibble: 1 × 2
## neighbors .config
## <dbl> <chr>
## 1 43 Preprocessor1_Model20
#berdasarkan one standard error rule (1-SE-Rule).
best_neighbors_1se
## # A tibble: 1 × 2
## neighbors .config
## <dbl> <chr>
## 1 59 Preprocessor1_Model28
#terapkan ke model dengan fungsi finalize_model
knn_opt <- knn_tune %>%
finalize_model(parameters = best_neighbors)
knn_opt
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 43
## weight_func = rectangular
## dist_power = 2
##
## Computational engine: kknn
#terapkan ke model dengan fungsi finalize_model
knn_opt1 <- knn_tune %>%
finalize_model(parameters = best_neighbors_1se)
knn_opt1
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 59
## weight_func = rectangular
## dist_power = 2
##
## Computational engine: kknn
###Mendefinisikan Model KNN
knn <- nearest_neighbor(neighbors= 59,weight_func="rectangular",dist_power = 2) %>%
set_engine("kknn") %>%
set_mode("classification")
knn
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 59
## weight_func = rectangular
## dist_power = 2
##
## Computational engine: kknn
knn %>% translate()
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 59
## weight_func = rectangular
## dist_power = 2
##
## Computational engine: kknn
##
## Model fit template:
## kknn::train.kknn(formula = missing_arg(), data = missing_arg(),
## ks = min_rows(59, data, 5), kernel = "rectangular", distance = 2)
###Evaluasi Model
#dengan Hold Out Sample (Train test)
knn_hold <- knn %>%
fit(Peringkat.Akreditasi~ Lit_2023 + Num_2023 +Lit_2024 + Num_2024,data=train_data)
extract_fit_engine(knn_hold)
##
## Call:
## kknn::train.kknn(formula = Peringkat.Akreditasi ~ Lit_2023 + Num_2023 + Lit_2024 + Num_2024, data = data, ks = min_rows(59, data, 5), distance = ~2, kernel = ~"rectangular")
##
## Type of response variable: nominal
## Minimal misclassification: 0.4604651
## Best kernel: rectangular
## Best k: 59
pred_knn <- knn_hold %>%
predict(new_data = test_data) %>%
bind_cols(test_data %>% select(`Peringkat.Akreditasi`))
pred_knn %>%
slice_head(n=10)
## # A tibble: 10 × 2
## .pred_class Peringkat.Akreditasi
## <fct> <fct>
## 1 A A
## 2 B C
## 3 B A
## 4 A A
## 5 A C
## 6 A A
## 7 C C
## 8 A B
## 9 B C
## 10 B B
#Kemudian hasil prediksi tersebut akan kita evaluasi dengan menggunakan Confussion Matrix
confussion_matrix <- pred_knn %>%
conf_mat(truth=`Peringkat.Akreditasi`,estimate=.pred_class)
autoplot(confussion_matrix,type = "heatmap")+
scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
confussion_matrix %>%
summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.602
## 2 kap multiclass 0.393
## 3 sens macro 0.601
## 4 spec macro 0.795
## 5 ppv macro 0.605
## 6 npv macro 0.799
## 7 mcc multiclass 0.397
## 8 j_index macro 0.396
## 9 bal_accuracy macro 0.698
## 10 detection_prevalence macro 0.333
## 11 precision macro 0.605
## 12 recall macro 0.601
## 13 f_meas macro 0.597