1 library
## Loading required package: ggplot2
## Loading required package: lattice
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## Warning: package 'iml' was built under R version 4.4.3
2 data preporation
2.1 wilayah sulawesi tenggara
https://docs.google.com/document/d/1_JLASE7oz14lXtFWylf1Vl05Wf-4nameoMuxNyIGbvY/edit?usp=sharing
2.2 data agregasi
2.3 variabel dan unit analisis
2.4 recoding
data2 <- data1 %>%
mutate(
pengangguran = case_when(
(R10A == 2 & R10B == 2 & R10C == 2 & R11 == 2 & R38A == 1) ~ 1, #Tidak bekerja dan aktif mencari pekerjaan
(R38B == 1) ~ 1, # mempersiapkan usaha
(R38A == 2 & R38B == 2 & R42A %in% c(1, 2, 3)) ~ 1, # sudah diterima,punyausaha blm mulai dan putus asa
TRUE ~ 0
)
)
sum(is.na(data2$pengangguran))
# unit analisis final yaitu umur 15-24 dan pendidikan terakhir d1-s1
data3 <- data2 %>% filter(K10 >= 15 & K10 <= 24,R6A %in% c(7,8,9))
data_finale <- data3 %>%
mutate(
wilayah = factor(
if_else(KLASIFIKAS == 1, 0L, 1L), # 1=perkotaan
levels = c(0, 1),
labels = c("Perkotaan", "Perdesaan")
),
jk = factor(
if_else(K4 == 1, 0L, 1L), # 1=laki-laki
levels = c(0, 1),
labels = c("Laki-laki", "Perempuan")
),
umur = K10,
pen_trak = factor(
if_else(R6A == 9, 0L, 1L), #9=S1
levels = c(0, 1),
labels = c("S1","D1-D4")
),
sertif = factor(
if_else(R6E == 1, 1L, 0L), # 1=ya
levels = c(0, 1),
labels = c("tidak sertif", "sertif")
),
magang_pkl = factor(
if_else(R6I_S_B == "B", 1L, 0L),
levels = c(0, 1),
labels = c("selainnya", "saat kuliah")
),
pengangguran = factor(
if_else(pengangguran == 1, 1L, 0L), #1=ya
levels = c(0, 1),
labels = c("tidak pengangguran", "pengangguran")
),
prakerja = factor(
if_else(R51G == 1, 1L, 0L), #1=ya
levels = c(0, 1),
labels = c("selainnya", "prakerja")
)) %>%
select(WEIGHT,PSU,STRATA,KODE_KAB,
wilayah,jk,umur,pen_trak,sertif,magang_pkl,prakerja,
pengangguran)
##n mengecek missing values
colSums(is.na(data_finale))2.5 design penangguran standar
desain <- svydesign(
ids = ~PSU,
strata = ~STRATA,
weights = ~WEIGHT,
data = data2,
nest = TRUE
)
# proporsi pengangguran
svymean(~pengangguran, desain)
# proporsi pengangguran 15-24
# Subset desain untuk usia 15–24 tahun
desain_15_24 <- subset(desain, K10 >= 15 & K10 <= 24)
# Hitung proporsi pengangguran
svymean(~pengangguran, desain_15_24)2.6 statistik deskriptif
desain_f <- svydesign(
ids = ~PSU,
strata = ~STRATA,
weights = ~WEIGHT,
data = data_finale,
nest = TRUE
)
# Buat daftar variabel kategorik yang ingin dibandingkan
vars_kat <- c("wilayah", "jk", "pen_trak", "sertif", "magang_pkl", "prakerja")
# Loop: buat svytable pengangguran vs masing-masing variabel
for (var in vars_kat) {
cat("\n==========\nTabel: pengangguran vs", var, "\n")
tab <- svytable(as.formula(paste("~pengangguran +", var)), desain_f)
print(tab)
# Jika ingin proporsi berdasarkan kategori (kolom)
prop <- prop.table(tab, margin = 2)
cat("\nProporsi per kategori (kolom):\n")
print(round(prop, 2))
}3 machine learning method
3.1 split data
3.2 logistik
set.seed(1) # supaya hasil bisa direproduksi
glm.fits <- glm(pengangguran~umur+wilayah+jk+pen_trak+sertif+magang_pkl+prakerja
,data=train_data,family = binomial)
summary(glm.fits)3.2.1 kebaikan model
# Prediksi probabilitas pada data training dan testing
train_probs <- predict(glm.fits, newdata = train_data, type = "response")
test_probs <- predict(glm.fits, newdata = test_data, type = "response")
levels(train_data$pengangguran) # Harus: "tidak pengangguran", "pengangguran"
# Prediksi kelas: threshold 0.5, gunakan label asli
train_pred <- ifelse(train_probs > 0.5, "pengangguran", "tidak pengangguran")
test_pred <- ifelse(test_probs > 0.5, "pengangguran", "tidak pengangguran")
# Jadikan faktor dan sesuaikan level-nya
train_pred <- factor(train_pred, levels = levels(train_data$pengangguran))
test_pred <- factor(test_pred, levels = levels(test_data$pengangguran))
# Confusion matrix dan metrik untuk training
cm_train <- confusionMatrix(train_pred, train_data$pengangguran, positive = "pengangguran")
print(cm_train)
# Confusion matrix dan metrik untuk testing
cm_test <- confusionMatrix(test_pred, test_data$pengangguran, positive = "pengangguran")
print(cm_test)
# Ambil nilai sensitifitas, spesifisitas, dan akurasi dari confusionMatrix
metrics_train <- cm_train$byClass[c("Sensitivity", "Specificity")]
metrics_train["Accuracy"] <- cm_train$overall["Accuracy"]
metrics_test <- cm_test$byClass[c("Sensitivity", "Specificity")]
metrics_test["Accuracy"] <- cm_test$overall["Accuracy"]
# Hitung F1-score manual: 2 * (Precision * Recall) / (Precision + Recall)
f1_train <- 2 * (cm_train$byClass["Pos Pred Value"] * cm_train$byClass["Sensitivity"]) /
(cm_train$byClass["Pos Pred Value"] + cm_train$byClass["Sensitivity"])
f1_test <- 2 * (cm_test$byClass["Pos Pred Value"] * cm_test$byClass["Sensitivity"]) /
(cm_test$byClass["Pos Pred Value"] + cm_test$byClass["Sensitivity"])
metrics_train["F1"] <- f1_train
metrics_test["F1"] <- f1_test
# Tampilkan ringkasan metrik
metrics_train
metrics_test3.3 naive bayes
set.seed(1) # supaya hasil bisa direproduksi
nb.fit <- naiveBayes(pengangguran~umur+wilayah+jk+pen_trak+sertif+magang_pkl+
prakerja,data=train_data)
nb.fit3.3.1 kebaikan model
# Prediksi kelas pada data training dan testing
train_pred_nb <- predict(nb.fit, newdata = train_data)
test_pred_nb <- predict(nb.fit, newdata = test_data)
# Pastikan level prediksi sesuai dengan level target
train_pred_nb <- factor(train_pred_nb, levels = levels(train_data$pengangguran))
test_pred_nb <- factor(test_pred_nb, levels = levels(test_data$pengangguran))
# Confusion matrix dan metrik untuk training
cm_train_nb <- confusionMatrix(train_pred_nb, train_data$pengangguran, positive = "pengangguran")
print(cm_train_nb)
# Confusion matrix dan metrik untuk testing
cm_test_nb <- confusionMatrix(test_pred_nb, test_data$pengangguran, positive = "pengangguran")
print(cm_test_nb)
# Ambil nilai sensitifitas, spesifisitas, dan akurasi dari confusionMatrix
metrics_train_nb <- cm_train_nb$byClass[c("Sensitivity", "Specificity")]
metrics_train_nb["Accuracy"] <- cm_train_nb$overall["Accuracy"]
metrics_test_nb <- cm_test_nb$byClass[c("Sensitivity", "Specificity")]
metrics_test_nb["Accuracy"] <- cm_test_nb$overall["Accuracy"]
# Hitung F1-score manual
f1_train_nb <- 2 * (cm_train_nb$byClass["Pos Pred Value"] * cm_train_nb$byClass["Sensitivity"]) /
(cm_train_nb$byClass["Pos Pred Value"] + cm_train_nb$byClass["Sensitivity"])
f1_test_nb <- 2 * (cm_test_nb$byClass["Pos Pred Value"] * cm_test_nb$byClass["Sensitivity"]) /
(cm_test_nb$byClass["Pos Pred Value"] + cm_test_nb$byClass["Sensitivity"])
metrics_train_nb["F1"] <- f1_train_nb
metrics_test_nb["F1"] <- f1_test_nb
# Tampilkan ringkasan metrik
metrics_train_nb
metrics_test_nb3.4 random forest
(7)^0.5 = 2,64
set.seed(1) # supaya hasil bisa direproduksi
rf.model <-randomForest(pengangguran~umur+wilayah+jk+pen_trak+sertif+magang_pkl+prakerja,data=train_data, mtry = 3, importance = TRUE)
rf.model3.4.1 kebaikan model
# Prediksi dengan memastikan level faktor sama
train_pred_rf <- predict(rf.model, train_data)
test_pred_rf <- predict(rf.model, test_data)
# Samakan level faktor prediksi dengan level target
train_pred_rf <- factor(train_pred_rf, levels = levels(train_data$pengangguran))
test_pred_rf <- factor(test_pred_rf, levels = levels(test_data$pengangguran))
# Evaluasi dengan positive class "pengangguran"
cm_train_rf <- confusionMatrix(train_pred_rf, train_data$pengangguran, positive = "pengangguran")
cm_test_rf <- confusionMatrix(test_pred_rf, test_data$pengangguran, positive = "pengangguran")
# Gunakan fungsi robust untuk menghitung metrik
get_metrics <- function(cm) {
c(
Accuracy = cm$overall["Accuracy"],
Sensitivity = cm$byClass["Sensitivity"],
Specificity = cm$byClass["Specificity"],
F1 = cm$byClass["F1"] # F1 sudah dihitung oleh confusionMatrix
)
}
metrics_train_rf <- get_metrics(cm_train_rf)
metrics_test_rf <- get_metrics(cm_test_rf)
metrics_train_rf
metrics_test_rf3.4.3 shap values
# Buat list kosong untuk menyimpan hasil
shapley_list <- list()
set.seed(1) # supaya hasil sampling bisa direproduksi
# Ambil sampel acak 100 baris dari train_data
sample_indices <- sample(nrow(train_data), 500)
shapley_list <- list()
for (i in seq_along(sample_indices)) {
idx <- sample_indices[i]
shap <- Shapley$new(
predictor,
x.interest = train_data[idx,c("umur","wilayah","jk","pen_trak","sertif","magang_pkl","prakerja")],
sample.size = 50
)
shapley_list[[i]] <- shap$results
}
all_results <- bind_rows(shapley_list, .id = "observation_id")
avg_phi <- all_results %>% filter(class=="Up") %>%
group_by(feature) %>%
summarise(mean_phi = mean(phi))
avg_phi
ggplot(avg_phi, aes(x = reorder(feature, mean_phi), y = mean_phi)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(
title = "Rata-rata Nilai Shapley per Fitur",
x = "Fitur",
y = "Rata-rata Nilai Shapley (phi)"
) +
theme_minimal()