pak::pak(c(
"readr",
"tidymodels",
"dplyr",
"MASS",
"ggplot2",
"caret",
"pROC",
"MLmetrics"
))
## ✔ Updated metadata database: 4.42 MB in 3 files.
## ℹ Updating metadata database✔ Updating metadata database ... done
##
## ℹ No downloads are needed
## ✔ 8 pkgs + 108 deps: kept 89 [25.1s]
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
stress_df <- read.csv2("C:/Users/ADVAN/OneDrive/Dokumen/ANMUL/StressLevelDataset.csv")
Ini adalah tahap Eksplorasi Data Awal (EDA). Tujuannya untuk memahami karakteristik dasar dari data. summary() memberikan ringkasan statistik (seperti rata-rata, median, nilai min/max) untuk kolom numerik, sementara table() menghitung frekuensi atau jumlah kemunculan setiap kategori pada variabel target (mental_health_history).
numeric_summary <- stress_df %>%
select(where(is.numeric)) %>%
summary()
print(numeric_summary)
## anxiety_level self_esteem mental_health_history depression
## Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. : 0.00
## 1st Qu.: 6.00 1st Qu.:11.00 1st Qu.:0.0000 1st Qu.: 6.00
## Median :11.00 Median :19.00 Median :0.0000 Median :12.00
## Mean :11.06 Mean :17.78 Mean :0.4927 Mean :12.56
## 3rd Qu.:16.00 3rd Qu.:26.00 3rd Qu.:1.0000 3rd Qu.:19.00
## Max. :21.00 Max. :30.00 Max. :1.0000 Max. :27.00
## headache blood_pressure sleep_quality breathing_problem
## Min. :0.000 Min. :1.000 Min. :0.00 Min. :0.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:2.000
## Median :3.000 Median :2.000 Median :2.50 Median :3.000
## Mean :2.508 Mean :2.182 Mean :2.66 Mean :2.754
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :5.000 Max. :3.000 Max. :5.00 Max. :5.000
## noise_level living_conditions safety basic_needs
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :2.000 Median :2.000 Median :3.000
## Mean :2.649 Mean :2.518 Mean :2.737 Mean :2.773
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## academic_performance study_load teacher_student_relationship
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.000
## Mean :2.773 Mean :2.622 Mean :2.648
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## future_career_concerns social_support peer_pressure
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.000
## Mean :2.649 Mean :1.882 Mean :2.735
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :5.000 Max. :3.000 Max. :5.000
## extracurricular_activities bullying stress_level
## Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :2.500 Median :3.000 Median :1.0000
## Mean :2.767 Mean :2.617 Mean :0.9964
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.0000
## Max. :5.000 Max. :5.000 Max. :2.0000
Menampilkan jumlah orang yang mempunyai “mental health history” dan yang tidak mempunyai
cat("Distribusi kelas mental_health_history:\n")
## Distribusi kelas mental_health_history:
print(table(stress_df$mental_health_history))
##
## 0 1
## 558 542
df_summary <- stress_df %>%
count(mental_health_history, name = "jumlah") %>%
mutate(
persen = jumlah / sum(jumlah) * 100,
label_teks = paste0(round(persen, 1), "%")
)
ggplot(df_summary, aes(x = "", y = jumlah, fill = as.factor(mental_health_history))) +
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") +
geom_text(
aes(label = label_teks),
color = "white",
size = 5,
position = position_stack(vjust = 0.5)
) +
labs(
title = "Distribusi Kelas mental_health_history",
fill = "Kategori"
) +
theme_void() +
theme(plot.title = element_text(hjust = 0.5, size = 16))
colSums(is.na(stress_df))
## anxiety_level self_esteem
## 0 0
## mental_health_history depression
## 0 0
## headache blood_pressure
## 0 0
## sleep_quality breathing_problem
## 0 0
## noise_level living_conditions
## 0 0
## safety basic_needs
## 0 0
## academic_performance study_load
## 0 0
## teacher_student_relationship future_career_concerns
## 0 0
## social_support peer_pressure
## 0 0
## extracurricular_activities bullying
## 0 0
## stress_level
## 0
sum(duplicated(stress_df))
## [1] 0
stress_df$mental_health_history <- as.factor(stress_df$mental_health_history)
library(dplyr)
normality_results <- stress_df %>%
group_by(mental_health_history) %>%
summarise(across(where(is.numeric), ~ as.numeric(shapiro.test(.)$p.value))) %>%
ungroup()
print(normality_results)
## # A tibble: 2 × 21
## mental_health_history anxiety_level self_esteem depression headache
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 0 1.09e-11 1.27e-22 5.45e-16 5.65e-27
## 2 1 2.85e-14 9.35e-10 1.15e-10 2.76e-18
## # ℹ 16 more variables: blood_pressure <dbl>, sleep_quality <dbl>,
## # breathing_problem <dbl>, noise_level <dbl>, living_conditions <dbl>,
## # safety <dbl>, basic_needs <dbl>, academic_performance <dbl>,
## # study_load <dbl>, teacher_student_relationship <dbl>,
## # future_career_concerns <dbl>, social_support <dbl>, peer_pressure <dbl>,
## # extracurricular_activities <dbl>, bullying <dbl>, stress_level <dbl>
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
boxm_result <- boxM(stress_df[, -which(names(stress_df) == "mental_health_history")],
grouping = stress_df$mental_health_history)
print(boxm_result)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: stress_df[, -which(names(stress_df) == "mental_health_history")]
## Chi-Sq (approx.) = 2019.9, df = 210, p-value < 2.2e-16
Transformasi dilakukan karena pada ketiga uji asumsi tidak memenuhi
library(MASS)
library(dplyr)
Ambil semua variabel numerik
numeric_vars <- names(select(stress_df, where(is.numeric)))
Cek variabel yang seluruh nilainya positif (agar bisa pakai Box-Cox)
numeric_vars_pos <- numeric_vars[sapply(stress_df[numeric_vars], function(x) all(x > 0))]
Lakukan transformasi dan simpan hasilnya ke kolom baru
for (var_name in numeric_vars_pos) {
model <- lm(as.formula(paste(var_name, "~ 1")), data = stress_df)
bc <- boxcox(model, lambda = seq(-2, 2, 0.1), plotit = FALSE)
lambda <- bc$x[which.max(bc$y)]
new_var <- paste0(var_name, "_trans")
stress_df[[new_var]] <- if (abs(lambda) < 0.1) {
log(stress_df[[var_name]])
} else {
((stress_df[[var_name]]^lambda) - 1) / lambda
}
}
Cek struktur setelah transformasi
str(stress_df)
## 'data.frame': 1100 obs. of 22 variables:
## $ anxiety_level : int 14 15 12 16 16 20 4 17 13 6 ...
## $ self_esteem : int 20 8 18 12 28 13 26 3 22 8 ...
## $ mental_health_history : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 1 2 2 1 ...
## $ depression : int 11 15 14 15 7 21 6 22 12 27 ...
## $ headache : int 2 5 2 4 2 3 1 4 3 4 ...
## $ blood_pressure : int 1 3 1 3 3 3 2 3 1 3 ...
## $ sleep_quality : int 2 1 2 1 5 1 4 1 2 1 ...
## $ breathing_problem : int 4 4 2 3 1 4 1 5 4 2 ...
## $ noise_level : int 2 3 2 4 3 3 1 3 3 0 ...
## $ living_conditions : int 3 1 2 2 2 2 4 1 3 5 ...
## $ safety : int 3 2 3 2 4 2 4 1 3 2 ...
## $ basic_needs : int 2 2 2 2 3 1 4 1 3 2 ...
## $ academic_performance : int 3 1 2 2 4 2 5 1 3 2 ...
## $ study_load : int 2 4 3 4 3 5 1 3 3 2 ...
## $ teacher_student_relationship: int 3 1 3 1 1 2 4 2 2 1 ...
## $ future_career_concerns : int 3 5 2 4 2 5 1 4 3 5 ...
## $ social_support : int 2 1 2 1 1 1 3 1 3 1 ...
## $ peer_pressure : int 3 4 3 4 5 4 2 4 3 5 ...
## $ extracurricular_activities : int 3 5 2 4 0 4 2 4 2 3 ...
## $ bullying : int 2 5 2 5 5 5 1 5 2 4 ...
## $ stress_level : int 1 2 1 2 1 2 0 2 1 1 ...
## $ blood_pressure_trans : num 0 2.28 0 2.28 2.28 ...
Ambil hanya kolom hasil transformasi
transformed_vars <- grep("_trans$", names(stress_df), value = TRUE)
normality_results_trans <- stress_df %>%
group_by(mental_health_history) %>%
summarise(across(all_of(transformed_vars), ~ as.numeric(shapiro.test(.)$p.value))) %>%
ungroup()
Tampilkan hasil
print(normality_results_trans)
## # A tibble: 2 × 2
## mental_health_history blood_pressure_trans
## <fct> <dbl>
## 1 0 1.03e-25
## 2 1 1.47e-34
Standarisasi dilakukan karena uji asumsi pada hasil transformasi tidak memenuhi Ambil nama variabel numerik
numeric_vars <- names(stress_df)[sapply(stress_df, is.numeric)]
Standarisasi (Z-score): (x - mean) / sd
stress_df_std <- stress_df
stress_df_std[numeric_vars] <- scale(stress_df[numeric_vars])
Cek ringkasan data setelah standarisasi
summary(stress_df_std[numeric_vars])
## anxiety_level self_esteem depression headache
## Min. :-1.8085 Min. :-1.9875 Min. :-1.62488 Min. :-1.780
## 1st Qu.:-0.8277 1st Qu.:-0.7577 1st Qu.:-0.84838 1st Qu.:-1.070
## Median :-0.0104 Median : 0.1367 Median :-0.07188 Median : 0.349
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.000
## 3rd Qu.: 0.8069 3rd Qu.: 0.9193 3rd Qu.: 0.83403 3rd Qu.: 0.349
## Max. : 1.6242 Max. : 1.3665 Max. : 1.86936 Max. : 1.768
## blood_pressure sleep_quality breathing_problem noise_level
## Min. :-1.4178 Min. :-1.7179 Min. :-1.9659 Min. :-1.9946
## 1st Qu.:-1.4178 1st Qu.:-1.0721 1st Qu.:-0.5380 1st Qu.:-0.4887
## Median :-0.2181 Median :-0.1033 Median : 0.1759 Median : 0.2642
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.9815 3rd Qu.: 0.8654 3rd Qu.: 0.8898 3rd Qu.: 0.2642
## Max. : 0.9815 Max. : 1.5113 Max. : 1.6037 Max. : 1.7701
## living_conditions safety basic_needs academic_performance
## Min. :-2.2500 Min. :-1.9466 Min. :-1.9339 Min. :-1.9601
## 1st Qu.:-0.4630 1st Qu.:-0.5243 1st Qu.:-0.5390 1st Qu.:-0.5463
## Median :-0.4630 Median :-0.5243 Median : 0.1585 Median :-0.5463
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4305 3rd Qu.: 0.8980 3rd Qu.: 0.8560 3rd Qu.: 0.8676
## Max. : 2.2175 Max. : 1.6091 Max. : 1.5534 Max. : 1.5745
## study_load teacher_student_relationship future_career_concerns
## Min. :-1.9926 Min. :-1.9126 Min. :-1.7321
## 1st Qu.:-0.4726 1st Qu.:-0.4681 1st Qu.:-1.0783
## Median :-0.4726 Median :-0.4681 Median :-0.4244
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.2874 3rd Qu.: 0.9763 3rd Qu.: 0.8833
## Max. : 1.8074 Max. : 1.6986 Max. : 1.5372
## social_support peer_pressure extracurricular_activities
## Min. :-1.7959 Min. :-1.9186 Min. :-1.9521
## 1st Qu.:-0.8416 1st Qu.:-0.5154 1st Qu.:-0.5413
## Median : 0.1128 Median :-0.5154 Median :-0.1885
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.0671 3rd Qu.: 0.8879 3rd Qu.: 0.8696
## Max. : 1.0671 Max. : 1.5895 Max. : 1.5750
## bullying stress_level blood_pressure_trans
## Min. :-1.7096 Min. :-1.212604 Min. :-1.3950
## 1st Qu.:-1.0564 1st Qu.:-1.212604 1st Qu.:-1.3950
## Median : 0.2500 Median : 0.004426 Median :-0.2624
## Mean : 0.0000 Mean : 0.000000 Mean : 0.0000
## 3rd Qu.: 0.9032 3rd Qu.: 1.221455 3rd Qu.: 0.9945
## Max. : 1.5564 Max. : 1.221455 Max. : 0.9945
library(dplyr)
normality_results_std <- stress_df_std %>%
group_by(mental_health_history) %>%
summarise(across(where(is.numeric), ~ as.numeric(shapiro.test(.)$p.value))) %>%
ungroup()
print(normality_results_std)
## # A tibble: 2 × 22
## mental_health_history anxiety_level self_esteem depression headache
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 0 1.09e-11 1.27e-22 5.45e-16 5.65e-27
## 2 1 2.85e-14 9.35e-10 1.15e-10 2.76e-18
## # ℹ 17 more variables: blood_pressure <dbl>, sleep_quality <dbl>,
## # breathing_problem <dbl>, noise_level <dbl>, living_conditions <dbl>,
## # safety <dbl>, basic_needs <dbl>, academic_performance <dbl>,
## # study_load <dbl>, teacher_student_relationship <dbl>,
## # future_career_concerns <dbl>, social_support <dbl>, peer_pressure <dbl>,
## # extracurricular_activities <dbl>, bullying <dbl>, stress_level <dbl>,
## # blood_pressure_trans <dbl>
Data dibagi secara acak menjadi dua bagian: data training (80%) untuk “mengajari” model, dan data testing (20%) untuk menguji seberapa baik model tersebut. set.seed(123) memastikan pembagian acak ini bisa diulang dengan hasil yang sama persis.
set.seed(123) # untuk reproduksibilitas
sample_indices <- sample(1:nrow(stress_df), size = 0.8 * nrow(stress_df))
train_data <- stress_df[sample_indices, ]
test_data <- stress_df[-sample_indices, ]
Model ini belajar bagaimana membedakan atau mengklasifikasikan riwayat kesehatan mental berdasarkan semua variabel lainnya.
model_lda <- lda(mental_health_history ~ ., data = train_data)
Setelah model dilatih, langkah ini menggunakan model tersebut untuk membuat prediksi pada data testing.
pred <- predict(model_lda, newdata = test_data)
pred_class <- pred$class
pred_prob <- pred$posterior[, "1"]
conf_mat <- confusionMatrix(pred$class, test_data$mental_health_history)
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 87 22
## 1 31 80
##
## Accuracy : 0.7591
## 95% CI : (0.697, 0.814)
## No Information Rate : 0.5364
## P-Value [Acc > NIR] : 7.44e-12
##
## Kappa : 0.5185
##
## Mcnemar's Test P-Value : 0.2718
##
## Sensitivity : 0.7373
## Specificity : 0.7843
## Pos Pred Value : 0.7982
## Neg Pred Value : 0.7207
## Prevalence : 0.5364
## Detection Rate : 0.3955
## Detection Prevalence : 0.4955
## Balanced Accuracy : 0.7608
##
## 'Positive' Class : 0
##
f1_score <- conf_mat$byClass["F1"]
print(paste("F1 Score:", round(f1_score, 3)))
## [1] "F1 Score: 0.767"
roc_obj <- roc(test_data$mental_health_history, pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_value <- auc(roc_obj)
print(paste("AUC:", round(auc_value, 3)))
## [1] "AUC: 0.898"
plot(roc_obj, col = "blue", main = "ROC Curve for LDA")
abline( lty = 2, col = "gray")
cat("Koefisien Fungsi Diskriminan:\n")
## Koefisien Fungsi Diskriminan:
print(model_lda$scaling)
## LD1
## anxiety_level 0.043840497
## self_esteem -0.022793770
## depression 0.017900312
## headache 0.117222571
## blood_pressure -12.741239334
## sleep_quality -0.071174254
## breathing_problem 0.001219558
## noise_level -0.032637614
## living_conditions -0.126332274
## safety 0.042581912
## basic_needs -0.108916418
## academic_performance -0.076949609
## study_load 0.093826369
## teacher_student_relationship 0.002119362
## future_career_concerns 0.088489148
## social_support 0.345600541
## peer_pressure 0.055225552
## extracurricular_activities 0.038676001
## bullying 0.151356481
## stress_level -0.071094364
## blood_pressure_trans 11.326474025
cat("\nAkurasi Model:\n")
##
## Akurasi Model:
print(conf_mat$overall["Accuracy"])
## Accuracy
## 0.7590909
cat("\nF1 Score:\n")
##
## F1 Score:
print(round(f1_score, 3))
## F1
## 0.767
cat("\nAUC Score:\n")
##
## AUC Score:
print(round(auc_value, 3))
## [1] 0.898