1 Load Library dan Data

pak::pak(c(
  "readr",
  "tidymodels",
  "dplyr",
  "MASS",
  "ggplot2",
  "caret",
  "pROC",
  "MLmetrics"
))
## ✔ Updated metadata database: 4.42 MB in 3 files.
## ℹ Updating metadata database✔ Updating metadata database ... done
##  
## ℹ No downloads are needed
## ✔ 8 pkgs + 108 deps: kept 89 [25.1s]
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

1.1 Baca data

stress_df <- read.csv2("C:/Users/ADVAN/OneDrive/Dokumen/ANMUL/StressLevelDataset.csv") 

2 Eksplorasi Dataset

2.1 Tabel ringkasan statistik deskriptif untuk variabel numerik

Ini adalah tahap Eksplorasi Data Awal (EDA). Tujuannya untuk memahami karakteristik dasar dari data. summary() memberikan ringkasan statistik (seperti rata-rata, median, nilai min/max) untuk kolom numerik, sementara table() menghitung frekuensi atau jumlah kemunculan setiap kategori pada variabel target (mental_health_history).

numeric_summary <- stress_df %>%
  select(where(is.numeric)) %>%
  summary()
print(numeric_summary)
##  anxiety_level    self_esteem    mental_health_history   depression   
##  Min.   : 0.00   Min.   : 0.00   Min.   :0.0000        Min.   : 0.00  
##  1st Qu.: 6.00   1st Qu.:11.00   1st Qu.:0.0000        1st Qu.: 6.00  
##  Median :11.00   Median :19.00   Median :0.0000        Median :12.00  
##  Mean   :11.06   Mean   :17.78   Mean   :0.4927        Mean   :12.56  
##  3rd Qu.:16.00   3rd Qu.:26.00   3rd Qu.:1.0000        3rd Qu.:19.00  
##  Max.   :21.00   Max.   :30.00   Max.   :1.0000        Max.   :27.00  
##     headache     blood_pressure  sleep_quality  breathing_problem
##  Min.   :0.000   Min.   :1.000   Min.   :0.00   Min.   :0.000    
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:2.000    
##  Median :3.000   Median :2.000   Median :2.50   Median :3.000    
##  Mean   :2.508   Mean   :2.182   Mean   :2.66   Mean   :2.754    
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.00   3rd Qu.:4.000    
##  Max.   :5.000   Max.   :3.000   Max.   :5.00   Max.   :5.000    
##   noise_level    living_conditions     safety       basic_needs   
##  Min.   :0.000   Min.   :0.000     Min.   :0.000   Min.   :0.000  
##  1st Qu.:2.000   1st Qu.:2.000     1st Qu.:2.000   1st Qu.:2.000  
##  Median :3.000   Median :2.000     Median :2.000   Median :3.000  
##  Mean   :2.649   Mean   :2.518     Mean   :2.737   Mean   :2.773  
##  3rd Qu.:3.000   3rd Qu.:3.000     3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000     Max.   :5.000   Max.   :5.000  
##  academic_performance   study_load    teacher_student_relationship
##  Min.   :0.000        Min.   :0.000   Min.   :0.000               
##  1st Qu.:2.000        1st Qu.:2.000   1st Qu.:2.000               
##  Median :2.000        Median :2.000   Median :2.000               
##  Mean   :2.773        Mean   :2.622   Mean   :2.648               
##  3rd Qu.:4.000        3rd Qu.:3.000   3rd Qu.:4.000               
##  Max.   :5.000        Max.   :5.000   Max.   :5.000               
##  future_career_concerns social_support  peer_pressure  
##  Min.   :0.000          Min.   :0.000   Min.   :0.000  
##  1st Qu.:1.000          1st Qu.:1.000   1st Qu.:2.000  
##  Median :2.000          Median :2.000   Median :2.000  
##  Mean   :2.649          Mean   :1.882   Mean   :2.735  
##  3rd Qu.:4.000          3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :5.000          Max.   :3.000   Max.   :5.000  
##  extracurricular_activities    bullying      stress_level   
##  Min.   :0.000              Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000              1st Qu.:1.000   1st Qu.:0.0000  
##  Median :2.500              Median :3.000   Median :1.0000  
##  Mean   :2.767              Mean   :2.617   Mean   :0.9964  
##  3rd Qu.:4.000              3rd Qu.:4.000   3rd Qu.:2.0000  
##  Max.   :5.000              Max.   :5.000   Max.   :2.0000

2.2 Tabel frekuensi untuk variabel target

Menampilkan jumlah orang yang mempunyai “mental health history” dan yang tidak mempunyai

cat("Distribusi kelas mental_health_history:\n")
## Distribusi kelas mental_health_history:
print(table(stress_df$mental_health_history))
## 
##   0   1 
## 558 542

2.3 Diagram Lingkaran

df_summary <- stress_df %>%
  count(mental_health_history, name = "jumlah") %>%
  mutate(
    persen = jumlah / sum(jumlah) * 100,
    label_teks = paste0(round(persen, 1), "%")
  )

ggplot(df_summary, aes(x = "", y = jumlah, fill = as.factor(mental_health_history))) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  
  geom_text(
    aes(label = label_teks), 
    color = "white", 
    size = 5,
    position = position_stack(vjust = 0.5) 
  ) +

  labs(
    title = "Distribusi Kelas mental_health_history",
    fill = "Kategori"
  ) +
  theme_void() + 
  theme(plot.title = element_text(hjust = 0.5, size = 16))

3 Preprocesing

3.1 Pemeriksaan Missing Value

colSums(is.na(stress_df))
##                anxiety_level                  self_esteem 
##                            0                            0 
##        mental_health_history                   depression 
##                            0                            0 
##                     headache               blood_pressure 
##                            0                            0 
##                sleep_quality            breathing_problem 
##                            0                            0 
##                  noise_level            living_conditions 
##                            0                            0 
##                       safety                  basic_needs 
##                            0                            0 
##         academic_performance                   study_load 
##                            0                            0 
## teacher_student_relationship       future_career_concerns 
##                            0                            0 
##               social_support                peer_pressure 
##                            0                            0 
##   extracurricular_activities                     bullying 
##                            0                            0 
##                 stress_level 
##                            0

3.2 cek data duplikat

sum(duplicated(stress_df))
## [1] 0

3.3 Target adalah faktor

stress_df$mental_health_history <- as.factor(stress_df$mental_health_history)

4 Uji Asumsi

4.1 Uji normalitas Shapiro-Wilk per kelas untuk semua variabel numerik (Jika p-value > 0.05 maka asumsi terpenuhi)

library(dplyr)
normality_results <- stress_df %>%
  group_by(mental_health_history) %>%
  summarise(across(where(is.numeric), ~ as.numeric(shapiro.test(.)$p.value))) %>%
  ungroup()
print(normality_results)
## # A tibble: 2 × 21
##   mental_health_history anxiety_level self_esteem depression headache
##   <fct>                         <dbl>       <dbl>      <dbl>    <dbl>
## 1 0                          1.09e-11    1.27e-22   5.45e-16 5.65e-27
## 2 1                          2.85e-14    9.35e-10   1.15e-10 2.76e-18
## # ℹ 16 more variables: blood_pressure <dbl>, sleep_quality <dbl>,
## #   breathing_problem <dbl>, noise_level <dbl>, living_conditions <dbl>,
## #   safety <dbl>, basic_needs <dbl>, academic_performance <dbl>,
## #   study_load <dbl>, teacher_student_relationship <dbl>,
## #   future_career_concerns <dbl>, social_support <dbl>, peer_pressure <dbl>,
## #   extracurricular_activities <dbl>, bullying <dbl>, stress_level <dbl>

4.2 Uji Homogenitas Kovarians (Jika p-value > 0.05 maka asumsi terpenuhi)

library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
boxm_result <- boxM(stress_df[, -which(names(stress_df) == "mental_health_history")], 
  grouping = stress_df$mental_health_history)
print(boxm_result)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  stress_df[, -which(names(stress_df) == "mental_health_history")]
## Chi-Sq (approx.) = 2019.9, df = 210, p-value < 2.2e-16

4.3 Cek Multikolinearitas (VIF > 5 atau 10 menunjukkan potensi multikolinearitas)

library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
vif_model <- lm(as.numeric(mental_health_history) ~ ., data = stress_df)
vif_result <- vif(vif_model)
print(vif_result)
##                anxiety_level                  self_esteem 
##                     3.204729                     3.309054 
##                   depression                     headache 
##                     3.094113                     2.557271 
##               blood_pressure                sleep_quality 
##                     3.696737                     3.128400 
##            breathing_problem                  noise_level 
##                     1.785444                     2.041261 
##            living_conditions                       safety 
##                     1.790423                     2.819680 
##                  basic_needs         academic_performance 
##                     2.682378                     2.746103 
##                   study_load teacher_student_relationship 
##                     1.913858                     3.214123 
##       future_career_concerns               social_support 
##                     3.413818                     5.746326 
##                peer_pressure   extracurricular_activities 
##                     2.617383                     2.542425 
##                     bullying                 stress_level 
##                     3.223843                     4.911613

4.4 Transformasi

Transformasi dilakukan karena pada ketiga uji asumsi tidak memenuhi

library(MASS)
library(dplyr)

Ambil semua variabel numerik

numeric_vars <- names(select(stress_df, where(is.numeric)))

Cek variabel yang seluruh nilainya positif (agar bisa pakai Box-Cox)

numeric_vars_pos <- numeric_vars[sapply(stress_df[numeric_vars], function(x) all(x > 0))]

Lakukan transformasi dan simpan hasilnya ke kolom baru

for (var_name in numeric_vars_pos) {
  model <- lm(as.formula(paste(var_name, "~ 1")), data = stress_df)
  bc <- boxcox(model, lambda = seq(-2, 2, 0.1), plotit = FALSE)
  lambda <- bc$x[which.max(bc$y)]
  
  new_var <- paste0(var_name, "_trans")
  
  stress_df[[new_var]] <- if (abs(lambda) < 0.1) {
    log(stress_df[[var_name]])
  } else {
    ((stress_df[[var_name]]^lambda) - 1) / lambda
  }
}

Cek struktur setelah transformasi

str(stress_df)
## 'data.frame':    1100 obs. of  22 variables:
##  $ anxiety_level               : int  14 15 12 16 16 20 4 17 13 6 ...
##  $ self_esteem                 : int  20 8 18 12 28 13 26 3 22 8 ...
##  $ mental_health_history       : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 1 2 2 1 ...
##  $ depression                  : int  11 15 14 15 7 21 6 22 12 27 ...
##  $ headache                    : int  2 5 2 4 2 3 1 4 3 4 ...
##  $ blood_pressure              : int  1 3 1 3 3 3 2 3 1 3 ...
##  $ sleep_quality               : int  2 1 2 1 5 1 4 1 2 1 ...
##  $ breathing_problem           : int  4 4 2 3 1 4 1 5 4 2 ...
##  $ noise_level                 : int  2 3 2 4 3 3 1 3 3 0 ...
##  $ living_conditions           : int  3 1 2 2 2 2 4 1 3 5 ...
##  $ safety                      : int  3 2 3 2 4 2 4 1 3 2 ...
##  $ basic_needs                 : int  2 2 2 2 3 1 4 1 3 2 ...
##  $ academic_performance        : int  3 1 2 2 4 2 5 1 3 2 ...
##  $ study_load                  : int  2 4 3 4 3 5 1 3 3 2 ...
##  $ teacher_student_relationship: int  3 1 3 1 1 2 4 2 2 1 ...
##  $ future_career_concerns      : int  3 5 2 4 2 5 1 4 3 5 ...
##  $ social_support              : int  2 1 2 1 1 1 3 1 3 1 ...
##  $ peer_pressure               : int  3 4 3 4 5 4 2 4 3 5 ...
##  $ extracurricular_activities  : int  3 5 2 4 0 4 2 4 2 3 ...
##  $ bullying                    : int  2 5 2 5 5 5 1 5 2 4 ...
##  $ stress_level                : int  1 2 1 2 1 2 0 2 1 1 ...
##  $ blood_pressure_trans        : num  0 2.28 0 2.28 2.28 ...

4.5 Cek Asumsi setelah transformasi

Ambil hanya kolom hasil transformasi

transformed_vars <- grep("_trans$", names(stress_df), value = TRUE)

4.6 Uji normalitas ulang berdasarkan hasil transformasi

normality_results_trans <- stress_df %>%
  group_by(mental_health_history) %>%
  summarise(across(all_of(transformed_vars), ~ as.numeric(shapiro.test(.)$p.value))) %>%
  ungroup()

Tampilkan hasil

print(normality_results_trans)
## # A tibble: 2 × 2
##   mental_health_history blood_pressure_trans
##   <fct>                                <dbl>
## 1 0                                 1.03e-25
## 2 1                                 1.47e-34

4.7 Standarisasi

Standarisasi dilakukan karena uji asumsi pada hasil transformasi tidak memenuhi Ambil nama variabel numerik

numeric_vars <- names(stress_df)[sapply(stress_df, is.numeric)]

Standarisasi (Z-score): (x - mean) / sd

stress_df_std <- stress_df
stress_df_std[numeric_vars] <- scale(stress_df[numeric_vars])

Cek ringkasan data setelah standarisasi

summary(stress_df_std[numeric_vars])
##  anxiety_level      self_esteem        depression          headache     
##  Min.   :-1.8085   Min.   :-1.9875   Min.   :-1.62488   Min.   :-1.780  
##  1st Qu.:-0.8277   1st Qu.:-0.7577   1st Qu.:-0.84838   1st Qu.:-1.070  
##  Median :-0.0104   Median : 0.1367   Median :-0.07188   Median : 0.349  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.000  
##  3rd Qu.: 0.8069   3rd Qu.: 0.9193   3rd Qu.: 0.83403   3rd Qu.: 0.349  
##  Max.   : 1.6242   Max.   : 1.3665   Max.   : 1.86936   Max.   : 1.768  
##  blood_pressure    sleep_quality     breathing_problem  noise_level     
##  Min.   :-1.4178   Min.   :-1.7179   Min.   :-1.9659   Min.   :-1.9946  
##  1st Qu.:-1.4178   1st Qu.:-1.0721   1st Qu.:-0.5380   1st Qu.:-0.4887  
##  Median :-0.2181   Median :-0.1033   Median : 0.1759   Median : 0.2642  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.9815   3rd Qu.: 0.8654   3rd Qu.: 0.8898   3rd Qu.: 0.2642  
##  Max.   : 0.9815   Max.   : 1.5113   Max.   : 1.6037   Max.   : 1.7701  
##  living_conditions     safety         basic_needs      academic_performance
##  Min.   :-2.2500   Min.   :-1.9466   Min.   :-1.9339   Min.   :-1.9601     
##  1st Qu.:-0.4630   1st Qu.:-0.5243   1st Qu.:-0.5390   1st Qu.:-0.5463     
##  Median :-0.4630   Median :-0.5243   Median : 0.1585   Median :-0.5463     
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000     
##  3rd Qu.: 0.4305   3rd Qu.: 0.8980   3rd Qu.: 0.8560   3rd Qu.: 0.8676     
##  Max.   : 2.2175   Max.   : 1.6091   Max.   : 1.5534   Max.   : 1.5745     
##    study_load      teacher_student_relationship future_career_concerns
##  Min.   :-1.9926   Min.   :-1.9126              Min.   :-1.7321       
##  1st Qu.:-0.4726   1st Qu.:-0.4681              1st Qu.:-1.0783       
##  Median :-0.4726   Median :-0.4681              Median :-0.4244       
##  Mean   : 0.0000   Mean   : 0.0000              Mean   : 0.0000       
##  3rd Qu.: 0.2874   3rd Qu.: 0.9763              3rd Qu.: 0.8833       
##  Max.   : 1.8074   Max.   : 1.6986              Max.   : 1.5372       
##  social_support    peer_pressure     extracurricular_activities
##  Min.   :-1.7959   Min.   :-1.9186   Min.   :-1.9521           
##  1st Qu.:-0.8416   1st Qu.:-0.5154   1st Qu.:-0.5413           
##  Median : 0.1128   Median :-0.5154   Median :-0.1885           
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000           
##  3rd Qu.: 1.0671   3rd Qu.: 0.8879   3rd Qu.: 0.8696           
##  Max.   : 1.0671   Max.   : 1.5895   Max.   : 1.5750           
##     bullying        stress_level       blood_pressure_trans
##  Min.   :-1.7096   Min.   :-1.212604   Min.   :-1.3950     
##  1st Qu.:-1.0564   1st Qu.:-1.212604   1st Qu.:-1.3950     
##  Median : 0.2500   Median : 0.004426   Median :-0.2624     
##  Mean   : 0.0000   Mean   : 0.000000   Mean   : 0.0000     
##  3rd Qu.: 0.9032   3rd Qu.: 1.221455   3rd Qu.: 0.9945     
##  Max.   : 1.5564   Max.   : 1.221455   Max.   : 0.9945

4.8 Cek Asumsi Setelah Standarisasi

library(dplyr)

normality_results_std <- stress_df_std %>%
  group_by(mental_health_history) %>%
  summarise(across(where(is.numeric), ~ as.numeric(shapiro.test(.)$p.value))) %>%
  ungroup()

print(normality_results_std)
## # A tibble: 2 × 22
##   mental_health_history anxiety_level self_esteem depression headache
##   <fct>                         <dbl>       <dbl>      <dbl>    <dbl>
## 1 0                          1.09e-11    1.27e-22   5.45e-16 5.65e-27
## 2 1                          2.85e-14    9.35e-10   1.15e-10 2.76e-18
## # ℹ 17 more variables: blood_pressure <dbl>, sleep_quality <dbl>,
## #   breathing_problem <dbl>, noise_level <dbl>, living_conditions <dbl>,
## #   safety <dbl>, basic_needs <dbl>, academic_performance <dbl>,
## #   study_load <dbl>, teacher_student_relationship <dbl>,
## #   future_career_concerns <dbl>, social_support <dbl>, peer_pressure <dbl>,
## #   extracurricular_activities <dbl>, bullying <dbl>, stress_level <dbl>,
## #   blood_pressure_trans <dbl>

5 Bangun Model

5.1 Membagi data menjadi training (80%) dan testing (20%)

Data dibagi secara acak menjadi dua bagian: data training (80%) untuk “mengajari” model, dan data testing (20%) untuk menguji seberapa baik model tersebut. set.seed(123) memastikan pembagian acak ini bisa diulang dengan hasil yang sama persis.

set.seed(123)  # untuk reproduksibilitas
sample_indices <- sample(1:nrow(stress_df), size = 0.8 * nrow(stress_df))
train_data <- stress_df[sample_indices, ]
test_data <- stress_df[-sample_indices, ]

5.2 Latih model LDA

Model ini belajar bagaimana membedakan atau mengklasifikasikan riwayat kesehatan mental berdasarkan semua variabel lainnya.

model_lda <- lda(mental_health_history ~ ., data = train_data)

5.3 Prediksi

Setelah model dilatih, langkah ini menggunakan model tersebut untuk membuat prediksi pada data testing.

pred <- predict(model_lda, newdata = test_data)
pred_class <- pred$class
pred_prob <- pred$posterior[, "1"]

6 Evaluasi Model

6.1 Confusion Matrix

conf_mat <- confusionMatrix(pred$class, test_data$mental_health_history)
print(conf_mat)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 87 22
##          1 31 80
##                                         
##                Accuracy : 0.7591        
##                  95% CI : (0.697, 0.814)
##     No Information Rate : 0.5364        
##     P-Value [Acc > NIR] : 7.44e-12      
##                                         
##                   Kappa : 0.5185        
##                                         
##  Mcnemar's Test P-Value : 0.2718        
##                                         
##             Sensitivity : 0.7373        
##             Specificity : 0.7843        
##          Pos Pred Value : 0.7982        
##          Neg Pred Value : 0.7207        
##              Prevalence : 0.5364        
##          Detection Rate : 0.3955        
##    Detection Prevalence : 0.4955        
##       Balanced Accuracy : 0.7608        
##                                         
##        'Positive' Class : 0             
## 

6.2 F1 Score

f1_score <- conf_mat$byClass["F1"]
print(paste("F1 Score:", round(f1_score, 3)))
## [1] "F1 Score: 0.767"

6.3 ROC Curve dan AUC

roc_obj <- roc(test_data$mental_health_history, pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_value <- auc(roc_obj)
print(paste("AUC:", round(auc_value, 3)))
## [1] "AUC: 0.898"

6.4 Plot ROC Curve

plot(roc_obj, col = "blue", main = "ROC Curve for LDA")
abline( lty = 2, col = "gray")

6.5 Lihat koefisien fungsi diskriminan

cat("Koefisien Fungsi Diskriminan:\n")
## Koefisien Fungsi Diskriminan:
print(model_lda$scaling)
##                                        LD1
## anxiety_level                  0.043840497
## self_esteem                   -0.022793770
## depression                     0.017900312
## headache                       0.117222571
## blood_pressure               -12.741239334
## sleep_quality                 -0.071174254
## breathing_problem              0.001219558
## noise_level                   -0.032637614
## living_conditions             -0.126332274
## safety                         0.042581912
## basic_needs                   -0.108916418
## academic_performance          -0.076949609
## study_load                     0.093826369
## teacher_student_relationship   0.002119362
## future_career_concerns         0.088489148
## social_support                 0.345600541
## peer_pressure                  0.055225552
## extracurricular_activities     0.038676001
## bullying                       0.151356481
## stress_level                  -0.071094364
## blood_pressure_trans          11.326474025

6.6 Tampilkan ringkasan performa

cat("\nAkurasi Model:\n")
## 
## Akurasi Model:
print(conf_mat$overall["Accuracy"])
##  Accuracy 
## 0.7590909
cat("\nF1 Score:\n")
## 
## F1 Score:
print(round(f1_score, 3))
##    F1 
## 0.767
cat("\nAUC Score:\n")
## 
## AUC Score:
print(round(auc_value, 3))
## [1] 0.898