ORDINAL LOGISTIC REGRESSION

Baca file dataset

cat("\nMemuat Data\n")
## 
## Memuat Data
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
data <- read_csv("D:/Kuliah Semester 4/Analisis Multivariat/stress_detection_data.csv")
## Rows: 773 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): Gender, Occupation, Marital_Status, Smoking_Habit, Meditation_Pra...
## dbl  (13): Age, Sleep_Duration, Sleep_Quality, Physical_Activity, Screen_Tim...
## time  (2): Wake_Up_Time, Bed_Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cat("Dimensi data:", dim(data), "\n")
## Dimensi data: 773 22

Cek isi data

data
## # A tibble: 773 × 22
##      Age Gender Occupation        Marital_Status Sleep_Duration Sleep_Quality
##    <dbl> <chr>  <chr>             <chr>                   <dbl>         <dbl>
##  1    30 Male   Software Engineer Single                    7               4
##  2    35 Female Marketing Manager Married                   6               3
##  3    40 Male   Data Scientist    Divorced                  7               4
##  4    35 Male   Software Engineer Single                    7               4
##  5    29 Female Teacher           Single                    8               5
##  6    45 Male   Doctor            Married                   6               3
##  7    32 Female Graphic Designer  Single                    7               4
##  8    37 Male   Civil Engineer    Married                   7.5             4
##  9    50 Male   Business Owner    Married                   5.5             2
## 10    40 Female Nurse             Divorced                  6               3
## # ℹ 763 more rows
## # ℹ 16 more variables: Wake_Up_Time <time>, Bed_Time <time>,
## #   Physical_Activity <dbl>, Screen_Time <dbl>, Caffeine_Intake <dbl>,
## #   Alcohol_Intake <dbl>, Smoking_Habit <chr>, Work_Hours <dbl>,
## #   Travel_Time <dbl>, Social_Interactions <dbl>, Meditation_Practice <chr>,
## #   Exercise_Type <chr>, Blood_Pressure <dbl>, Cholesterol_Level <dbl>,
## #   Blood_Sugar_Level <dbl>, Stress_Detection <chr>

Siapkan library

library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(rlang)

Identifikasi tipe data

numeric_vars <- names(data)[sapply(data, is.numeric)]
categorical_vars <- names(data)[sapply(data, is.character)]

cat("\nVariabel numerik:\n")
## 
## Variabel numerik:
print(numeric_vars)
##  [1] "Age"                 "Sleep_Duration"      "Sleep_Quality"      
##  [4] "Physical_Activity"   "Screen_Time"         "Caffeine_Intake"    
##  [7] "Alcohol_Intake"      "Work_Hours"          "Travel_Time"        
## [10] "Social_Interactions" "Blood_Pressure"      "Cholesterol_Level"  
## [13] "Blood_Sugar_Level"
cat("\nVariabel kategorikal:\n")
## 
## Variabel kategorikal:
print(categorical_vars)
## [1] "Gender"              "Occupation"          "Marital_Status"     
## [4] "Smoking_Habit"       "Meditation_Practice" "Exercise_Type"      
## [7] "Stress_Detection"

Plot histogram variabel numerik

cat("\nVisualisasi Histogram (Numerik)\n")
## 
## Visualisasi Histogram (Numerik)
for (var in numeric_vars) {
  p <- ggplot(data, aes(x = !!sym(var))) +
    geom_histogram(fill = "skyblue", color = "black", bins = 30) +
    labs(title = paste("Distribusi:", var), x = var, y = "Frekuensi") +
    theme_minimal()
  print(p)
}

Plot bar chart variabel kategorikal

cat("\nVisualisasi Bar Plot (Kategorikal)\n")
## 
## Visualisasi Bar Plot (Kategorikal)
for (var in categorical_vars) {
  if (length(unique(data[[var]])) < 20) {  # batasi yang kategorinya terlalu banyak
    p <- ggplot(data, aes(x = !!sym(var))) +
      geom_bar(fill = "salmon", color = "black") +
      labs(title = paste("Distribusi:", var), x = var, y = "Jumlah") +
      theme_minimal()
    print(p)
  } else {
    cat(paste("Lewati visualisasi", var, "- terlalu banyak kategori.\n"))
  }
}

## Lewati visualisasi Occupation - terlalu banyak kategori.

Statistik deskriptif / EDA variabell numerik dan kategorik

# Statistik Deskriptif untuk variabel numerik
cat("\nStatistik Deskriptif (Numerik):\n")
## 
## Statistik Deskriptif (Numerik):
print(summary(data[numeric_vars]))
##       Age        Sleep_Duration  Sleep_Quality   Physical_Activity
##  Min.   :18.00   Min.   :3.500   Min.   :2.000   Min.   :1.000    
##  1st Qu.:33.00   1st Qu.:6.000   1st Qu.:3.600   1st Qu.:2.000    
##  Median :39.00   Median :6.300   Median :3.900   Median :3.000    
##  Mean   :38.89   Mean   :6.338   Mean   :3.848   Mean   :2.979    
##  3rd Qu.:45.00   3rd Qu.:7.000   3rd Qu.:4.000   3rd Qu.:4.000    
##  Max.   :60.00   Max.   :8.000   Max.   :5.000   Max.   :5.000    
##   Screen_Time    Caffeine_Intake Alcohol_Intake     Work_Hours    
##  Min.   :2.000   Min.   :0.000   Min.   :0.0000   Min.   : 6.000  
##  1st Qu.:4.000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 8.000  
##  Median :4.000   Median :2.000   Median :1.0000   Median : 8.000  
##  Mean   :4.105   Mean   :1.819   Mean   :0.8887   Mean   : 8.259  
##  3rd Qu.:5.000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.: 9.000  
##  Max.   :8.000   Max.   :4.000   Max.   :2.0000   Max.   :14.000  
##   Travel_Time    Social_Interactions Blood_Pressure  Cholesterol_Level
##  Min.   :0.500   Min.   :1.000       Min.   :110.0   Min.   :150.0    
##  1st Qu.:2.000   1st Qu.:3.000       1st Qu.:130.0   1st Qu.:210.0    
##  Median :3.000   Median :3.000       Median :140.0   Median :220.0    
##  Mean   :2.858   Mean   :3.197       Mean   :137.9   Mean   :220.8    
##  3rd Qu.:4.000   3rd Qu.:4.000       3rd Qu.:150.0   3rd Qu.:230.0    
##  Max.   :5.000   Max.   :5.000       Max.   :170.0   Max.   :290.0    
##  Blood_Sugar_Level
##  Min.   : 80.0    
##  1st Qu.:105.0    
##  Median :115.0    
##  Mean   :111.8    
##  3rd Qu.:120.0    
##  Max.   :150.0
# Frekuensi untuk variabel kategorikal
cat("\nDistribusi Kategorikal:\n")
## 
## Distribusi Kategorikal:
for (var in categorical_vars) {
  cat(paste0("\n", var, ":\n"))
  print(table(data[[var]]))
}
## 
## Gender:
## 
## Female   Male 
##    384    389 
## 
## Occupation:
## 
##             Account Manager                  Accountant 
##                           1                          11 
##                       Actor       Advertising Executive 
##                           4                           1 
##         Advertising Manager                   Architect 
##                           1                          17 
##                      Artist                       Baker 
##                           7                           1 
##                Bakery Owner                Bank Manager 
##                           2                           3 
##                      Banker                   Bartender 
##                           3                           2 
##                   Biologist                  Blacksmith 
##                           1                           3 
##               Brand Manager                  Bus Driver 
##                           1                           2 
##            Business Analyst         Business Consultant 
##                           8                           4 
##              Business Owner                   Carpenter 
##                           1                           5 
##                         CEO                        Chef 
##                           1                          26 
##              Civil Engineer               Civil Servant 
##                          12                           1 
##                     Cleaner                     Cobbler 
##                           1                           2 
##       Construction Engineer        Construction Manager 
##                           1                           6 
##         Construction Worker                  Consultant 
##                          14                           3 
##             Content Creator          Content Strategist 
##                           2                           1 
##              Content Writer                  Copywriter 
##                           6                           3 
##                     Courier            Customer Support 
##                           1                           3 
##                Data Analyst               Data Engineer 
##                          10                           1 
##              Data Scientist      Database Administrator 
##                          10                           1 
##             Delivery Driver                     Dentist 
##                           1                           3 
##                    Designer                   Developer 
##                           2                           1 
##            Digital Marketer                      Doctor 
##                           2                           8 
##                      Driver                      Editor 
##                           4                           2 
##         Electrical Engineer       Electrical Technician 
##                           3                           1 
##                 Electrician                    Engineer 
##                          17                          11 
##                Entrepreneur           Event Coordinator 
##                           3                           1 
##               Event Manager               Event Planner 
##                           1                           6 
##          Executive Director              Factory Worker 
##                           1                           1 
##                      Farmer            Fashion Designer 
##                           8                           8 
##           Financial Advisor           Financial Analyst 
##                           2                           7 
##           Financial Planner                 Firefighter 
##                           1                           1 
##                 Fisherwoman          Fitness Instructor 
##                           2                           1 
##             Fitness Trainer               Flower Seller 
##                           4                           2 
##                  Freelancer            Graphic Designer 
##                           5                          18 
##                Hair Stylist           Handicrafts Maker 
##                           1                           2 
##        Healthcare Assistant                HR Executive 
##                           1                           2 
##                  HR Manager               HR Specialist 
##                          13                           8 
##             Human Resources     Human Resources Manager 
##                           3                           1 
##             Insurance Agent           Interior Designer 
##                           1                           7 
##               IT Consultant                  IT Manager 
##                           6                           1 
##               IT Specialist                  IT Support 
##                           2                           1 
##       IT Support Specialist                     Janitor 
##                           1                           1 
##                  Journalist       Laboratory Technician 
##                          16                           1 
##                      Lawyer                   Librarian 
##                           7                           7 
##                     Manager          Marketing Director 
##                           1                           1 
##         Marketing Executive           Marketing Manager 
##                           8                          11 
##        Marketing Specialist                    Mechanic 
##                           7                           8 
##         Mechanical Engineer           Medical Assistant 
##                           4                           1 
##                    Musician                       Nanny 
##                           3                           2 
##       Network Administrator            Network Engineer 
##                           2                           1 
##                       Nurse          Nurse Practitioner 
##                          23                           1 
##      Nutritional Specialist                Nutritionist 
##                           1                           1 
##          Operations Manager                     Painter 
##                           3                           2 
##            Personal Trainer                  Pharmacist 
##                           1                           6 
##                Photographer                   Physician 
##                          22                           4 
##                   Physicist             Physiotherapist 
##                           1                           2 
##                       Pilot                     Plumber 
##                           1                           9 
##              Police Officer                      Potter 
##                           6                           2 
##      Primary School Teacher            Product Designer 
##                           2                           1 
##             Product Manager             Program Manager 
##                           3                           1 
##         Project Coordinator             Project Manager 
##                           1                          15 
##                Psychologist Public Relations Specialist 
##                           6                           4 
##           Real Estate Agent                Receptionist 
##                           7                           1 
##            Research Analyst          Research Assistant 
##                           1                           3 
##          Research Scientist                  Researcher 
##                           5                           2 
##          Restaurant Manager              Retail Manager 
##                           2                           4 
##               Retail Worker                     Retired 
##                           2                           1 
##             Sales Executive               Sales Manager 
##                           1                           9 
##        Sales Representative                 Salesperson 
##                           6                           1 
##                   Scientist                  Seamstress 
##                          17                           2 
##                   Secretary              Security Guard 
##                           2                           3 
##            Security Officer              SEO Specialist 
##                           1                           5 
##                  Shopkeeper               Social Worker 
##                           2                           5 
##          Software Architect          Software Developer 
##                           2                          15 
##           Software Engineer             Software Tester 
##                          16                           2 
##               Street Vendor                     Student 
##                           2                           3 
##                     Surgeon                      Tailor 
##                           1                           5 
##                 Taxi Driver                     Teacher 
##                           2                          37 
##                  Technician                   Therapist 
##                           1                           1 
##                Truck Driver                 UX Designer 
##                           9                           1 
##            Vegetable Vendor                Veterinarian 
##                           2                           8 
##                    Waitress            Warehouse Worker 
##                           3                           1 
##                      Weaver               Web Developer 
##                           2                          14 
##                      Writer 
##                           5 
## 
## Marital_Status:
## 
## Divorced  Married   Single 
##       60      360      353 
## 
## Smoking_Habit:
## 
##  No Yes 
## 358 415 
## 
## Meditation_Practice:
## 
##  No Yes 
## 292 481 
## 
## Exercise_Type:
## 
##          Aerobics            Cardio        Meditation           Pilates 
##                 5               215                24                88 
## Strength Training           Walking              Yoga 
##               245                 5               191 
## 
## Stress_Detection:
## 
##   High    Low Medium 
##    301    162    310

Korelasi antar variabel numerik

cat("\nKorelasi Variabel Numerik:\n")
## 
## Korelasi Variabel Numerik:
cor_matrix <- cor(data[numeric_vars], use = "complete.obs")
print(round(cor_matrix, 2))
##                       Age Sleep_Duration Sleep_Quality Physical_Activity
## Age                  1.00          -0.17          0.07              0.29
## Sleep_Duration      -0.17           1.00          0.21             -0.19
## Sleep_Quality        0.07           0.21          1.00              0.10
## Physical_Activity    0.29          -0.19          0.10              1.00
## Screen_Time          0.35          -0.25          0.02              0.72
## Caffeine_Intake      0.33          -0.25          0.09              0.73
## Alcohol_Intake       0.25          -0.24          0.15              0.59
## Work_Hours           0.25           0.24          0.01              0.12
## Travel_Time          0.24          -0.32         -0.07              0.68
## Social_Interactions  0.10           0.15          0.33              0.09
## Blood_Pressure       0.40          -0.31          0.07              0.66
## Cholesterol_Level    0.46          -0.29          0.15              0.59
## Blood_Sugar_Level    0.50          -0.22          0.16              0.61
##                     Screen_Time Caffeine_Intake Alcohol_Intake Work_Hours
## Age                        0.35            0.33           0.25       0.25
## Sleep_Duration            -0.25           -0.25          -0.24       0.24
## Sleep_Quality              0.02            0.09           0.15       0.01
## Physical_Activity          0.72            0.73           0.59       0.12
## Screen_Time                1.00            0.68           0.54       0.31
## Caffeine_Intake            0.68            1.00           0.65       0.08
## Alcohol_Intake             0.54            0.65           1.00      -0.03
## Work_Hours                 0.31            0.08          -0.03       1.00
## Travel_Time                0.50            0.72           0.57      -0.11
## Social_Interactions        0.10           -0.08          -0.02       0.45
## Blood_Pressure             0.61            0.74           0.61      -0.05
## Cholesterol_Level          0.62            0.59           0.45       0.24
## Blood_Sugar_Level          0.64            0.63           0.47       0.23
##                     Travel_Time Social_Interactions Blood_Pressure
## Age                        0.24                0.10           0.40
## Sleep_Duration            -0.32                0.15          -0.31
## Sleep_Quality             -0.07                0.33           0.07
## Physical_Activity          0.68                0.09           0.66
## Screen_Time                0.50                0.10           0.61
## Caffeine_Intake            0.72               -0.08           0.74
## Alcohol_Intake             0.57               -0.02           0.61
## Work_Hours                -0.11                0.45          -0.05
## Travel_Time                1.00               -0.19           0.74
## Social_Interactions       -0.19                1.00          -0.21
## Blood_Pressure             0.74               -0.21           1.00
## Cholesterol_Level          0.52                0.09           0.81
## Blood_Sugar_Level          0.57                0.03           0.83
##                     Cholesterol_Level Blood_Sugar_Level
## Age                              0.46              0.50
## Sleep_Duration                  -0.29             -0.22
## Sleep_Quality                    0.15              0.16
## Physical_Activity                0.59              0.61
## Screen_Time                      0.62              0.64
## Caffeine_Intake                  0.59              0.63
## Alcohol_Intake                   0.45              0.47
## Work_Hours                       0.24              0.23
## Travel_Time                      0.52              0.57
## Social_Interactions              0.09              0.03
## Blood_Pressure                   0.81              0.83
## Cholesterol_Level                1.00              0.94
## Blood_Sugar_Level                0.94              1.00
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)

Cek missing value

cat("\nCek Missing Values:\n")
## 
## Cek Missing Values:
colSums(is.na(data))
##                 Age              Gender          Occupation      Marital_Status 
##                   0                   0                   0                   0 
##      Sleep_Duration       Sleep_Quality        Wake_Up_Time            Bed_Time 
##                   0                   0                   0                   0 
##   Physical_Activity         Screen_Time     Caffeine_Intake      Alcohol_Intake 
##                   0                   0                   0                   0 
##       Smoking_Habit          Work_Hours         Travel_Time Social_Interactions 
##                   0                   0                   0                   0 
## Meditation_Practice       Exercise_Type      Blood_Pressure   Cholesterol_Level 
##                   0                   0                   0                   0 
##   Blood_Sugar_Level    Stress_Detection 
##                   0                   0

Cek outlier

cat("\nBoxplot untuk Deteksi Outlier:\n")
## 
## Boxplot untuk Deteksi Outlier:
for (var in numeric_vars) {
  p <- ggplot(data, aes(y = !!sym(var))) +
    geom_boxplot(fill = "lightgreen") +
    labs(title = paste("Boxplot:", var), y = var) +
    theme_minimal()
  print(p)
}

Cek distribusi target

cat("\nDistribusi Target Variable (Stress_Detection):\n")
## 
## Distribusi Target Variable (Stress_Detection):
print(table(data$Stress_Detection))
## 
##   High    Low Medium 
##    301    162    310
barplot(table(data$Stress_Detection), col = "orange", main = "Distribusi Stress Detection")

Normalisasi

# Fungsi normalisasi min-max
normalize <- function(x) {
  return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
}

# Normalisasi semua variabel numerik
data[numeric_vars] <- lapply(data[numeric_vars], normalize)

# Cek hasil normalisasi (optional)
summary(data[numeric_vars])
##       Age         Sleep_Duration   Sleep_Quality    Physical_Activity
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.3571   1st Qu.:0.5556   1st Qu.:0.5333   1st Qu.:0.2500   
##  Median :0.5000   Median :0.6222   Median :0.6333   Median :0.5000   
##  Mean   :0.4973   Mean   :0.6308   Mean   :0.6160   Mean   :0.4948   
##  3rd Qu.:0.6429   3rd Qu.:0.7778   3rd Qu.:0.6667   3rd Qu.:0.7500   
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##   Screen_Time     Caffeine_Intake  Alcohol_Intake     Work_Hours    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3333   1st Qu.:0.2500   1st Qu.:0.0000   1st Qu.:0.2500  
##  Median :0.3333   Median :0.5000   Median :0.5000   Median :0.2500  
##  Mean   :0.3509   Mean   :0.4547   Mean   :0.4444   Mean   :0.2823  
##  3rd Qu.:0.5000   3rd Qu.:0.5000   3rd Qu.:0.5000   3rd Qu.:0.3750  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##   Travel_Time     Social_Interactions Blood_Pressure   Cholesterol_Level
##  Min.   :0.0000   Min.   :0.0000      Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.3333   1st Qu.:0.5000      1st Qu.:0.3333   1st Qu.:0.4286   
##  Median :0.5556   Median :0.5000      Median :0.5000   Median :0.5000   
##  Mean   :0.5241   Mean   :0.5492      Mean   :0.4657   Mean   :0.5060   
##  3rd Qu.:0.7778   3rd Qu.:0.7500      3rd Qu.:0.6667   3rd Qu.:0.5714   
##  Max.   :1.0000   Max.   :1.0000      Max.   :1.0000   Max.   :1.0000   
##  Blood_Sugar_Level
##  Min.   :0.0000   
##  1st Qu.:0.3571   
##  Median :0.5000   
##  Mean   :0.4538   
##  3rd Qu.:0.5714   
##  Max.   :1.0000

Encoding (factor)

categorical_vars_except_target <- setdiff(c("Gender", "Occupation", "Marital_Status", "Smoking_Habit", "Meditation_Practice", "Exercise_Type"), "Stress_Detection")

for (var in categorical_vars_except_target) {
  data[[var]] <- as.factor(data[[var]])
}

Validasi target

data$Stress_Level <- factor(data$Stress_Detection,
                           levels = c("Low", "Medium", "High"),
                           ordered = TRUE)

Load library uji dan model

library(MASS)    # untuk polr
## Warning: package 'MASS' was built under R version 4.4.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(car)     # untuk VIF
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(brant)   # untuk Brant test
## Warning: package 'brant' was built under R version 4.4.3

Inisialisasi variabel predictor

predictors <- c(numeric_vars, categorical_vars_except_target)

Load Model

formula_olr <- as.formula(
  paste("Stress_Level ~", paste(predictors, collapse = " + "))
)

Uji Asumsi

Uji Homogenitas Levene Test

# Langkah ini HARUS dilakukan dulu:
data_reduced <- data[, !(names(data) %in% c("Occupation", "Cholesterol_Level", "Blood_Sugar_Level"))]

# Ambil variabel numerik selain target
numeric_vars <- names(Filter(is.numeric, data_reduced))
numeric_vars <- setdiff(numeric_vars, "Stress_Level")

# Inisialisasi dataframe kosong untuk menyimpan hasil
levene_results <- data.frame(
  Variabel = character(),
  Df_Group = integer(),
  Df_Residual = integer(),
  F_value = numeric(),
  p_value = numeric(),
  Significance = character(),
  stringsAsFactors = FALSE
)

# Loop tiap variabel numerik dan simpan hasil ke dalam dataframe
for (var in numeric_vars) {
  formula_levene <- as.formula(paste(var, "~ Stress_Level"))
  test_result <- leveneTest(formula_levene, data = data_reduced)
  
  # Ambil nilai dari hasil test
  df_group <- test_result[1, "Df"]
  df_residual <- test_result[2, "Df"]
  f_value <- test_result[1, "F value"]
  p_value <- test_result[1, "Pr(>F)"]
  
  # Tentukan simbol signifikansi
  sig <- if (p_value < 0.001) {
    "***"
  } else if (p_value < 0.01) {
    "**"
  } else if (p_value < 0.05) {
    "*"
  } else if (p_value < 0.1) {
    "."
  } else {
    ""
  }
  
  # Tambahkan baris ke dataframe
  levene_results <- rbind(levene_results, data.frame(
    Variabel = var,
    Df_Group = df_group,
    Df_Residual = df_residual,
    F_value = round(f_value, 4),
    p_value = signif(p_value, 5),
    Significance = sig
  ))
}

print(levene_results)
##               Variabel Df_Group Df_Residual F_value    p_value Significance
## 1                  Age        2         770  1.7085 1.8182e-01             
## 2       Sleep_Duration        2         770  2.3866 9.2618e-02            .
## 3        Sleep_Quality        2         770  8.6998 1.8355e-04          ***
## 4    Physical_Activity        2         770  4.1469 1.6168e-02            *
## 5          Screen_Time        2         770 11.7555 9.3554e-06          ***
## 6      Caffeine_Intake        2         770 14.0041 1.0619e-06          ***
## 7       Alcohol_Intake        2         770 22.0597 4.8305e-10          ***
## 8           Work_Hours        2         770  0.0326 9.6789e-01             
## 9          Travel_Time        2         770  6.4970 1.5919e-03           **
## 10 Social_Interactions        2         770 10.1524 4.4461e-05          ***
## 11      Blood_Pressure        2         770  6.4483 1.6701e-03           **

Uji Multikolinearitas VIF

formula_lm <- as.formula(
  paste("as.numeric(Stress_Level) ~", paste(predictors, collapse = " + "))
)
model_lm <- lm(formula_lm, data = data)

cat("\nUji Multikolinearitas (VIF):\n")
## 
## Uji Multikolinearitas (VIF):
vif_values <- vif(model_lm)
print(vif_values)
##                            GVIF  Df GVIF^(1/(2*Df))
## Age                    1.909720   1        1.381926
## Sleep_Duration         2.196792   1        1.482158
## Sleep_Quality          2.005374   1        1.416112
## Physical_Activity      4.937866   1        2.222131
## Screen_Time            4.408499   1        2.099643
## Caffeine_Intake        4.999134   1        2.235874
## Alcohol_Intake         3.039647   1        1.743458
## Work_Hours             3.178136   1        1.782733
## Travel_Time            4.686248   1        2.164774
## Social_Interactions    2.823614   1        1.680361
## Blood_Pressure        11.127018   1        3.335718
## Cholesterol_Level     17.341192   1        4.164276
## Blood_Sugar_Level     16.584033   1        4.072350
## Gender                 2.140902   1        1.463182
## Occupation          2768.354669 168        1.023870
## Marital_Status         2.741720   2        1.286784
## Smoking_Habit          1.832087   1        1.353546
## Meditation_Practice    2.004254   1        1.415717
## Exercise_Type         26.452024   6        1.313827

Penanganan Multikolinearitas

vars_check <- c("Blood_Pressure", "Cholesterol_Level", "Blood_Sugar_Level")

data_num <- data[vars_check]

cor_matrix_check <- cor(data_num, use = "complete.obs")

print(round(cor_matrix_check, 2))
##                   Blood_Pressure Cholesterol_Level Blood_Sugar_Level
## Blood_Pressure              1.00              0.81              0.83
## Cholesterol_Level           0.81              1.00              0.94
## Blood_Sugar_Level           0.83              0.94              1.00
# Visualisasi korelasi dengan corrplot
library(corrplot)
corrplot(cor_matrix_check, method = "number", type = "upper")

# Pastikan dplyr sudah di-load (tidak wajib jika pakai base R)
library(dplyr)

# Hapus kolom Occupation, Cholesterol_Level, dan Blood_Sugar_Level dari data
data_reduced <- data[, !(names(data) %in% c("Occupation", "Cholesterol_Level", "Blood_Sugar_Level"))]

# Update variabel prediktor, hapus 'Occupation', 'Cholesterol_Level', dan 'Blood_Sugar_Level'
predictors_reduced <- setdiff(predictors, c("Occupation", "Cholesterol_Level", "Blood_Sugar_Level"))

# Buat formula baru untuk model OLR dan linear (cek VIF)
formula_olr_reduced <- as.formula(
  paste("Stress_Level ~", paste(predictors_reduced, collapse = " + "))
)

formula_lm_reduced <- as.formula(
  paste("as.numeric(Stress_Level) ~", paste(predictors_reduced, collapse = " + "))
)

Uji Model

# Model linear untuk cek VIF
model_lm_reduced <- lm(formula_lm_reduced, data = data_reduced)
cat("\nVIF setelah hapus Occupation, Cholesterol_Level, dan Blood_Sugar_Level:\n")
## 
## VIF setelah hapus Occupation, Cholesterol_Level, dan Blood_Sugar_Level:
print(vif(model_lm_reduced))
##                         GVIF Df GVIF^(1/(2*Df))
## Age                 1.458957  1        1.207873
## Sleep_Duration      1.445326  1        1.202217
## Sleep_Quality       1.468714  1        1.211905
## Physical_Activity   3.585626  1        1.893575
## Screen_Time         3.132329  1        1.769839
## Caffeine_Intake     3.739988  1        1.933905
## Alcohol_Intake      2.244304  1        1.498100
## Work_Hours          2.145756  1        1.464840
## Travel_Time         3.611850  1        1.900487
## Social_Interactions 2.017837  1        1.420506
## Blood_Pressure      4.002524  1        2.000631
## Gender              1.335309  1        1.155556
## Marital_Status      1.509696  2        1.108466
## Smoking_Habit       1.350746  1        1.162216
## Meditation_Practice 1.436075  1        1.198364
## Exercise_Type       2.517047  6        1.079960
# Model OLR tanpa Occupation, Cholesterol_Level, dan Blood_Sugar_Level
model_olr_reduced <- polr(formula_olr_reduced, data = data_reduced, Hess = TRUE)
cat("\nSummary Model OLR tanpa Occupation, Cholesterol_Level, dan Blood_Sugar_Level:\n")
## 
## Summary Model OLR tanpa Occupation, Cholesterol_Level, dan Blood_Sugar_Level:
print(summary(model_olr_reduced))
## Call:
## polr(formula = formula_olr_reduced, data = data_reduced, Hess = TRUE)
## 
## Coefficients:
##                                   Value Std. Error t value
## Age                            -0.53885     0.4973 -1.0835
## Sleep_Duration                 -2.58445     0.5705 -4.5303
## Sleep_Quality                  -0.85584     0.5129 -1.6686
## Physical_Activity              -1.09287     0.7423 -1.4723
## Screen_Time                     2.39654     0.9909  2.4186
## Caffeine_Intake                 0.18292     0.6982  0.2620
## Alcohol_Intake                  0.79914     0.3609  2.2145
## Work_Hours                      4.90542     0.8924  5.4966
## Travel_Time                     0.08261     0.6105  0.1353
## Social_Interactions             1.29990     0.5309  2.4486
## Blood_Pressure                  5.17643     0.7355  7.0382
## GenderMale                     -0.36254     0.1777 -2.0404
## Marital_StatusMarried           0.09306     0.3185  0.2922
## Marital_StatusSingle           -0.29668     0.3193 -0.9292
## Smoking_HabitYes                0.26530     0.1753  1.5136
## Meditation_PracticeYes          0.63085     0.1832  3.4434
## Exercise_TypeCardio            -2.28877     0.9860 -2.3212
## Exercise_TypeMeditation        -0.86557     1.0738 -0.8061
## Exercise_TypePilates           -2.39004     0.9890 -2.4167
## Exercise_TypeStrength Training -1.64813     0.9837 -1.6754
## Exercise_TypeWalking           -2.51186     1.5769 -1.5929
## Exercise_TypeYoga              -2.11336     0.9830 -2.1499
## 
## Intercepts:
##             Value   Std. Error t value
## Low|Medium  -0.9444  1.1366    -0.8310
## Medium|High  1.7433  1.1361     1.5345
## 
## Residual Deviance: 1203.024 
## AIC: 1251.024

Uji Brant Test

library(brant)

brant_result <- brant(model_olr_reduced)  # model tanpa Occupation
## -------------------------------------------------------------------- 
## Test for             X2  df  probability 
## -------------------------------------------------------------------- 
## Omnibus                  43.38   22  0
## Age                  0.38    1   0.54
## Sleep_Duration               0.11    1   0.74
## Sleep_Quality                1.52    1   0.22
## Physical_Activity            2.3 1   0.13
## Screen_Time              0   1   0.97
## Caffeine_Intake          0.52    1   0.47
## Alcohol_Intake               0.17    1   0.68
## Work_Hours               5.89    1   0.02
## Travel_Time              0.29    1   0.59
## Social_Interactions          0.02    1   0.89
## Blood_Pressure               0.9 1   0.34
## GenderMale               0.09    1   0.77
## Marital_StatusMarried            0.33    1   0.57
## Marital_StatusSingle         0.23    1   0.63
## Smoking_HabitYes         0.51    1   0.48
## Meditation_PracticeYes       5.49    1   0.02
## Exercise_TypeCardio          0   1   0.99
## Exercise_TypeMeditation      0   1   0.99
## Exercise_TypePilates         0   1   0.99
## Exercise_TypeStrength Training   0   1   0.99
## Exercise_TypeWalking         0   1   1
## Exercise_TypeYoga            0   1   0.99
## -------------------------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds
## Warning in brant(model_olr_reduced): 291 combinations in table(dv,ivs) do not
## occur. Because of that, the test results might be invalid.
print(brant_result)
##                                          X2 df probability
## Omnibus                        4.338153e+01 22 0.004234094
## Age                            3.821521e-01  1 0.536453690
## Sleep_Duration                 1.097887e-01  1 0.740384781
## Sleep_Quality                  1.522510e+00  1 0.217240046
## Physical_Activity              2.303068e+00  1 0.129118777
## Screen_Time                    1.001217e-03  1 0.974757535
## Caffeine_Intake                5.235043e-01  1 0.469350661
## Alcohol_Intake                 1.690718e-01  1 0.680938052
## Work_Hours                     5.889732e+00  1 0.015229416
## Travel_Time                    2.857900e-01  1 0.592931098
## Social_Interactions            1.793536e-02  1 0.893463498
## Blood_Pressure                 8.967436e-01  1 0.343656386
## GenderMale                     8.627875e-02  1 0.768962361
## Marital_StatusMarried          3.279669e-01  1 0.566858706
## Marital_StatusSingle           2.264632e-01  1 0.634158806
## Smoking_HabitYes               5.068980e-01  1 0.476484760
## Meditation_PracticeYes         5.488855e+00  1 0.019138071
## Exercise_TypeCardio            1.678269e-04  1 0.989663854
## Exercise_TypeMeditation        1.577229e-04  1 0.989979809
## Exercise_TypePilates           1.638582e-04  1 0.989786791
## Exercise_TypeStrength Training 1.715674e-04  1 0.989549308
## Exercise_TypeWalking           2.723550e-07  1 0.999583603
## Exercise_TypeYoga              1.767143e-04  1 0.989393720
model_null <- polr(Stress_Level ~ 1, data = data_reduced, Hess = TRUE)
anova_result <- anova(model_null, model_olr_reduced, test = "Chisq")
print(anova_result)
## Likelihood ratio tests of ordinal regression models
## 
## Response: Stress_Level
##                                                                                                                                                                                                                                                         Model
## 1                                                                                                                                                                                                                                                           1
## 2 Age + Sleep_Duration + Sleep_Quality + Physical_Activity + Screen_Time + Caffeine_Intake + Alcohol_Intake + Work_Hours + Travel_Time + Social_Interactions + Blood_Pressure + Gender + Marital_Status + Smoking_Habit + Meditation_Practice + Exercise_Type
##   Resid. df Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1       771   1640.595                              
## 2       749   1203.024 1 vs 2    22 437.5708       0

Cek koefisien model

exp_coef <- exp(coef(model_olr_reduced))
print(exp_coef)
##                            Age                 Sleep_Duration 
##                     0.58341749                     0.07543769 
##                  Sleep_Quality              Physical_Activity 
##                     0.42492652                     0.33525253 
##                    Screen_Time                Caffeine_Intake 
##                    10.98506882                     1.20071877 
##                 Alcohol_Intake                     Work_Hours 
##                     2.22362045                   135.01986788 
##                    Travel_Time            Social_Interactions 
##                     1.08611331                     3.66894638 
##                 Blood_Pressure                     GenderMale 
##                   177.04897441                     0.69590562 
##          Marital_StatusMarried           Marital_StatusSingle 
##                     1.09752446                     0.74327879 
##               Smoking_HabitYes         Meditation_PracticeYes 
##                     1.30381637                     1.87920855 
##            Exercise_TypeCardio        Exercise_TypeMeditation 
##                     0.10139134                     0.42081203 
##           Exercise_TypePilates Exercise_TypeStrength Training 
##                     0.09162601                     0.19241009 
##           Exercise_TypeWalking              Exercise_TypeYoga 
##                     0.08111703                     0.12083094

Prediksi dan Evaluasi

# Prediksi kategori stress
predicted_stress <- predict(model_olr_reduced, newdata = data_reduced)

# Convert predicted_stress to an ordered factor with the same levels as Stress_Level
predicted_stress <- factor(predicted_stress, 
                          levels = levels(data_reduced$Stress_Level), 
                          ordered = TRUE)

# Buat confusion matrix
table(Predicted = predicted_stress, Actual = data_reduced$Stress_Level)
##          Actual
## Predicted Low Medium High
##    Low     70     34    1
##    Medium  79    222   89
##    High    13     54  211
# Hitung akurasi sederhana
mean(predicted_stress == data_reduced$Stress_Level)
## [1] 0.6507115

Confusion Matrix

library(ggplot2)
conf_matrix <- table(Predicted = predicted_stress, Actual = data_reduced$Stress_Level)

conf_df <- as.data.frame(conf_matrix)
ggplot(conf_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(title = "Confusion Matrix: Prediksi vs Aktual")

Uji kemungkinan model

library(MASS)    # Untuk polr (Ordinal Logistic Regression)
library(car)     # Untuk VIF
library(brant)   # Untuk Brant test
library(ggplot2) # Untuk visualisasi
library(dplyr)   # Untuk manipulasi data
library(gridExtra) # Untuk menampilkan plot secara bersamaan

# Definisikan variabel prediktor berdasarkan tabel
# Variabel yang digunakan setelah penghapusan (berdasarkan data_reduced)
predictors_reduced <- setdiff(names(data_reduced), c("Stress_Level", "Stress_Detection"))

# Subset prediktor numerik dan kategorikal (tanpa Occupation, Cholesterol_Level, Blood_Sugar_Level)
numeric_vars_reduced <- names(Filter(is.numeric, data_reduced))
numeric_vars_reduced <- setdiff(numeric_vars_reduced, c("Stress_Level", "Stress_Detection"))
categorical_vars_reduced <- names(Filter(is.factor, data_reduced))
categorical_vars_reduced <- setdiff(categorical_vars_reduced, c("Stress_Level", "Stress_Detection"))

Model OLR Biasa

predictors_olr_biasa <- c("Sleep_Quality", "Screen_Time", "Age", "Sleep_Duration", "Physical_Activity", "Caffeine_Intake", "Alcohol_Intake", "Travel_Time", "Social_Interactions", "Blood_Pressure", "Gender", "Marital_Status","Smoking_Habit", "Exercise_Type")
formula_olr_biasa <- as.formula(
  paste("Stress_Level ~", paste(predictors_olr_biasa, collapse = " + "))
)
model_olr_biasa <- polr(formula_olr_biasa, data = data_reduced, Hess = TRUE)
# Prediksi dan akurasi
pred_olr_biasa <- predict(model_olr_biasa, newdata = data_reduced)
pred_olr_biasa <- factor(pred_olr_biasa, levels = levels(data_reduced$Stress_Level), ordered = TRUE)
acc_olr_biasa <- mean(pred_olr_biasa == data_reduced$Stress_Level)

Model OLR + Partial

# Gunakan semua variabel prediktor + Work_Hours sebagai mediator
predictors_olr_partial <- predictors_reduced
formula_olr_partial <- as.formula(
  paste("Stress_Level ~", paste(predictors_olr_partial, collapse = " + "))
)
model_olr_partial <- polr(formula_olr_partial, data = data_reduced, Hess = TRUE)

# Prediksi dan akurasi
pred_olr_partial <- predict(model_olr_partial, newdata = data_reduced)
pred_olr_partial <- factor(pred_olr_partial, levels = levels(data_reduced$Stress_Level), ordered = TRUE)
acc_olr_partial <- mean(pred_olr_partial == data_reduced$Stress_Level)

Model OLR Parsial

# Hanya gunakan variabel paling signifikan (misalnya berdasarkan summary model_olr_partial)
predictors_olr_parsimoni <- c("Work_Hours", "Meditation_Practice") # Sesuaikan berdasarkan signifikansi
formula_olr_parsimoni <- as.formula(
  paste("Stress_Level ~", paste(predictors_olr_parsimoni, collapse = " + "))
)
model_olr_parsimoni <- polr(formula_olr_parsimoni, data = data_reduced, Hess = TRUE)

# Prediksi dan akurasi
pred_olr_parsimoni <- predict(model_olr_parsimoni, newdata = data_reduced)
pred_olr_parsimoni <- factor(pred_olr_parsimoni, levels = levels(data_reduced$Stress_Level), ordered = TRUE)
acc_olr_parsimoni <- mean(pred_olr_parsimoni == data_reduced$Stress_Level)

Perbandingan model

model_comparison <- data.frame(
  Model = c("OLR Biasa", "OLR + Partial", "OLR Parsial"),
  AIC = c(AIC(model_olr_biasa), AIC(model_olr_partial), AIC(model_olr_parsimoni)),
  Accuracy = c(acc_olr_biasa, acc_olr_partial, acc_olr_parsimoni)
)

# Tampilkan perbandingan
cat("\n[Perbandingan Model]\n")
## 
## [Perbandingan Model]
print(model_comparison)
##           Model      AIC  Accuracy
## 1     OLR Biasa 1292.412 0.6287193
## 2 OLR + Partial 1242.585 0.6636481
## 3   OLR Parsial 1561.791 0.5355757
# OLR Biasa
conf_matrix_biasa <- table(Predicted = pred_olr_biasa, Actual = data_reduced$Stress_Level)
conf_df_biasa <- as.data.frame(conf_matrix_biasa)
p1 <- ggplot(conf_df_biasa, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(title = "Confusion Matrix: OLR Biasa")

# OLR + Partial
conf_matrix_partial <- table(Predicted = pred_olr_partial, Actual = data_reduced$Stress_Level)
conf_df_partial <- as.data.frame(conf_matrix_partial)
p2 <- ggplot(conf_df_partial, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(title = "Confusion Matrix: OLR + Partial")

# OLR Parsial
conf_matrix_parsimoni <- table(Predicted = pred_olr_parsimoni, Actual = data_reduced$Stress_Level)
conf_df_parsimoni <- as.data.frame(conf_matrix_parsimoni)
p3 <- ggplot(conf_df_parsimoni, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(title = "Confusion Matrix: OLR Parsial")

# Tampilkan semua plot
grid.arrange(p1, p2, p3, ncol = 2)

ANALISIS DISKRIMINAN

Inisialisasi Library

library(dplyr)
library(MASS)
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
library(car)       
#install.packages("heplots")
library(heplots)      
## Warning: package 'heplots' was built under R version 4.4.3
## Loading required package: broom
## Warning: package 'broom' was built under R version 4.4.3
#install.packages("MVN")
library(MVN) 
## Warning: package 'MVN' was built under R version 4.4.3
library(stats)
#install.packages("ggfortify")
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.4.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.4.3

load Data

data <- read.csv("D:/Kuliah Semester 4/Analisis Multivariat/stress_detection_data.csv")

categorical_vars <- c("Gender", "Occupation", "Marital_Status", "Smoking_Habit", 
                      "Meditation_Practice", "Exercise_Type", "Stress_Detection")
data[categorical_vars] <- lapply(data[categorical_vars], factor)

Pre-Processing

time_to_decimal <- function(time_str) {
  parsed <- strptime(time_str, format = "%I:%M %p")
  hour_decimal <- as.numeric(format(parsed, "%H")) + as.numeric(format(parsed, "%M")) / 60
  return(hour_decimal)
}
data$Wake_Up_Time_num <- time_to_decimal(data$Wake_Up_Time)
data$Bed_Time_num <- time_to_decimal(data$Bed_Time)

# Variabel numerik
numerik_vars <- c("Age", "Sleep_Duration", "Sleep_Quality", "Physical_Activity", "Screen_Time",
                  "Caffeine_Intake", "Alcohol_Intake", "Work_Hours", "Travel_Time",
                  "Social_Interactions", "Blood_Pressure", "Cholesterol_Level", "Blood_Sugar_Level",
                  "Wake_Up_Time_num", "Bed_Time_num")

# Ubah Stress_Detection ke numerik 
data$Stress_Num <- as.numeric(data$Stress_Detection)

Seleksi Fitur

# Korelasi numerik dengan target
cor_values <- sapply(data[, numerik_vars], function(x) cor(x, data$Stress_Num, use = "complete.obs"))
selected_numeric <- names(cor_values[abs(cor_values) > 0.1])

cat("Fitur numerik lolos korelasi:\n")
## Fitur numerik lolos korelasi:
print(selected_numeric)
##  [1] "Age"                 "Sleep_Duration"      "Physical_Activity"  
##  [4] "Screen_Time"         "Caffeine_Intake"     "Alcohol_Intake"     
##  [7] "Work_Hours"          "Travel_Time"         "Social_Interactions"
## [10] "Blood_Pressure"      "Cholesterol_Level"   "Blood_Sugar_Level"  
## [13] "Wake_Up_Time_num"
# Uji Chi-square fitur kategorikal dan tampilkan p-value
selected_categorical <- c()
for (v in categorical_vars[-length(categorical_vars)]) {
  tbl <- table(data[[v]], data$Stress_Detection)
  
  chi <- suppressWarnings(chisq.test(tbl))
  pval <- chi$p.value
  
  if (pval < 0.05) {
    selected_categorical <- c(selected_categorical, v)
  }
  
  cat(sprintf("- %s: p-value = %.2e %s\n", v, pval,
              ifelse(pval < 0.05, "--> Lolos", "--> Tidak Lolos")))
}
## - Gender: p-value = 2.57e-04 --> Lolos
## - Occupation: p-value = 1.11e-02 --> Lolos
## - Marital_Status: p-value = 6.80e-14 --> Lolos
## - Smoking_Habit: p-value = 2.50e-17 --> Lolos
## - Meditation_Practice: p-value = 6.36e-19 --> Lolos
## - Exercise_Type: p-value = 4.48e-08 --> Lolos
cat("\nFitur kategorikal yang lolos uji Chi-square:\n")
## 
## Fitur kategorikal yang lolos uji Chi-square:
print(selected_categorical)
## [1] "Gender"              "Occupation"          "Marital_Status"     
## [4] "Smoking_Habit"       "Meditation_Practice" "Exercise_Type"
selected_features <- c(selected_numeric, selected_categorical)
data_selected <- data[, c(selected_features, "Stress_Detection")]
# Distribusi Kelas
class_dist <- data_selected %>%
  count(Stress_Detection) %>%
  mutate(Percent = round(n / sum(n) * 100, 1))

cat("\n Distribusi kelas:\n")
## 
##  Distribusi kelas:
print(class_dist)
##   Stress_Detection   n Percent
## 1             High 301    38.9
## 2              Low 162    21.0
## 3           Medium 310    40.1

Feature Engineering

# Pisah numerik dan kategorikal
numeric_selected <- intersect(selected_features, numerik_vars)
categorical_selected <- setdiff(selected_features, numeric_selected)

# Skewness numerik
cat("\n Skewness Fitur Numerik:\n")
## 
##  Skewness Fitur Numerik:
for (var in numeric_selected) {
  sk <- skewness(data_selected[[var]], na.rm = TRUE)
  tag <- ifelse(abs(sk) > 0.5, "Bisa dinormalisasi", "Standarisasi cukup")
  cat(sprintf(" - %s: skewness = %.2f --> %s\n", var, sk, tag))
}
##  - Age: skewness = -0.01 --> Standarisasi cukup
##  - Sleep_Duration: skewness = -0.80 --> Bisa dinormalisasi
##  - Physical_Activity: skewness = -0.23 --> Standarisasi cukup
##  - Screen_Time: skewness = -0.03 --> Standarisasi cukup
##  - Caffeine_Intake: skewness = 0.03 --> Standarisasi cukup
##  - Alcohol_Intake: skewness = 0.14 --> Standarisasi cukup
##  - Work_Hours: skewness = 0.77 --> Bisa dinormalisasi
##  - Travel_Time: skewness = 0.10 --> Standarisasi cukup
##  - Social_Interactions: skewness = 0.13 --> Standarisasi cukup
##  - Blood_Pressure: skewness = 0.06 --> Standarisasi cukup
##  - Cholesterol_Level: skewness = -0.12 --> Standarisasi cukup
##  - Blood_Sugar_Level: skewness = -0.18 --> Standarisasi cukup
##  - Wake_Up_Time_num: skewness = 0.18 --> Standarisasi cukup
# Transformasi numerik
to_standardize <- c("Age", "Physical_Activity", "Screen_Time", "Caffeine_Intake", 
                    "Alcohol_Intake", "Travel_Time", "Social_Interactions",
                    "Blood_Pressure", "Cholesterol_Level", "Blood_Sugar_Level", 
                    "Wake_Up_Time_num")
to_normalize <- c("Sleep_Duration", "Work_Hours")

# Standarisasi
data_selected[to_standardize] <- scale(data_selected[to_standardize])

# Normalisasi
minmax_norm <- function(x) {(x - min(x)) / (max(x) - min(x))}
data_selected[to_normalize] <- lapply(data_selected[to_normalize], minmax_norm)

cat("\nSkewness fitur numerik setelah transformasi:\n")
## 
## Skewness fitur numerik setelah transformasi:
for (var in numeric_selected) {
  sk <- skewness(data_selected[[var]], na.rm = TRUE)
  tag <- ifelse(abs(sk) > 0.5, "Masih miring (perlu perhatian)", "Sudah cukup normal")
  cat(sprintf(" - %s: skewness = %.2f --> %s\n", var, sk, tag))
}
##  - Age: skewness = -0.01 --> Sudah cukup normal
##  - Sleep_Duration: skewness = -0.80 --> Masih miring (perlu perhatian)
##  - Physical_Activity: skewness = -0.23 --> Sudah cukup normal
##  - Screen_Time: skewness = -0.03 --> Sudah cukup normal
##  - Caffeine_Intake: skewness = 0.03 --> Sudah cukup normal
##  - Alcohol_Intake: skewness = 0.14 --> Sudah cukup normal
##  - Work_Hours: skewness = 0.77 --> Masih miring (perlu perhatian)
##  - Travel_Time: skewness = 0.10 --> Sudah cukup normal
##  - Social_Interactions: skewness = 0.13 --> Sudah cukup normal
##  - Blood_Pressure: skewness = 0.06 --> Sudah cukup normal
##  - Cholesterol_Level: skewness = -0.12 --> Sudah cukup normal
##  - Blood_Sugar_Level: skewness = -0.18 --> Sudah cukup normal
##  - Wake_Up_Time_num: skewness = 0.18 --> Sudah cukup normal
# Boxcox (karena masih terlalu miring pada beberapa fitur)
boxcox_transform <- function(x) {
  x_shifted <- x + 0.001
  bc <- boxcox(x_shifted ~ 1, lambda = seq(-2, 2, 0.1), plotit = FALSE)
  lambda_opt <- bc$x[which.max(bc$y)]
  
  if (abs(lambda_opt) < 1e-5) {
    return(log(x_shifted))
  } else {
    return((x_shifted^lambda_opt - 1) / lambda_opt)
  }
}

# Transformasi tambahan dengan Box-Cox ke Sleep_Duration dan Work_Hours
data_selected$Sleep_Duration_boxcox <- boxcox_transform(data_selected$Sleep_Duration)
data_selected$Work_Hours_boxcox <- boxcox_transform(data_selected$Work_Hours)

cat("Distribusi numerik Sleep_Duration (Box-Cox):\n")
## Distribusi numerik Sleep_Duration (Box-Cox):
print(summary(data_selected$Sleep_Duration_boxcox))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.7142 -0.3998 -0.3458 -0.3311 -0.2110  0.0010
cat(sprintf("Skewness: %.4f\n\n", skewness(data_selected$Sleep_Duration_boxcox, na.rm = TRUE)))
## Skewness: -0.0238
cat("Distribusi numerik Work_Hours (Box-Cox):\n")
## Distribusi numerik Work_Hours (Box-Cox):
print(summary(data_selected$Work_Hours_boxcox))
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -1.4172239 -0.8857293 -0.8857293 -0.8525954 -0.7082382  0.0009999
cat(sprintf("Skewness: %.4f\n", skewness(data_selected$Work_Hours_boxcox, na.rm = TRUE)))
## Skewness: 0.1167

Uji Asumsi Analisis Diskriminan

# Normalitas multivariat per kelas (Mardia test)
cat("\nUji Normalitas Multivariat (Mardia Test) per kelas:\n")
## 
## Uji Normalitas Multivariat (Mardia Test) per kelas:
library(MVN)  # Pastikan paket MVN sudah dimuat
for (grp in levels(data_selected$Stress_Detection)) {
  cat("\nKelas:", grp, "\n")
  subset_data <- data_selected[data_selected$Stress_Detection == grp, numeric_selected]
  mvn_test <- mvn(subset_data)  # Hapus mvnTest = "mardia"
  print(mvn_test$multivariateNormality)
}
## 
## Kelas: High 
## NULL
## 
## Kelas: Low 
## NULL
## 
## Kelas: Medium 
## NULL
# Homogenitas Kovarians (Box's M test)
cat("\n Uji Homogenitas Kovarians (Box's M Test):\n")
## 
##  Uji Homogenitas Kovarians (Box's M Test):
boxm_res <- boxM(data_selected[, numeric_selected], data_selected$Stress_Detection)
print(boxm_res)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  data_selected[, numeric_selected]
## Chi-Sq (approx.) = 844.6, df = 182, p-value < 2.2e-16
# Multikolinearitas (VIF)
cat("\n Uji Multikolinearitas (VIF):\n")
## 
##  Uji Multikolinearitas (VIF):
vif_values <- sapply(numeric_selected, function(var) {
  others <- setdiff(numeric_selected, var)
  form <- as.formula(paste(var, "~", paste(others, collapse = "+")))
  vif_model <- lm(form, data = data_selected)
  vif_val <- vif(vif_model)[1]
  return(vif_val)
})
print(vif_values)
##      Age.Sleep_Duration      Sleep_Duration.Age   Physical_Activity.Age 
##                1.542308                1.399028                1.428361 
##         Screen_Time.Age     Caffeine_Intake.Age      Alcohol_Intake.Age 
##                1.429564                1.428860                1.429927 
##          Work_Hours.Age         Travel_Time.Age Social_Interactions.Age 
##                1.392150                1.424854                1.421479 
##      Blood_Pressure.Age   Cholesterol_Level.Age   Blood_Sugar_Level.Age 
##                1.419996                1.410630                1.359280 
##    Wake_Up_Time_num.Age 
##                1.430795
# Linearitas 
cat("\n Visualisasi Linearitas antar variabel numerik:\n")
## 
##  Visualisasi Linearitas antar variabel numerik:
pairs(data_selected[, numeric_selected], col = as.numeric(data_selected$Stress_Detection),
      pch=19, main = "Scatterplot Matrix - Cek Linearitas antar Variabel Numerik")

Pemodelan

# LDA modeling dan evaluasi
cat("\n Training model LDA dan evaluasi:\n")
## 
##  Training model LDA dan evaluasi:
model_lda <- lda(Stress_Detection ~ ., data = data_selected)
pred <- predict(model_lda)

# Confusion matrix 
conf_matrix <- table(Predicted = pred$class, Actual = data_selected$Stress_Detection)
accuracy <- mean(pred$class == data_selected$Stress_Detection)

cat("\n Confusion Matrix LDA:\n")
## 
##  Confusion Matrix LDA:
print(conf_matrix)
##          Actual
## Predicted High Low Medium
##    High    249   9     31
##    Low       9 102     27
##    Medium   43  51    252
cat(sprintf("\n Akurasi LDA: %.4f\n", accuracy))
## 
##  Akurasi LDA: 0.7801

Perceptual Mapping

# MDS
lda_df <- data.frame(LD1 = pred$x[,1],
                     LD2 = pred$x[,2],
                     Stress_Detection = data_selected$Stress_Detection)

ggplot(lda_df, aes(x = LD1, y = LD2, color = Stress_Detection)) +
  geom_point(alpha = 0.7, size = 3) +
  labs(title = "Visualisasi LDA: Persebaran Kelas Stress pada LD1 dan LD2",
       x = "Linear Discriminant 1",
       y = "Linear Discriminant 2") +
  theme_minimal() +
  theme(legend.position = "right")

dist_matrix <- dist(scale(data_selected[, numeric_selected]))
mds_res <- cmdscale(dist_matrix, k = 2)

mds_df <- data.frame(MDS1 = mds_res[,1],
                     MDS2 = mds_res[,2],
                     Stress_Detection = data_selected$Stress_Detection)

ggplot(mds_df, aes(x = MDS1, y = MDS2, color = Stress_Detection)) +
  geom_point(alpha = 0.7, size = 3) +
  labs(title = "Perceptual Mapping dengan MDS",
       x = "Dimensi 1",
       y = "Dimensi 2") +
  theme_minimal() +
  theme(legend.position = "right")

# Biplot 
pca_res <- prcomp(scale(data_selected[, numeric_selected]), center = TRUE, scale. = TRUE)
loadings <- data.frame(pca_res$rotation[, 1:2])
loadings$Feature <- rownames(loadings)

ggplot(loadings, aes(x = PC1, y = PC2)) +
  geom_segment(aes(x = 0, y = 0, xend = PC1, yend = PC2), 
               arrow = arrow(length = unit(0.1, "cm")), color = "blue") +
  geom_text_repel(aes(label = Feature), size = 3, max.overlaps = 20) +
  xlim(c(min(loadings$PC1) - 0.1, max(loadings$PC1) + 0.1)) +
  ylim(c(min(loadings$PC2) - 0.1, max(loadings$PC2) + 0.1)) +
  ggtitle("PCA Loading Biplot") +
  theme_minimal()