Script Metode Kelompok 7_Sains Data 2023D

options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("ggplot2")

## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages

install.packages("dplyr")

## package 'dplyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages

install.packages("psych")

## package 'psych' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages

install.packages("factoextra")

## package 'factoextra' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages

install.packages("corrplot")

## package 'corrplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages

install.packages("naniar")

## package 'naniar' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages

install.packages("caret")

## package 'caret' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages

install.packages("biotools")

## package 'biotools' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages

Membaca Data

data <- read.csv("D:/Mahasiswa/Semester 4/Analisis Multivariat/project/student_depression_dataset.csv", stringsAsFactors = TRUE)

Struktur Data

str(data)

## 'data.frame':    27901 obs. of  18 variables:
##  $ Student_ID                         : int  2 8 26 30 32 33 52 56 59 62 ...
##  $ Gender                             : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 2 1 2 2 ...
##  $ Age                                : int  33 24 31 28 25 29 30 30 28 31 ...
##  $ City                               : Factor w/ 52 levels "'Less Delhi'",..: 52 6 45 50 19 40 47 9 34 38 ...
##  $ Profession                         : Factor w/ 13 levels "'Civil Engineer'",..: 12 12 12 12 12 12 12 12 12 12 ...
##  $ Academic.Pressure                  : int  5 2 3 3 4 2 3 2 3 2 ...
##  $ Work.Pressure                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CGPA                               : num  8.97 5.9 7.03 5.59 8.13 5.7 9.54 8.04 9.79 8.38 ...
##  $ Study.Satisfaction                 : int  2 5 5 2 3 3 4 4 1 3 ...
##  $ Job.Satisfaction                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sleep.Duration                     : Factor w/ 5 levels "'5-6 hours'",..: 1 1 3 2 1 3 2 3 2 3 ...
##  $ Dietary.Habits                     : Factor w/ 4 levels "Healthy","Moderate",..: 1 2 1 2 2 1 1 4 2 2 ...
##  $ Degree                             : Factor w/ 28 levels "'Class 12'","B.Arch",..: 5 12 7 9 18 28 12 1 4 13 ...
##  $ Have.you.ever.had.suicidal.thoughts: Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 1 1 2 2 ...
##  $ Work.Study.Hours                   : int  3 3 9 4 1 4 1 0 12 2 ...
##  $ Financial.Stress                   : Factor w/ 6 levels "?","1","2","3",..: 2 3 2 6 2 2 3 2 4 6 ...
##  $ Family.History.of.Mental.Illness   : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 1 2 1 1 ...
##  $ Depression                         : int  1 0 0 1 0 0 0 0 1 1 ...

summary(data)

##    Student_ID        Gender           Age                 City      
##  Min.   :     2   Female:12354   Min.   :18.00   Kalyan     : 1570  
##  1st Qu.: 35039   Male  :15547   1st Qu.:21.00   Srinagar   : 1372  
##  Median : 70684                  Median :25.00   Hyderabad  : 1340  
##  Mean   : 70442                  Mean   :25.82   Vasai-Virar: 1290  
##  3rd Qu.:105818                  3rd Qu.:30.00   Lucknow    : 1155  
##  Max.   :140699                  Max.   :59.00   Thane      : 1139  
##                                                  (Other)    :20035  
##               Profession    Academic.Pressure Work.Pressure    
##  Student           :27871   Min.   :0.000     Min.   :0.00000  
##  Architect         :    8   1st Qu.:2.000     1st Qu.:0.00000  
##  Teacher           :    6   Median :3.000     Median :0.00000  
##  'Digital Marketer':    3   Mean   :3.141     Mean   :0.00043  
##  'Content Writer'  :    2   3rd Qu.:4.000     3rd Qu.:0.00000  
##  Chef              :    2   Max.   :5.000     Max.   :5.00000  
##  (Other)           :    9                                      
##       CGPA        Study.Satisfaction Job.Satisfaction  
##  Min.   : 0.000   Min.   :0.000      Min.   :0.000000  
##  1st Qu.: 6.290   1st Qu.:2.000      1st Qu.:0.000000  
##  Median : 7.770   Median :3.000      Median :0.000000  
##  Mean   : 7.656   Mean   :2.944      Mean   :0.000681  
##  3rd Qu.: 8.920   3rd Qu.:4.000      3rd Qu.:0.000000  
##  Max.   :10.000   Max.   :5.000      Max.   :4.000000  
##                                                        
##              Sleep.Duration   Dietary.Habits         Degree     
##  '5-6 hours'        :6183   Healthy  : 7651   'Class 12': 6080  
##  '7-8 hours'        :7346   Moderate : 9921   B.Ed      : 1867  
##  'Less than 5 hours':8310   Others   :   12   B.Com     : 1506  
##  'More than 8 hours':6044   Unhealthy:10317   B.Arch    : 1478  
##  Others             :  18                     BCA       : 1433  
##                                               MSc       : 1190  
##                                               (Other)   :14347  
##  Have.you.ever.had.suicidal.thoughts Work.Study.Hours Financial.Stress
##  No :10245                           Min.   : 0.000   ?:   3          
##  Yes:17656                           1st Qu.: 4.000   1:5121          
##                                      Median : 8.000   2:5061          
##                                      Mean   : 7.157   3:5226          
##                                      3rd Qu.:10.000   4:5775          
##                                      Max.   :12.000   5:6715          
##                                                                       
##  Family.History.of.Mental.Illness   Depression    
##  No :14398                        Min.   :0.0000  
##  Yes:13503                        1st Qu.:0.0000  
##                                   Median :1.0000  
##                                   Mean   :0.5855  
##                                   3rd Qu.:1.0000  
##                                   Max.   :1.0000  
##

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

library(readr)

data <- read_csv("D:/Mahasiswa/Semester 4/Analisis Multivariat/project/student_depression_dataset.csv")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 27901 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): Gender, City, Profession, Sleep Duration, Dietary Habits, Degree, ...
## dbl (10): Student_ID, Age, Academic Pressure, Work Pressure, CGPA, Study Sat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

categorical_cols <- c("Gender", "City", "Profession", "Sleep Duration", 
                      "Dietary Habits", "Degree", "Have you ever had suicidal thoughts", 
                      "Financial Stress", "Family History of Mental Illness")
numerical_cols <- c("Age", "Academic Pressure", "Work Pressure", 
                    "CGPA", "Study Satisfaction", "Job Satisfaction", 
                    "Work/Study Hours")

for (col in categorical_cols) {
  print(ggplot(data, aes(x = .data[[col]])) +
          geom_bar(fill = "steelblue") +
          labs(title = paste("Bar Plot of", col), x = col, y = "Count") +
          theme(axis.text.x = element_text(angle = 45, hjust = 1),
                plot.title = element_text(hjust = 0.5)))
}

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_count()`).

for (col in numerical_cols) {
  print(ggplot(data, aes(x = .data[[col]], y = Depression)) +
          geom_jitter(alpha = 0.5, color = "tomato") +
          labs(title = paste("Scatter Plot of", col, "vs Depression"),
               x = col, y = "Depression") +
          theme(plot.title = element_text(hjust = 0.5)))
}

library(ggplot2)
library(readr)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

data <- read_csv("D:/Mahasiswa/Semester 4/Analisis Multivariat/project/student_depression_dataset.csv")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 27901 Columns: 18

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): Gender, City, Profession, Sleep Duration, Dietary Habits, Degree, ...
## dbl (10): Student_ID, Age, Academic Pressure, Work Pressure, CGPA, Study Sat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

categorical_cols <- c("Gender", "City", "Profession", "Sleep Duration", 
                      "Dietary Habits", "Degree", "Have you ever had suicidal thoughts", 
                      "Financial Stress", "Family History of Mental Illness")
numerical_cols <- c("Age", "Academic Pressure", "Work Pressure", 
                    "CGPA", "Study Satisfaction", "Job Satisfaction", 
                    "Work/Study Hours")

data[categorical_cols] <- lapply(data[categorical_cols], as.character)

data_categorical_long <- data[, categorical_cols] %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(data_categorical_long, aes(x = Value)) +
  geom_bar(fill = "steelblue") +
  facet_wrap(~ Variable, scales = "free_x") +
  labs(title = "Bar Plot of Categorical Variables", x = "Category", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))

data_numerical_long <- data[, c(numerical_cols, "Depression")] %>%
  pivot_longer(cols = -Depression, names_to = "Variable", values_to = "Value")

ggplot(data_numerical_long, aes(x = Value, y = Depression)) +
  geom_jitter(alpha = 0.5, color = "tomato") +
  facet_wrap(~ Variable, scales = "free_x") +
  labs(title = "Scatter Plot of Numerical Variables vs Depression",
       x = "Value", y = "Depression") +
  theme(plot.title = element_text(hjust = 0.5))

Statistik Deskriptif

library(dplyr)
library(psych)

## Warning: package 'psych' was built under R version 4.4.3

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

numeric_vars <- select_if(data, is.numeric)
describe(numeric_vars)

##                    vars     n     mean       sd   median  trimmed      mad min
## Student_ID            1 27901 70442.15 40641.18 70684.00 70454.87 52412.88   2
## Age                   2 27901    25.82     4.91    25.00    25.77     5.93  18
## Academic Pressure     3 27901     3.14     1.38     3.00     3.18     1.48   0
## Work Pressure         4 27901     0.00     0.04     0.00     0.00     0.00   0
## CGPA                  5 27901     7.66     1.47     7.77     7.67     1.88   0
## Study Satisfaction    6 27901     2.94     1.36     3.00     2.93     1.48   0
## Job Satisfaction      7 27901     0.00     0.04     0.00     0.00     0.00   0
## Work/Study Hours      8 27901     7.16     3.71     8.00     7.40     4.45   0
## Depression            9 27901     0.59     0.49     1.00     0.61     0.00   0
##                       max  range   skew kurtosis     se
## Student_ID         140699 140697  -0.01    -1.21 243.31
## Age                    59     41   0.13    -0.85   0.03
## Academic Pressure       5      5  -0.14    -1.16   0.01
## Work Pressure           5      5 108.58 12107.60   0.00
## CGPA                   10     10  -0.11    -1.02   0.01
## Study Satisfaction      5      5   0.01    -1.22   0.01
## Job Satisfaction        4      4  74.10  5925.51   0.00
## Work/Study Hours       12     12  -0.45    -1.00   0.02
## Depression              1      1  -0.35    -1.88   0.00

Histogram

library(ggplot2)

ggplot(data, aes(x = CGPA)) +
  geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
  labs(title = "Histogram: Distribusi CGPA", x = "CGPA", y = "Frekuensi") +
  theme_minimal() +
  theme(text = element_text(size = 14))

Contoh Histogram Durasi Tidur

data$SleepDuration <- gsub("'", "", data$'Sleep Duration')

ggplot(data, aes(x = SleepDuration)) +
  geom_bar(fill = "darkgreen") +
  labs(title = "Histogram: Distribusi Durasi Tidur", x = "Durasi Tidur", y = "Frekuensi") +
  theme_minimal() +
  theme(text = element_text(size = 14))

Plot Numerik vs Label

library(ggplot2)
library(dplyr)

numeric_names <- colnames(numeric_vars)

for (feature in numeric_names) {

  binned <- data %>%
    mutate(binned_feature = cut(.data[[feature]], breaks = 5)) %>%  
    group_by(binned_feature) %>%
    summarise(mean_depression = mean(Depression, na.rm = TRUE))

  print(
    ggplot(binned, aes(x = binned_feature, y = mean_depression)) +
      geom_bar(stat = "identity", fill = "steelblue") +
      labs(title = paste("Bar Plot:", feature, "(binned) vs Mean Depression"),
           x = paste(feature, "(binned)"), y = "Mean Depression") +
      theme_minimal() +
      theme(text = element_text(size = 14))
  )
}

Boxplot

for (feature in numeric_names) {
  print(
    ggplot(data, aes(x = factor(Depression), y = .data[[feature]])) +
      geom_boxplot(fill = "skyblue") +
      labs(title = paste("Boxplot:", feature, "berdasarkan Status Depression"),
           x = "Depression (0 = Tidak, 1 = Ya)", y = feature) +
      theme_minimal() +
      theme(text = element_text(size = 14))
  )
}

Bar Plot sleep duration berdasarkan label

ggplot(data, aes(x = SleepDuration, fill = factor(Depression))) +
  geom_bar(position = "dodge") +
  labs(title = "Bar Plot: Durasi Tidur berdasarkan Status Depression",
       x = "Durasi Tidur", y = "Frekuensi", fill = "Depression") +
  theme_minimal() +
  theme(text = element_text(size = 14))

Density Plot CGPA berrdasarkan label

ggplot(data, aes(x = CGPA, fill = factor(Depression))) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot: Distribusi CGPA berdasarkan Status Depression",
       x = "CGPA", fill = "Depression") +
  theme_minimal() +
  theme(text = element_text(size = 14))

Pairs Plot

library(dplyr)
library(GGally)

## Warning: package 'GGally' was built under R version 4.4.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

pair_data <- dplyr::select(data, 
                           "Academic Pressure", 
                           "Work Pressure", 
                           "CGPA", 
                           "Study Satisfaction", 
                           "Job Satisfaction", 
                           "Work/Study Hours", 
                           "Depression")

ggpairs(pair_data, aes(color = factor(Depression), alpha = 0.6)) +
  theme_bw() +
  ggtitle("Pairs Plot: Hubungan antar Fitur Numerik dan Status Depression") +
  theme(text = element_text(size = 14), plot.title = element_text(hjust = 0.5))

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero

Heatmap

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.4.3

## corrplot 0.95 loaded

library(dplyr)

cor_data <- dplyr::select(data, where(is.numeric))
cor_matrix <- cor(cor_data, use = "complete.obs", method = "pearson")
print(round(cor_matrix, 2))

##                    Student_ID   Age Academic Pressure Work Pressure  CGPA
## Student_ID               1.00  0.00              0.01          0.00 -0.01
## Age                      0.00  1.00             -0.08          0.00  0.01
## Academic Pressure        0.01 -0.08              1.00         -0.02 -0.02
## Work Pressure            0.00  0.00             -0.02          1.00 -0.05
## CGPA                    -0.01  0.01             -0.02         -0.05  1.00
## Study Satisfaction       0.01  0.01             -0.11         -0.02 -0.04
## Job Satisfaction         0.00  0.00             -0.02          0.77 -0.05
## Work/Study Hours         0.00 -0.03              0.10         -0.01  0.00
## Depression               0.00 -0.23              0.47          0.00  0.02
##                    Study Satisfaction Job Satisfaction Work/Study Hours
## Student_ID                       0.01             0.00             0.00
## Age                              0.01             0.00            -0.03
## Academic Pressure               -0.11            -0.02             0.10
## Work Pressure                   -0.02             0.77            -0.01
## CGPA                            -0.04            -0.05             0.00
## Study Satisfaction               1.00            -0.02            -0.04
## Job Satisfaction                -0.02             1.00            -0.01
## Work/Study Hours                -0.04            -0.01             1.00
## Depression                      -0.17             0.00             0.21
##                    Depression
## Student_ID               0.00
## Age                     -0.23
## Academic Pressure        0.47
## Work Pressure            0.00
## CGPA                     0.02
## Study Satisfaction      -0.17
## Job Satisfaction         0.00
## Work/Study Hours         0.21
## Depression               1.00

corrplot(cor_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45, addCoef.col = "black",
         number.cex = 0.8, col = colorRampPalette(c("blue", "white", "red"))(200),
         title = "Heatmap: Korelasi antar Fitur Numerik", mar = c(0,0,2,0))

Cek Missing Value

library(naniar)

## Warning: package 'naniar' was built under R version 4.4.3

missing_summary <- sapply(data, function(x) sum(is.na(x)))
print(missing_summary)

##                          Student_ID                              Gender 
##                                   0                                   0 
##                                 Age                                City 
##                                   0                                   0 
##                          Profession                   Academic Pressure 
##                                   0                                   0 
##                       Work Pressure                                CGPA 
##                                   0                                   0 
##                  Study Satisfaction                    Job Satisfaction 
##                                   0                                   0 
##                      Sleep Duration                      Dietary Habits 
##                                   0                                   0 
##                              Degree Have you ever had suicidal thoughts 
##                                   0                                   0 
##                    Work/Study Hours                    Financial Stress 
##                                   0                                   3 
##    Family History of Mental Illness                          Depression 
##                                   0                                   0 
##                       SleepDuration 
##                                   0

Cek Outliers

library(ggplot2)

# Ganti nama kolom yang bermasalah menjadi format aman
colnames(data) <- make.names(colnames(data))

# Pastikan nama kolom numerik yang digunakan sesuai dengan nama yang sudah diperbaiki
numeric_vars <- data[, c("Age", "Academic.Pressure", "Work.Pressure", 
                         "CGPA", "Study.Satisfaction", "Job.Satisfaction", 
                         "Work.Study.Hours")]

# 1. Visualisasi boxplot untuk deteksi outlier
for (feature in names(numeric_vars)) {
  print(
    ggplot(data, aes_string(y = feature)) +
      geom_boxplot(fill = "orange", outlier.color = "red", outlier.shape = 8) +
      labs(title = paste("Boxplot Deteksi Outlier:", feature)) +
      theme_minimal() + 
      theme(text = element_text(size = 14))
  )
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 2. Deteksi outlier secara programatik
outliers <- list()
outlier_counts <- numeric()

for (col in names(numeric_vars)) {
  Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR

  outlier_indices <- which(data[[col]] < lower_bound | data[[col]] > upper_bound)
  outliers[[col]] <- outlier_indices
  outlier_counts[col] <- length(outlier_indices)
}

# 3. Tampilkan hasil deteksi outlier
cat("Jumlah outlier per variabel:\n")

## Jumlah outlier per variabel:

print(outlier_counts)

##                Age  Academic.Pressure      Work.Pressure               CGPA 
##                 12                  0                  3                  9 
## Study.Satisfaction   Job.Satisfaction   Work.Study.Hours 
##                  0                  8                  0

cols_with_outliers <- names(outlier_counts[outlier_counts > 0])
cat("\nKolom yang mengandung outlier:\n")

## 
## Kolom yang mengandung outlier:

print(cols_with_outliers)

## [1] "Age"              "Work.Pressure"    "CGPA"             "Job.Satisfaction"

Handling Ouliters

library(dplyr)

numeric_cols <- dplyr::select(data, where(is.numeric))

means <- sapply(numeric_cols, mean, na.rm = TRUE)
sds <- sapply(numeric_cols, sd, na.rm = TRUE)

lower_bounds <- means - 3 * sds
upper_bounds <- means + 3 * sds

for (col in names(numeric_cols)) {
  numeric_cols[[col]][numeric_cols[[col]] < lower_bounds[col]] <- lower_bounds[col]
  numeric_cols[[col]][numeric_cols[[col]] > upper_bounds[col]] <- upper_bounds[col]
}

data[names(numeric_cols)] <- numeric_cols

Cek Duplikat

dup_count <- sum(duplicated(data))
cat("Jumlah baris duplikat:", dup_count, "\n")

## Jumlah baris duplikat: 0

if (dup_count > 0) {
  data <- data[!duplicated(data), ]
  cat("Duplikat sudah dihapus\n")
}

Cek Inkonsisten Data

library(dplyr)

sapply(dplyr::select(data, where(is.factor)), levels)

## named list()

library(dplyr)
library(stringr)


factor_cols <- dplyr::select(data, names(data)[sapply(data, is.factor)])

clean_levels <- function(x) {
  x <- as.character(x)
  x <- str_replace_all(x, "\\\\'", "")         
  x <- str_replace_all(x, "''", "'")           
  x <- str_replace_all(x, "^'+|'+$", "")       
  x <- str_trim(x)                             
  x[x == "?"] <- NA                            
  return(factor(x))
}

factor_cleaned <- mutate_all(factor_cols, clean_levels)

data[names(factor_cleaned)] <- factor_cleaned

sapply(dplyr::select(data, names(factor_cleaned)), levels)

## named list()

Normalisasi Data (opsional)

library(dplyr)

normalize <- function(x) {
  return ((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
}

data_scaled <- data
num_cols <- names(dplyr::select(data, where(is.numeric)))
data_scaled[num_cols] <- lapply(data_scaled[num_cols], normalize)

summary(data_scaled[num_cols])

##    Student_ID          Age         Academic.Pressure Work.Pressure      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000    Min.   :0.0000000  
##  1st Qu.:0.2490   1st Qu.:0.1331   1st Qu.:0.4000    1st Qu.:0.0000000  
##  Median :0.5024   Median :0.3106   Median :0.6000    Median :0.0000000  
##  Mean   :0.5007   Mean   :0.3468   Mean   :0.6282    Mean   :0.0001075  
##  3rd Qu.:0.7521   3rd Qu.:0.5324   3rd Qu.:0.8000    3rd Qu.:0.0000000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000    Max.   :1.0000000  
##       CGPA        Study.Satisfaction Job.Satisfaction    Work.Study.Hours
##  Min.   :0.0000   Min.   :0.0000     Min.   :0.0000000   Min.   :0.0000  
##  1st Qu.:0.4509   1st Qu.:0.4000     1st Qu.:0.0000000   1st Qu.:0.3333  
##  Median :0.6699   Median :0.6000     Median :0.0000000   Median :0.6667  
##  Mean   :0.6532   Mean   :0.5888     Mean   :0.0002867   Mean   :0.5964  
##  3rd Qu.:0.8401   3rd Qu.:0.8000     3rd Qu.:0.0000000   3rd Qu.:0.8333  
##  Max.   :1.0000   Max.   :1.0000     Max.   :1.0000000   Max.   :1.0000  
##    Depression    
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :1.0000  
##  Mean   :0.5855  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

Distribusi Label

table(data$Depression)

## 
##     0     1 
## 11565 16336

ggplot(data, aes(x = factor(Depression))) +
  geom_bar(fill = "purple") +
  labs(title = "Distribusi Kelas Depression", x = "Depression", y = "Jumlah") +
  theme_minimal() + theme(text = element_text(size = 14))

data

## # A tibble: 27,901 × 19
##    Student_ID Gender   Age City       Profession Academic.Pressure Work.Pressure
##         <dbl> <chr>  <dbl> <chr>      <chr>                  <dbl>         <dbl>
##  1          2 Male      33 Visakhapa… Student                    5             0
##  2          8 Female    24 Bangalore  Student                    2             0
##  3         26 Male      31 Srinagar   Student                    3             0
##  4         30 Female    28 Varanasi   Student                    3             0
##  5         32 Female    25 Jaipur     Student                    4             0
##  6         33 Male      29 Pune       Student                    2             0
##  7         52 Male      30 Thane      Student                    3             0
##  8         56 Female    30 Chennai    Student                    2             0
##  9         59 Male      28 Nagpur     Student                    3             0
## 10         62 Male      31 Nashik     Student                    2             0
## # ℹ 27,891 more rows
## # ℹ 12 more variables: CGPA <dbl>, Study.Satisfaction <dbl>,
## #   Job.Satisfaction <dbl>, Sleep.Duration <chr>, Dietary.Habits <chr>,
## #   Degree <chr>, Have.you.ever.had.suicidal.thoughts <chr>,
## #   Work.Study.Hours <dbl>, Financial.Stress <chr>,
## #   Family.History.of.Mental.Illness <chr>, Depression <dbl>,
## #   SleepDuration <chr>

cat_features <- names(data)[sapply(data, function(x) is.factor(x) | is.character(x))]

label_encode <- function(x) {
  factor_x <- as.factor(x)
  as.numeric(factor_x) - 1
}

df_encoded <- data  

for (feat in cat_features) {
  df_encoded[[feat]] <- label_encode(data[[feat]])
}

cat("Fitur kategorikal yang di-label encoding:\n")

## Fitur kategorikal yang di-label encoding:

print(cat_features)

##  [1] "Gender"                              "City"                               
##  [3] "Profession"                          "Sleep.Duration"                     
##  [5] "Dietary.Habits"                      "Degree"                             
##  [7] "Have.you.ever.had.suicidal.thoughts" "Financial.Stress"                   
##  [9] "Family.History.of.Mental.Illness"    "SleepDuration"

df_encoded

## # A tibble: 27,901 × 19
##    Student_ID Gender   Age  City Profession Academic.Pressure Work.Pressure
##         <dbl>  <dbl> <dbl> <dbl>      <dbl>             <dbl>         <dbl>
##  1          2      1    33    51         11                 5             0
##  2          8      0    24     5         11                 2             0
##  3         26      1    31    44         11                 3             0
##  4         30      0    28    49         11                 3             0
##  5         32      0    25    18         11                 4             0
##  6         33      1    29    39         11                 2             0
##  7         52      1    30    46         11                 3             0
##  8         56      0    30     8         11                 2             0
##  9         59      1    28    33         11                 3             0
## 10         62      1    31    37         11                 2             0
## # ℹ 27,891 more rows
## # ℹ 12 more variables: CGPA <dbl>, Study.Satisfaction <dbl>,
## #   Job.Satisfaction <dbl>, Sleep.Duration <dbl>, Dietary.Habits <dbl>,
## #   Degree <dbl>, Have.you.ever.had.suicidal.thoughts <dbl>,
## #   Work.Study.Hours <dbl>, Financial.Stress <dbl>,
## #   Family.History.of.Mental.Illness <dbl>, Depression <dbl>,
## #   SleepDuration <dbl>

library(dplyr)

data$Depression <- as.numeric(as.character(data$Depression))

num_vars <- dplyr::select(data, where(is.numeric))

Feature Selection

library(psych)
fa_result <- fa(num_vars, nfactors = 2, rotate = "varimax")

loadings_mat <- as.matrix(fa_result$loadings)
loadings_df <- as.data.frame(loadings_mat)

library(dplyr)
library(psych)
library(ggplot2)

num_vars <- df_encoded %>%
  select_if(is.numeric)

drop_cols <- c("Student_ID", "Gender", "City", "Profession", "Degree", "SleepDuration")
num_vars <- num_vars[, ! names(num_vars) %in% drop_cols ]

variances <- sapply(num_vars, var, na.rm = TRUE)
num_vars <- num_vars[, variances > 0]

cat("Variabel numerik untuk analisis:\n")

## Variabel numerik untuk analisis:

print(names(num_vars))

##  [1] "Age"                                 "Academic.Pressure"                  
##  [3] "Work.Pressure"                       "CGPA"                               
##  [5] "Study.Satisfaction"                  "Job.Satisfaction"                   
##  [7] "Sleep.Duration"                      "Dietary.Habits"                     
##  [9] "Have.you.ever.had.suicidal.thoughts" "Work.Study.Hours"                   
## [11] "Financial.Stress"                    "Family.History.of.Mental.Illness"   
## [13] "Depression"

num_complete <- na.omit(num_vars)

kmo_res      <- KMO(num_complete)
bartlett_res <- cortest.bartlett(cor(num_complete), n = nrow(num_complete))

cat("\n--- KMO Test ---\n")

## 
## --- KMO Test ---

print(kmo_res)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = num_complete)
## Overall MSA =  0.62
## MSA for each item = 
##                                 Age                   Academic.Pressure 
##                                0.70                                0.70 
##                       Work.Pressure                                CGPA 
##                                0.50                                0.50 
##                  Study.Satisfaction                    Job.Satisfaction 
##                                0.76                                0.50 
##                      Sleep.Duration                      Dietary.Habits 
##                                0.56                                0.77 
## Have.you.ever.had.suicidal.thoughts                    Work.Study.Hours 
##                                0.69                                0.76 
##                    Financial.Stress    Family.History.of.Mental.Illness 
##                                0.75                                0.70 
##                          Depression 
##                                0.61

cat("\n--- Bartlett’s Test ---\n")

## 
## --- Bartlett’s Test ---

print(bartlett_res)

## $chisq
## [1] 39351.64
## 
## $p.value
## [1] 0
## 
## $df
## [1] 78

keep_vars <- names(kmo_res$MSAi)[ kmo_res$MSAi >= 0.5 ]
num_vars   <- num_vars[, keep_vars]

cat("\nVariabel setelah seleksi MSA >= 0.5:\n")

## 
## Variabel setelah seleksi MSA >= 0.5:

print(keep_vars)

##  [1] "Age"                                 "Academic.Pressure"                  
##  [3] "Work.Pressure"                       "Study.Satisfaction"                 
##  [5] "Job.Satisfaction"                    "Sleep.Duration"                     
##  [7] "Dietary.Habits"                      "Have.you.ever.had.suicidal.thoughts"
##  [9] "Work.Study.Hours"                    "Financial.Stress"                   
## [11] "Family.History.of.Mental.Illness"    "Depression"

fa.parallel(na.omit(num_vars),
            fm     = "pa",       
            fa     = "fa",
            n.iter = 100,
            main   = "Parallel Analysis untuk FA")

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero

## Parallel analysis suggests that the number of factors =  5  and the number of components =  NA

nf <- 2

fa_res <- fa(na.omit(num_vars),
             nfactors = nf,
             fm       = "pa",       
             rotate   = "oblimin")

## Loading required namespace: GPArotation

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

cat("\n--- Hasil FA (principal axis + oblimin) ---\n")

## 
## --- Hasil FA (principal axis + oblimin) ---

print(fa_res, digits = 2, cutoff = 0.30)

## Factor Analysis using method =  pa
## Call: fa(r = na.omit(num_vars), nfactors = nf, rotate = "oblimin", 
##     fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                                       PA1   PA2      h2      u2 com
## Age                                 -0.21 -0.01 0.04609  0.9539 1.0
## Academic.Pressure                    0.47 -0.03 0.22001  0.7800 1.0
## Work.Pressure                        0.00  0.78 0.60972  0.3903 1.0
## Study.Satisfaction                  -0.17 -0.03 0.02957  0.9704 1.1
## Job.Satisfaction                     0.00  0.78 0.61501  0.3850 1.0
## Sleep.Duration                      -0.02 -0.01 0.00057  0.9994 1.1
## Dietary.Habits                       0.21  0.00 0.04249  0.9575 1.0
## Have.you.ever.had.suicidal.thoughts  0.55  0.00 0.30139  0.6986 1.0
## Work.Study.Hours                     0.21 -0.01 0.04311  0.9569 1.0
## Financial.Stress                     0.37  0.01 0.13330  0.8667 1.0
## Family.History.of.Mental.Illness     0.05 -0.01 0.00260  0.9974 1.0
## Depression                           1.00  0.00 1.00638 -0.0064 1.0
## 
##                        PA1  PA2
## SS loadings           1.82 1.23
## Proportion Var        0.15 0.10
## Cumulative Var        0.15 0.25
## Proportion Explained  0.60 0.40
## Cumulative Proportion 0.60 1.00
## 
##  With factor correlations of 
##       PA1   PA2
## PA1  1.00 -0.01
## PA2 -0.01  1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 factors are sufficient.
## 
## df null model =  66  with the objective function =  1.4 0.3 with Chi Square =  39186.68
## df of  the model are 43  and the objective function was  0.01 
##  0.3
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.01 
##  0.3
## The harmonic n.obs is  27898 with the empirical chi square  380.32  with prob <  1.4e-55 
##  0.3The total n.obs was  27898  with Likelihood Chi Square =  248.62  with prob <  9.6e-31 
##  0.3
## Tucker Lewis Index of factoring reliability =  0.992
## RMSEA index =  0.013  and the 90 % confidence intervals are  0.012 0.015 0.3
## BIC =  -191.54
## Fit based upon off diagonal values = 1

fa.diagram(fa_res)

cat("\n--- Goodness‐of‐Fit FA ---\n")

## 
## --- Goodness‐of‐Fit FA ---

cat("RMSEA =", round(fa_res$RMSEA[1], 3),
    " (90% CI:", paste(round(fa_res$RMSEA[2:3], 3), collapse = "–"), ")\n")

## RMSEA = 0.013  (90% CI: 0.012–0.015 )

cat("TLI   =", round(fa_res$TLI, 3), "\n")

## TLI   = 0.992

cat("RMSR  =", round(fa_res$rms, 3), "\n")

## RMSR  = 0.01

load_df <- as.data.frame(unclass(fa_res$loadings))
load_df$Feature <- rownames(load_df)

selected_fa <- load_df %>%
  filter(abs(PA1) >= 0.40 | abs(PA2) >= 0.40) %>%
  pull(Feature)

cat("\nFitur terpilih dari FA (|loading| >= 0.4):\n")

## 
## Fitur terpilih dari FA (|loading| >= 0.4):

print(selected_fa)

## [1] "Academic.Pressure"                   "Work.Pressure"                      
## [3] "Job.Satisfaction"                    "Have.you.ever.had.suicidal.thoughts"
## [5] "Depression"

num_pca <- num_vars %>%
  mutate_all(~ ifelse(is.infinite(.), NA, .)) %>%
  na.omit()

pca_res <- prcomp(num_pca, scale. = TRUE)

var_exp <- cumsum(pca_res$sdev^2 / sum(pca_res$sdev^2))
plot(var_exp, type = "b",
     xlab = "Jumlah Komponen Principal",
     ylab = "Varians Kumulatif Dijelaskan",
     main = "Varians Kumulatif Dijelaskan oleh PCA")

pca_load_df <- as.data.frame(pca_res$rotation[, 1:2])
pca_load_df$Feature <- rownames(pca_load_df)

selected_pca <- pca_load_df %>%
  filter(abs(PC1) > 0.40 | abs(PC2) > 0.40) %>%
  pull(Feature)

cat("\nFitur terpilih dari PCA (|loading| > 0.4 di PC1/PC2):\n")

## 
## Fitur terpilih dari PCA (|loading| > 0.4 di PC1/PC2):

print(selected_pca)

## [1] "Academic.Pressure"                   "Work.Pressure"                      
## [3] "Job.Satisfaction"                    "Have.you.ever.had.suicidal.thoughts"
## [5] "Depression"

idx <- as.numeric(rownames(num_pca))
Depression_clean <- df_encoded$Depression[idx]

cors <- sapply(num_pca, function(x)
  cor(x, Depression_clean, use = "pairwise.complete.obs"))

cors_sorted <- sort(cors, decreasing = TRUE)
cat("\nKorelasi fitur terhadap Depression:\n")

## 
## Korelasi fitur terhadap Depression:

print(cors_sorted)

##                          Depression Have.you.ever.had.suicidal.thoughts 
##                         0.163506012                         0.091111924 
##                   Academic.Pressure                    Financial.Stress 
##                         0.080186422                         0.067805055 
##                      Dietary.Habits                    Work.Study.Hours 
##                         0.031904021                         0.029979106 
##    Family.History.of.Mental.Illness                      Sleep.Duration 
##                         0.007323482                        -0.004034219 
##                    Job.Satisfaction                       Work.Pressure 
##                        -0.011533817                        -0.012325078 
##                  Study.Satisfaction                                 Age 
##                        -0.035061812                        -0.035495158

cor_df <- data.frame(Feature     = names(cors_sorted),
                     Correlation = cors_sorted)

ggplot(cor_df, aes(x = reorder(Feature, Correlation), y = Correlation)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "Korelasi Fitur Numerik terhadap Depression",
       x = "Fitur", y = "Korelasi") +
  theme_minimal() +
  theme(text = element_text(size = 14))

manual_selected <- names(cors)[abs(cors) > 0.30]

cat("\nFitur terpilih dari korelasi manual (|r| > 0.3):\n")

## 
## Fitur terpilih dari korelasi manual (|r| > 0.3):

print(manual_selected)

## character(0)

final_features <- unique(c(selected_fa,
                           selected_pca,
                           manual_selected))

cat("\nFitur final terpilih untuk analisis selanjutnya:\n")

## 
## Fitur final terpilih untuk analisis selanjutnya:

print(final_features)

## [1] "Academic.Pressure"                   "Work.Pressure"                      
## [3] "Job.Satisfaction"                    "Have.you.ever.had.suicidal.thoughts"
## [5] "Depression"

if(!exists("final_features")) {
  stop("Variabel final_features belum tersedia.")
}
if(!exists("df_encoded")) {
  stop("Variabel df_encoded belum tersedia.")
}

data <- df_encoded

target_var <- "Depression"
if(!(target_var %in% names(data))) {
  stop(paste0("Variabel target '", target_var, "' tidak ditemukan di data."))
}

features_exist <- intersect(final_features, names(data))
if(length(features_exist) < length(final_features)) {
  warning("Beberapa fitur di final_features tidak ditemukan di df_encoded dan akan diabaikan:\n",
          paste(setdiff(final_features, features_exist), collapse = ", "))
}

df_selected <- data[, c(features_exist, target_var), drop = FALSE]

is_cat <- sapply(df_selected[, features_exist, drop = FALSE],
                 function(col) is.factor(col) || is.character(col))

cat_features <- names(is_cat)[is_cat]
if(length(cat_features) > 0) {
  for(feat in cat_features) {
    df_selected[[feat]] <- as.integer(factor(df_selected[[feat]])) - 1
  }
  message("Fitur kategorikal di‐label encoding: ", paste(cat_features, collapse = ", "))
} else {
  message("Tidak ada fitur kategorikal.")
}

## Tidak ada fitur kategorikal.

out_file <- "D:/Mahasiswa/Semester 4/Analisis Multivariat/project/selected_features_data.csv"
write.csv(df_selected, out_file, row.names = FALSE)
message("Data dengan fitur terpilih disimpan ke: ", out_file)

## Data dengan fitur terpilih disimpan ke: D:/Mahasiswa/Semester 4/Analisis Multivariat/project/selected_features_data.csv

Modelling

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(dplyr)
library(car)

## Warning: package 'car' was built under R version 4.4.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.4.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

library(corrplot)
library(pROC)

## Warning: package 'pROC' was built under R version 4.4.3

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(biotools)

## Warning: package 'biotools' was built under R version 4.4.3

## ---
## biotools version 4.3

df <- read.csv("D:/Mahasiswa/Semester 4/Analisis Multivariat/project/selected_features_data.csv")
target_var <- "Depression.1"
df[[target_var]] <- as.factor(df[[target_var]])

df <- na.omit(df)
feature_cols <- setdiff(colnames(df), target_var)

check_constant_per_group <- function(df, feature_cols, target_var) {
  constant_features <- c()
  groups <- levels(df[[target_var]])
  for (f in feature_cols) {
    is_const <- TRUE
    for (g in groups) {
      vals <- df[df[[target_var]] == g, f]
      if (length(unique(vals)) > 1) {
        is_const <- FALSE
        break
      }
    }
    if (is_const) constant_features <- c(constant_features, f)
  }
  return(constant_features)
}

total_constant <- sapply(df[, feature_cols], function(x) length(unique(x)) == 1)
feature_cols <- feature_cols[!total_constant]

group_constant <- check_constant_per_group(df, feature_cols, target_var)
feature_cols <- setdiff(feature_cols, group_constant)

df_selected <- df[, c(feature_cols, target_var)]

set.seed(123)
train_index <- createDataPartition(df_selected[[target_var]], p = 0.8, list = FALSE)
train_data <- df_selected[train_index, ]
test_data <- df_selected[-train_index, ]

model_vif <- lm(as.formula(paste(target_var, "~ .")), data = train_data)

## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored

## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors

vif_vals <- vif(model_vif)

## Warning in Ops.factor(r, 2): '^' not meaningful for factors

## Warning in cov2cor(v): diag(V) had non-positive or NA entries; the non-finite
## result may be dubious

print(vif_vals)

##                   Academic.Pressure                       Work.Pressure 
##                                 NaN                                 NaN 
##                    Job.Satisfaction Have.you.ever.had.suicidal.thoughts 
##                                 NaN                                 NaN

high_vif <- names(vif_vals[vif_vals > 10])
feature_cols <- setdiff(feature_cols, high_vif)
df_selected <- df_selected[, c(feature_cols, target_var)]
train_data <- df_selected[train_index, ]
test_data <- df_selected[-train_index, ]

cor_matrix <- cor(train_data[, feature_cols])
corrplot(cor_matrix, method = "color", tl.cex = 0.6)

cat("\n===== [1] Pembentukan Model (Estimasi Parameter) - Regresi Logistik =====\n")

## 
## ===== [1] Pembentukan Model (Estimasi Parameter) - Regresi Logistik =====

logit_model <- glm(as.formula(paste(target_var, "~ .")), data = train_data, family = binomial())
summary(logit_model)

## 
## Call:
## glm(formula = as.formula(paste(target_var, "~ .")), family = binomial(), 
##     data = train_data)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -3.72121    0.05743 -64.791   <2e-16 ***
## Academic.Pressure                    0.82378    0.01463  56.318   <2e-16 ***
## Work.Pressure                       -5.27350   13.09207  -0.403    0.687    
## Job.Satisfaction                    13.04652    8.23370   1.585    0.113    
## Have.you.ever.had.suicidal.thoughts  2.54364    0.03821  66.578   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 30288  on 22320  degrees of freedom
## Residual deviance: 19410  on 22316  degrees of freedom
## AIC: 19420
## 
## Number of Fisher Scoring iterations: 5

cat("\n===== [2A] Uji Signifikansi Parsial (Wald Test) =====\n")

## 
## ===== [2A] Uji Signifikansi Parsial (Wald Test) =====

print(coef(summary(logit_model))[, 4])

##                         (Intercept)                   Academic.Pressure 
##                           0.0000000                           0.0000000 
##                       Work.Pressure                    Job.Satisfaction 
##                           0.6870944                           0.1130737 
## Have.you.ever.had.suicidal.thoughts 
##                           0.0000000

cat("\n===== [2B] Uji Signifikansi Serentak (Likelihood Ratio Test) =====\n")

## 
## ===== [2B] Uji Signifikansi Serentak (Likelihood Ratio Test) =====

null_model <- glm(as.formula(paste(target_var, "~ 1")), data = train_data, family = binomial())
lrt_result <- anova(null_model, logit_model, test = "Chisq")
print(lrt_result)

## Analysis of Deviance Table
## 
## Model 1: Depression.1 ~ 1
## Model 2: Depression.1 ~ Academic.Pressure + Work.Pressure + Job.Satisfaction + 
##     Have.you.ever.had.suicidal.thoughts
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     22320      30288                          
## 2     22316      19410  4    10877 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("\n===== [3] Akurasi Model - Regresi Logistik =====\n")

## 
## ===== [3] Akurasi Model - Regresi Logistik =====

logit_prob <- predict(logit_model, newdata = test_data, type = "response")
logit_pred <- ifelse(logit_prob > 0.5, levels(df_selected[[target_var]])[2], levels(df_selected[[target_var]])[1])
logit_pred <- factor(logit_pred, levels = levels(df_selected[[target_var]]))
logit_cm <- confusionMatrix(logit_pred, test_data[[target_var]])
print(logit_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1685  488
##          1  628 2779
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7893, 0.8104)
##     No Information Rate : 0.5855          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5843          
##                                           
##  Mcnemar's Test P-Value : 3.171e-05       
##                                           
##             Sensitivity : 0.7285          
##             Specificity : 0.8506          
##          Pos Pred Value : 0.7754          
##          Neg Pred Value : 0.8157          
##              Prevalence : 0.4145          
##          Detection Rate : 0.3020          
##    Detection Prevalence : 0.3894          
##       Balanced Accuracy : 0.7896          
##                                           
##        'Positive' Class : 0               
##

cat("\n===== [4] Interpretasi Odds Ratio =====\n")

## 
## ===== [4] Interpretasi Odds Ratio =====

or <- exp(cbind(OddsRatio = coef(logit_model), confint(logit_model)))

## Waiting for profiling to be done...

print(round(or, 3))

##                                      OddsRatio  2.5 %       97.5 %
## (Intercept)                              0.024  0.022 2.700000e-02
## Academic.Pressure                        2.279  2.215 2.346000e+00
## Work.Pressure                            0.005  0.000 6.828217e+08
## Job.Satisfaction                    463481.324  0.039 2.331603e+13
## Have.you.ever.had.suicidal.thoughts     12.726 11.811 1.372000e+01

for (f in names(coef(logit_model))[-1]) {
  val <- round(or[f, 1], 2)
  cat(paste0("Jika ", f, " meningkat 1 unit, maka odds terhadap 1 meningkat sebesar ", val, " kali lipat.\n"))
}

## Jika Academic.Pressure meningkat 1 unit, maka odds terhadap 1 meningkat sebesar 2.28 kali lipat.
## Jika Work.Pressure meningkat 1 unit, maka odds terhadap 1 meningkat sebesar 0.01 kali lipat.
## Jika Job.Satisfaction meningkat 1 unit, maka odds terhadap 1 meningkat sebesar 463481.32 kali lipat.
## Jika Have.you.ever.had.suicidal.thoughts meningkat 1 unit, maka odds terhadap 1 meningkat sebesar 12.73 kali lipat.

cat("\n===== AUC & ROC Curve - Regresi Logistik =====\n")

## 
## ===== AUC & ROC Curve - Regresi Logistik =====

roc_obj <- roc(test_data[[target_var]], logit_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

cat("AUC:", auc(roc_obj), "\n")

## AUC: 0.8639032

plot(roc_obj, col = "blue", main = "ROC Curve Logistic Regression")

cat("\n===== LDA - Linear Discriminant Analysis =====\n")

## 
## ===== LDA - Linear Discriminant Analysis =====

cat("\n===== [1A] Uji Normalitas Multivariat (Manual - Mahalanobis) =====\n")

## 
## ===== [1A] Uji Normalitas Multivariat (Manual - Mahalanobis) =====

mahal_dist <- mahalanobis(
  train_data[, feature_cols],
  colMeans(train_data[, feature_cols]),
  cov(train_data[, feature_cols])
)

alpha <- 0.001
df_mvn <- length(feature_cols)
threshold <- qchisq(1 - alpha, df = df_mvn)
outlier_index <- which(mahal_dist > threshold)
cat("Jumlah outlier multivariat:", length(outlier_index), "\n")

## Jumlah outlier multivariat: 8

cat("Persentase outlier:", round(length(outlier_index) / nrow(train_data) * 100, 2), "%\n")

## Persentase outlier: 0.04 %

if (!require("ggplot2")) install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)

ggplot(data.frame(Index = 1:length(mahal_dist), MD = mahal_dist), aes(x = Index, y = MD)) +
  geom_point(color = "steelblue") +
  geom_hline(yintercept = threshold, color = "red", linetype = "dashed") +
  labs(title = "Mahalanobis Distance untuk Deteksi Outlier",
       subtitle = paste("Threshold =", round(threshold, 2)),
       y = "Mahalanobis Distance") +
  theme_minimal()

cat("\n===== [1B] Uji Homogenitas Kovarian (Box's M Test) =====\n")

## 
## ===== [1B] Uji Homogenitas Kovarian (Box's M Test) =====

boxM_res <- boxM(train_data[, feature_cols], train_data[[target_var]])
print(boxM_res)

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  train_data[, feature_cols]
## Chi-Sq (approx.) = 3869.2, df = 10, p-value < 2.2e-16

cat("\n===== [2] Estimasi Model & Koefisien =====\n")

## 
## ===== [2] Estimasi Model & Koefisien =====

lda_model <- lda(as.formula(paste(target_var, "~ .")), data = train_data)
print(lda_model)

## Call:
## lda(as.formula(paste(target_var, "~ .")), data = train_data)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4144976 0.5855024 
## 
## Group means:
##   Academic.Pressure Work.Pressure Job.Satisfaction
## 0          2.366083  2.862218e-05     5.787469e-05
## 1          3.702196  1.013132e-05     4.097151e-05
##   Have.you.ever.had.suicidal.thoughts
## 0                           0.3182015
## 1                           0.8543882
## 
## Coefficients of linear discriminants:
##                                            LD1
## Academic.Pressure                    0.5272663
## Work.Pressure                       -3.6685356
## Job.Satisfaction                     7.7929404
## Have.you.ever.had.suicidal.thoughts  1.9107315

cat("\n===== [3] Akurasi Model - Diskriminan =====\n")

## 
## ===== [3] Akurasi Model - Diskriminan =====

lda_pred <- predict(lda_model, test_data)
lda_class <- factor(lda_pred$class, levels = levels(df_selected[[target_var]]))
lda_cm <- confusionMatrix(lda_class, test_data[[target_var]])
print(lda_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1685  488
##          1  628 2779
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7893, 0.8104)
##     No Information Rate : 0.5855          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5843          
##                                           
##  Mcnemar's Test P-Value : 3.171e-05       
##                                           
##             Sensitivity : 0.7285          
##             Specificity : 0.8506          
##          Pos Pred Value : 0.7754          
##          Neg Pred Value : 0.8157          
##              Prevalence : 0.4145          
##          Detection Rate : 0.3020          
##    Detection Prevalence : 0.3894          
##       Balanced Accuracy : 0.7896          
##                                           
##        'Positive' Class : 0               
##

cat("\n===== [4] Interpretasi Model - Koefisien Fungsi Diskriminan =====\n")

## 
## ===== [4] Interpretasi Model - Koefisien Fungsi Diskriminan =====

print(lda_model$scaling)

##                                            LD1
## Academic.Pressure                    0.5272663
## Work.Pressure                       -3.6685356
## Job.Satisfaction                     7.7929404
## Have.you.ever.had.suicidal.thoughts  1.9107315

cat("\nVariabel dengan kontribusi terbesar:\n")

## 
## Variabel dengan kontribusi terbesar:

loading_order <- order(abs(lda_model$scaling), decreasing = TRUE)
print(lda_model$scaling[loading_order, , drop = FALSE])

##                                            LD1
## Job.Satisfaction                     7.7929404
## Work.Pressure                       -3.6685356
## Have.you.ever.had.suicidal.thoughts  1.9107315
## Academic.Pressure                    0.5272663

Script Metode Kelompok 7_Sains Data 2023D

2025-05-26