options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("ggplot2")
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages
install.packages("dplyr")
## package 'dplyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages
install.packages("psych")
## package 'psych' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages
install.packages("factoextra")
## package 'factoextra' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages
install.packages("corrplot")
## package 'corrplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages
install.packages("naniar")
## package 'naniar' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages
install.packages("caret")
## package 'caret' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages
install.packages("biotools")
## package 'biotools' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpKy3Qal\downloaded_packages
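The calls above re-download each package even when it is already installed. A minimal sketch of installing only the missing ones (same package list; pkgs and missing_pkgs are illustrative names):
pkgs <- c("ggplot2", "dplyr", "psych", "factoextra", "corrplot", "naniar", "caret", "biotools")
# keep only the packages that cannot currently be loaded from the library
missing_pkgs <- pkgs[!sapply(pkgs, requireNamespace, quietly = TRUE)]
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)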
Reading the Data
data <- read.csv("D:/Mahasiswa/Semester 4/Analisis Multivariat/project/student_depression_dataset.csv", stringsAsFactors = TRUE)
Data Structure
str(data)
## 'data.frame': 27901 obs. of 18 variables:
## $ Student_ID : int 2 8 26 30 32 33 52 56 59 62 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 2 1 2 2 ...
## $ Age : int 33 24 31 28 25 29 30 30 28 31 ...
## $ City : Factor w/ 52 levels "'Less Delhi'",..: 52 6 45 50 19 40 47 9 34 38 ...
## $ Profession : Factor w/ 13 levels "'Civil Engineer'",..: 12 12 12 12 12 12 12 12 12 12 ...
## $ Academic.Pressure : int 5 2 3 3 4 2 3 2 3 2 ...
## $ Work.Pressure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CGPA : num 8.97 5.9 7.03 5.59 8.13 5.7 9.54 8.04 9.79 8.38 ...
## $ Study.Satisfaction : int 2 5 5 2 3 3 4 4 1 3 ...
## $ Job.Satisfaction : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Sleep.Duration : Factor w/ 5 levels "'5-6 hours'",..: 1 1 3 2 1 3 2 3 2 3 ...
## $ Dietary.Habits : Factor w/ 4 levels "Healthy","Moderate",..: 1 2 1 2 2 1 1 4 2 2 ...
## $ Degree : Factor w/ 28 levels "'Class 12'","B.Arch",..: 5 12 7 9 18 28 12 1 4 13 ...
## $ Have.you.ever.had.suicidal.thoughts: Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 1 1 2 2 ...
## $ Work.Study.Hours : int 3 3 9 4 1 4 1 0 12 2 ...
## $ Financial.Stress : Factor w/ 6 levels "?","1","2","3",..: 2 3 2 6 2 2 3 2 4 6 ...
## $ Family.History.of.Mental.Illness : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 1 2 1 1 ...
## $ Depression : int 1 0 0 1 0 0 0 0 1 1 ...
summary(data)
## Student_ID Gender Age City
## Min. : 2 Female:12354 Min. :18.00 Kalyan : 1570
## 1st Qu.: 35039 Male :15547 1st Qu.:21.00 Srinagar : 1372
## Median : 70684 Median :25.00 Hyderabad : 1340
## Mean : 70442 Mean :25.82 Vasai-Virar: 1290
## 3rd Qu.:105818 3rd Qu.:30.00 Lucknow : 1155
## Max. :140699 Max. :59.00 Thane : 1139
## (Other) :20035
## Profession Academic.Pressure Work.Pressure
## Student :27871 Min. :0.000 Min. :0.00000
## Architect : 8 1st Qu.:2.000 1st Qu.:0.00000
## Teacher : 6 Median :3.000 Median :0.00000
## 'Digital Marketer': 3 Mean :3.141 Mean :0.00043
## 'Content Writer' : 2 3rd Qu.:4.000 3rd Qu.:0.00000
## Chef : 2 Max. :5.000 Max. :5.00000
## (Other) : 9
## CGPA Study.Satisfaction Job.Satisfaction
## Min. : 0.000 Min. :0.000 Min. :0.000000
## 1st Qu.: 6.290 1st Qu.:2.000 1st Qu.:0.000000
## Median : 7.770 Median :3.000 Median :0.000000
## Mean : 7.656 Mean :2.944 Mean :0.000681
## 3rd Qu.: 8.920 3rd Qu.:4.000 3rd Qu.:0.000000
## Max. :10.000 Max. :5.000 Max. :4.000000
##
## Sleep.Duration Dietary.Habits Degree
## '5-6 hours' :6183 Healthy : 7651 'Class 12': 6080
## '7-8 hours' :7346 Moderate : 9921 B.Ed : 1867
## 'Less than 5 hours':8310 Others : 12 B.Com : 1506
## 'More than 8 hours':6044 Unhealthy:10317 B.Arch : 1478
## Others : 18 BCA : 1433
## MSc : 1190
## (Other) :14347
## Have.you.ever.had.suicidal.thoughts Work.Study.Hours Financial.Stress
## No :10245 Min. : 0.000 ?: 3
## Yes:17656 1st Qu.: 4.000 1:5121
## Median : 8.000 2:5061
## Mean : 7.157 3:5226
## 3rd Qu.:10.000 4:5775
## Max. :12.000 5:6715
##
## Family.History.of.Mental.Illness Depression
## No :14398 Min. :0.0000
## Yes:13503 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5855
## 3rd Qu.:1.0000
## Max. :1.0000
##
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(readr)
data <- read_csv("D:/Mahasiswa/Semester 4/Analisis Multivariat/project/student_depression_dataset.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 27901 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Gender, City, Profession, Sleep Duration, Dietary Habits, Degree, ...
## dbl (10): Student_ID, Age, Academic Pressure, Work Pressure, CGPA, Study Sat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
categorical_cols <- c("Gender", "City", "Profession", "Sleep Duration",
"Dietary Habits", "Degree", "Have you ever had suicidal thoughts",
"Financial Stress", "Family History of Mental Illness")
numerical_cols <- c("Age", "Academic Pressure", "Work Pressure",
"CGPA", "Study Satisfaction", "Job Satisfaction",
"Work/Study Hours")
for (col in categorical_cols) {
print(ggplot(data, aes(x = .data[[col]])) +
geom_bar(fill = "steelblue") +
labs(title = paste("Bar Plot of", col), x = col, y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)))
}
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_count()`).
for (col in numerical_cols) {
print(ggplot(data, aes(x = .data[[col]], y = Depression)) +
geom_jitter(alpha = 0.5, color = "tomato") +
labs(title = paste("Scatter Plot of", col, "vs Depression"),
x = col, y = "Depression") +
theme(plot.title = element_text(hjust = 0.5)))
}
library(ggplot2)
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
data <- read_csv("D:/Mahasiswa/Semester 4/Analisis Multivariat/project/student_depression_dataset.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 27901 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Gender, City, Profession, Sleep Duration, Dietary Habits, Degree, ...
## dbl (10): Student_ID, Age, Academic Pressure, Work Pressure, CGPA, Study Sat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
categorical_cols <- c("Gender", "City", "Profession", "Sleep Duration",
"Dietary Habits", "Degree", "Have you ever had suicidal thoughts",
"Financial Stress", "Family History of Mental Illness")
numerical_cols <- c("Age", "Academic Pressure", "Work Pressure",
"CGPA", "Study Satisfaction", "Job Satisfaction",
"Work/Study Hours")
data[categorical_cols] <- lapply(data[categorical_cols], as.character)
data_categorical_long <- data[, categorical_cols] %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(data_categorical_long, aes(x = Value)) +
geom_bar(fill = "steelblue") +
facet_wrap(~ Variable, scales = "free_x") +
labs(title = "Bar Plot of Categorical Variables", x = "Category", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5))
data_numerical_long <- data[, c(numerical_cols, "Depression")] %>%
pivot_longer(cols = -Depression, names_to = "Variable", values_to = "Value")
ggplot(data_numerical_long, aes(x = Value, y = Depression)) +
geom_jitter(alpha = 0.5, color = "tomato") +
facet_wrap(~ Variable, scales = "free_x") +
labs(title = "Scatter Plot of Numerical Variables vs Depression",
x = "Value", y = "Depression") +
theme(plot.title = element_text(hjust = 0.5))
Descriptive Statistics
library(dplyr)
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
numeric_vars <- select_if(data, is.numeric)
describe(numeric_vars)
## vars n mean sd median trimmed mad min
## Student_ID 1 27901 70442.15 40641.18 70684.00 70454.87 52412.88 2
## Age 2 27901 25.82 4.91 25.00 25.77 5.93 18
## Academic Pressure 3 27901 3.14 1.38 3.00 3.18 1.48 0
## Work Pressure 4 27901 0.00 0.04 0.00 0.00 0.00 0
## CGPA 5 27901 7.66 1.47 7.77 7.67 1.88 0
## Study Satisfaction 6 27901 2.94 1.36 3.00 2.93 1.48 0
## Job Satisfaction 7 27901 0.00 0.04 0.00 0.00 0.00 0
## Work/Study Hours 8 27901 7.16 3.71 8.00 7.40 4.45 0
## Depression 9 27901 0.59 0.49 1.00 0.61 0.00 0
## max range skew kurtosis se
## Student_ID 140699 140697 -0.01 -1.21 243.31
## Age 59 41 0.13 -0.85 0.03
## Academic Pressure 5 5 -0.14 -1.16 0.01
## Work Pressure 5 5 108.58 12107.60 0.00
## CGPA 10 10 -0.11 -1.02 0.01
## Study Satisfaction 5 5 0.01 -1.22 0.01
## Job Satisfaction 4 4 74.10 5925.51 0.00
## Work/Study Hours 12 12 -0.45 -1.00 0.02
## Depression 1 1 -0.35 -1.88 0.00
Histogram
library(ggplot2)
ggplot(data, aes(x = CGPA)) +
geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
labs(title = "Histogram: Distribusi CGPA", x = "CGPA", y = "Frekuensi") +
theme_minimal() +
theme(text = element_text(size = 14))
Example: Distribution of Sleep Duration (Bar Plot)
data$SleepDuration <- gsub("'", "", data$`Sleep Duration`)
ggplot(data, aes(x = SleepDuration)) +
geom_bar(fill = "darkgreen") +
labs(title = "Histogram: Distribusi Durasi Tidur", x = "Durasi Tidur", y = "Frekuensi") +
theme_minimal() +
theme(text = element_text(size = 14))
Numeric Features vs. Label
library(ggplot2)
library(dplyr)
numeric_names <- colnames(numeric_vars)
for (feature in numeric_names) {
binned <- data %>%
mutate(binned_feature = cut(.data[[feature]], breaks = 5)) %>%
group_by(binned_feature) %>%
summarise(mean_depression = mean(Depression, na.rm = TRUE))
print(
ggplot(binned, aes(x = binned_feature, y = mean_depression)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = paste("Bar Plot:", feature, "(binned) vs Mean Depression"),
x = paste(feature, "(binned)"), y = "Mean Depression") +
theme_minimal() +
theme(text = element_text(size = 14))
)
}
Boxplot
for (feature in numeric_names) {
print(
ggplot(data, aes(x = factor(Depression), y = .data[[feature]])) +
geom_boxplot(fill = "skyblue") +
labs(title = paste("Boxplot:", feature, "berdasarkan Status Depression"),
x = "Depression (0 = Tidak, 1 = Ya)", y = feature) +
theme_minimal() +
theme(text = element_text(size = 14))
)
}
Bar Plot of Sleep Duration by Label
ggplot(data, aes(x = SleepDuration, fill = factor(Depression))) +
geom_bar(position = "dodge") +
labs(title = "Bar Plot: Durasi Tidur berdasarkan Status Depression",
x = "Durasi Tidur", y = "Frekuensi", fill = "Depression") +
theme_minimal() +
theme(text = element_text(size = 14))
Density Plot of CGPA by Label
ggplot(data, aes(x = CGPA, fill = factor(Depression))) +
geom_density(alpha = 0.5) +
labs(title = "Density Plot: Distribusi CGPA berdasarkan Status Depression",
x = "CGPA", fill = "Depression") +
theme_minimal() +
theme(text = element_text(size = 14))
Pairs Plot
library(dplyr)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
pair_data <- dplyr::select(data,
"Academic Pressure",
"Work Pressure",
"CGPA",
"Study Satisfaction",
"Job Satisfaction",
"Work/Study Hours",
"Depression")
ggpairs(pair_data, aes(color = factor(Depression), alpha = 0.6)) +
theme_bw() +
ggtitle("Pairs Plot: Hubungan antar Fitur Numerik dan Status Depression") +
theme(text = element_text(size = 14), plot.title = element_text(hjust = 0.5))
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
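The repeated "the standard deviation is zero" warnings arise because Work Pressure and Job Satisfaction are almost constant within each Depression group. A sketch that screens out near-zero-variance columns before plotting, using caret::nearZeroVar (assuming pair_data as built above; nzv_idx and pair_data_nzv are illustrative names):
library(caret)
# column indices flagged as having (near) zero variance
nzv_idx <- nearZeroVar(pair_data)
pair_data_nzv <- if (length(nzv_idx) > 0) pair_data[, -nzv_idx] else pair_data
ggpairs(pair_data_nzv, aes(color = factor(Depression), alpha = 0.6)) +
  theme_bw()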
Heatmap
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(dplyr)
cor_data <- dplyr::select(data, where(is.numeric))
cor_matrix <- cor(cor_data, use = "complete.obs", method = "pearson")
print(round(cor_matrix, 2))
## Student_ID Age Academic Pressure Work Pressure CGPA
## Student_ID 1.00 0.00 0.01 0.00 -0.01
## Age 0.00 1.00 -0.08 0.00 0.01
## Academic Pressure 0.01 -0.08 1.00 -0.02 -0.02
## Work Pressure 0.00 0.00 -0.02 1.00 -0.05
## CGPA -0.01 0.01 -0.02 -0.05 1.00
## Study Satisfaction 0.01 0.01 -0.11 -0.02 -0.04
## Job Satisfaction 0.00 0.00 -0.02 0.77 -0.05
## Work/Study Hours 0.00 -0.03 0.10 -0.01 0.00
## Depression 0.00 -0.23 0.47 0.00 0.02
## Study Satisfaction Job Satisfaction Work/Study Hours
## Student_ID 0.01 0.00 0.00
## Age 0.01 0.00 -0.03
## Academic Pressure -0.11 -0.02 0.10
## Work Pressure -0.02 0.77 -0.01
## CGPA -0.04 -0.05 0.00
## Study Satisfaction 1.00 -0.02 -0.04
## Job Satisfaction -0.02 1.00 -0.01
## Work/Study Hours -0.04 -0.01 1.00
## Depression -0.17 0.00 0.21
## Depression
## Student_ID 0.00
## Age -0.23
## Academic Pressure 0.47
## Work Pressure 0.00
## CGPA 0.02
## Study Satisfaction -0.17
## Job Satisfaction 0.00
## Work/Study Hours 0.21
## Depression 1.00
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black",
number.cex = 0.8, col = colorRampPalette(c("blue", "white", "red"))(200),
title = "Heatmap: Korelasi antar Fitur Numerik", mar = c(0,0,2,0))
Checking Missing Values
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
missing_summary <- sapply(data, function(x) sum(is.na(x)))
print(missing_summary)
## Student_ID Gender
## 0 0
## Age City
## 0 0
## Profession Academic Pressure
## 0 0
## Work Pressure CGPA
## 0 0
## Study Satisfaction Job Satisfaction
## 0 0
## Sleep Duration Dietary Habits
## 0 0
## Degree Have you ever had suicidal thoughts
## 0 0
## Work/Study Hours Financial Stress
## 0 3
## Family History of Mental Illness Depression
## 0 0
## SleepDuration
## 0
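naniar is loaded above but only a base sapply() summary is used; its own helpers give the same per-variable counts plus a quick visual overview (a minimal sketch on the same data):
miss_var_summary(data)  # table of missing counts and percentages per variable
gg_miss_var(data)       # bar chart of missingness per variable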
Checking Outliers
library(ggplot2)
# Convert problematic column names to a syntactically safe format
colnames(data) <- make.names(colnames(data))
# Make sure the numeric column names match the repaired names
numeric_vars <- data[, c("Age", "Academic.Pressure", "Work.Pressure",
"CGPA", "Study.Satisfaction", "Job.Satisfaction",
"Work.Study.Hours")]
# 1. Boxplot visualization for outlier detection
for (feature in names(numeric_vars)) {
print(
ggplot(data, aes(y = .data[[feature]])) +
geom_boxplot(fill = "orange", outlier.color = "red", outlier.shape = 8) +
labs(title = paste("Boxplot Deteksi Outlier:", feature)) +
theme_minimal() +
theme(text = element_text(size = 14))
)
}
# 2. Programmatic outlier detection using the IQR rule
outliers <- list()
outlier_counts <- numeric()
for (col in names(numeric_vars)) {
Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outlier_indices <- which(data[[col]] < lower_bound | data[[col]] > upper_bound)
outliers[[col]] <- outlier_indices
outlier_counts[col] <- length(outlier_indices)
}
# 3. Show the outlier detection results
cat("Jumlah outlier per variabel:\n")
## Jumlah outlier per variabel:
print(outlier_counts)
## Age Academic.Pressure Work.Pressure CGPA
## 12 0 3 9
## Study.Satisfaction Job.Satisfaction Work.Study.Hours
## 0 8 0
cols_with_outliers <- names(outlier_counts[outlier_counts > 0])
cat("\nKolom yang mengandung outlier:\n")
##
## Kolom yang mengandung outlier:
print(cols_with_outliers)
## [1] "Age" "Work.Pressure" "CGPA" "Job.Satisfaction"
Handling Outliers
library(dplyr)
numeric_cols <- dplyr::select(data, where(is.numeric))
means <- sapply(numeric_cols, mean, na.rm = TRUE)
sds <- sapply(numeric_cols, sd, na.rm = TRUE)
lower_bounds <- means - 3 * sds
upper_bounds <- means + 3 * sds
for (col in names(numeric_cols)) {
numeric_cols[[col]][numeric_cols[[col]] < lower_bounds[col]] <- lower_bounds[col]
numeric_cols[[col]][numeric_cols[[col]] > upper_bounds[col]] <- upper_bounds[col]
}
data[names(numeric_cols)] <- numeric_cols
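A quick sanity check that the capping worked, reusing the bounds computed above: every numeric column should now lie within mean ± 3*sd.
sapply(names(numeric_cols), function(col)
  all(data[[col]] >= lower_bounds[col] & data[[col]] <= upper_bounds[col], na.rm = TRUE))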
Checking Duplicates
dup_count <- sum(duplicated(data))
cat("Jumlah baris duplikat:", dup_count, "\n")
## Jumlah baris duplikat: 0
if (dup_count > 0) {
data <- data[!duplicated(data), ]
cat("Duplikat sudah dihapus\n")
}
Checking for Inconsistent Data
library(dplyr)
sapply(dplyr::select(data, where(is.factor)), levels)
## named list()
library(dplyr)
library(stringr)
factor_cols <- dplyr::select(data, names(data)[sapply(data, is.factor)])
clean_levels <- function(x) {
x <- as.character(x)
x <- str_replace_all(x, "\\\\'", "")
x <- str_replace_all(x, "''", "'")
x <- str_replace_all(x, "^'+|'+$", "")
x <- str_trim(x)
x[x == "?"] <- NA
return(factor(x))
}
factor_cleaned <- mutate_all(factor_cols, clean_levels)
data[names(factor_cleaned)] <- factor_cleaned
sapply(dplyr::select(data, names(factor_cleaned)), levels)
## named list()
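Note that read_csv() returns character columns rather than factors, so the factor-based selection above matches nothing (hence the empty named list() printed twice) and the quote/"?" cleaning is never actually applied. A sketch of applying the same clean_levels() to the character columns instead, which would also turn the "?" entries in Financial.Stress into NA:
data <- data %>%
  mutate(across(where(is.character), clean_levels))
sapply(dplyr::select(data, where(is.factor)), levels)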
Data Normalization (optional)
library(dplyr)
normalize <- function(x) {
return ((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
}
data_scaled <- data
num_cols <- names(dplyr::select(data, where(is.numeric)))
data_scaled[num_cols] <- lapply(data_scaled[num_cols], normalize)
summary(data_scaled[num_cols])
## Student_ID Age Academic.Pressure Work.Pressure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.2490 1st Qu.:0.1331 1st Qu.:0.4000 1st Qu.:0.0000000
## Median :0.5024 Median :0.3106 Median :0.6000 Median :0.0000000
## Mean :0.5007 Mean :0.3468 Mean :0.6282 Mean :0.0001075
## 3rd Qu.:0.7521 3rd Qu.:0.5324 3rd Qu.:0.8000 3rd Qu.:0.0000000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000000
## CGPA Study.Satisfaction Job.Satisfaction Work.Study.Hours
## Min. :0.0000 Min. :0.0000 Min. :0.0000000 Min. :0.0000
## 1st Qu.:0.4509 1st Qu.:0.4000 1st Qu.:0.0000000 1st Qu.:0.3333
## Median :0.6699 Median :0.6000 Median :0.0000000 Median :0.6667
## Mean :0.6532 Mean :0.5888 Mean :0.0002867 Mean :0.5964
## 3rd Qu.:0.8401 3rd Qu.:0.8000 3rd Qu.:0.0000000 3rd Qu.:0.8333
## Max. :1.0000 Max. :1.0000 Max. :1.0000000 Max. :1.0000
## Depression
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5855
## 3rd Qu.:1.0000
## Max. :1.0000
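As an alternative to min-max scaling, z-score standardization with base R's scale() can be sketched on the same numeric columns (data_standardized is an illustrative name):
data_standardized <- data
data_standardized[num_cols] <- lapply(data_standardized[num_cols], function(x) as.numeric(scale(x)))
summary(data_standardized[num_cols])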
Label Distribution
table(data$Depression)
##
## 0 1
## 11565 16336
ggplot(data, aes(x = factor(Depression))) +
geom_bar(fill = "purple") +
labs(title = "Distribusi Kelas Depression", x = "Depression", y = "Jumlah") +
theme_minimal() + theme(text = element_text(size = 14))
data
## # A tibble: 27,901 × 19
## Student_ID Gender Age City Profession Academic.Pressure Work.Pressure
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 2 Male 33 Visakhapa… Student 5 0
## 2 8 Female 24 Bangalore Student 2 0
## 3 26 Male 31 Srinagar Student 3 0
## 4 30 Female 28 Varanasi Student 3 0
## 5 32 Female 25 Jaipur Student 4 0
## 6 33 Male 29 Pune Student 2 0
## 7 52 Male 30 Thane Student 3 0
## 8 56 Female 30 Chennai Student 2 0
## 9 59 Male 28 Nagpur Student 3 0
## 10 62 Male 31 Nashik Student 2 0
## # ℹ 27,891 more rows
## # ℹ 12 more variables: CGPA <dbl>, Study.Satisfaction <dbl>,
## # Job.Satisfaction <dbl>, Sleep.Duration <chr>, Dietary.Habits <chr>,
## # Degree <chr>, Have.you.ever.had.suicidal.thoughts <chr>,
## # Work.Study.Hours <dbl>, Financial.Stress <chr>,
## # Family.History.of.Mental.Illness <chr>, Depression <dbl>,
## # SleepDuration <chr>
cat_features <- names(data)[sapply(data, function(x) is.factor(x) | is.character(x))]
label_encode <- function(x) {
factor_x <- as.factor(x)
as.numeric(factor_x) - 1
}
df_encoded <- data
for (feat in cat_features) {
df_encoded[[feat]] <- label_encode(data[[feat]])
}
cat("Fitur kategorikal yang di-label encoding:\n")
## Fitur kategorikal yang di-label encoding:
print(cat_features)
## [1] "Gender" "City"
## [3] "Profession" "Sleep.Duration"
## [5] "Dietary.Habits" "Degree"
## [7] "Have.you.ever.had.suicidal.thoughts" "Financial.Stress"
## [9] "Family.History.of.Mental.Illness" "SleepDuration"
df_encoded
## # A tibble: 27,901 × 19
## Student_ID Gender Age City Profession Academic.Pressure Work.Pressure
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 1 33 51 11 5 0
## 2 8 0 24 5 11 2 0
## 3 26 1 31 44 11 3 0
## 4 30 0 28 49 11 3 0
## 5 32 0 25 18 11 4 0
## 6 33 1 29 39 11 2 0
## 7 52 1 30 46 11 3 0
## 8 56 0 30 8 11 2 0
## 9 59 1 28 33 11 3 0
## 10 62 1 31 37 11 2 0
## # ℹ 27,891 more rows
## # ℹ 12 more variables: CGPA <dbl>, Study.Satisfaction <dbl>,
## # Job.Satisfaction <dbl>, Sleep.Duration <dbl>, Dietary.Habits <dbl>,
## # Degree <dbl>, Have.you.ever.had.suicidal.thoughts <dbl>,
## # Work.Study.Hours <dbl>, Financial.Stress <dbl>,
## # Family.History.of.Mental.Illness <dbl>, Depression <dbl>,
## # SleepDuration <dbl>
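Label encoding imposes an arbitrary ordering on nominal variables such as City and Degree; one-hot (dummy) encoding is a common alternative, sketched here with caret::dummyVars (data_cat, dmy, and onehot are illustrative names):
library(caret)
# convert the categorical columns to factors, then expand them into 0/1 indicator columns
data_cat <- as.data.frame(lapply(data[cat_features], factor))
dmy <- dummyVars(~ ., data = data_cat, fullRank = TRUE)
onehot <- as.data.frame(predict(dmy, newdata = data_cat))
dim(onehot)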
library(dplyr)
data$Depression <- as.numeric(as.character(data$Depression))
num_vars <- dplyr::select(data, where(is.numeric))
Feature Selection
library(psych)
fa_result <- fa(num_vars, nfactors = 2, rotate = "varimax")
loadings_mat <- as.matrix(fa_result$loadings)
loadings_df <- as.data.frame(loadings_mat)
library(dplyr)
library(psych)
library(ggplot2)
num_vars <- df_encoded %>%
select_if(is.numeric)
drop_cols <- c("Student_ID", "Gender", "City", "Profession", "Degree", "SleepDuration")
num_vars <- num_vars[, ! names(num_vars) %in% drop_cols ]
variances <- sapply(num_vars, var, na.rm = TRUE)
num_vars <- num_vars[, variances > 0]
cat("Variabel numerik untuk analisis:\n")
## Variabel numerik untuk analisis:
print(names(num_vars))
## [1] "Age" "Academic.Pressure"
## [3] "Work.Pressure" "CGPA"
## [5] "Study.Satisfaction" "Job.Satisfaction"
## [7] "Sleep.Duration" "Dietary.Habits"
## [9] "Have.you.ever.had.suicidal.thoughts" "Work.Study.Hours"
## [11] "Financial.Stress" "Family.History.of.Mental.Illness"
## [13] "Depression"
num_complete <- na.omit(num_vars)
kmo_res <- KMO(num_complete)
bartlett_res <- cortest.bartlett(cor(num_complete), n = nrow(num_complete))
cat("\n--- KMO Test ---\n")
##
## --- KMO Test ---
print(kmo_res)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = num_complete)
## Overall MSA = 0.62
## MSA for each item =
## Age Academic.Pressure
## 0.70 0.70
## Work.Pressure CGPA
## 0.50 0.50
## Study.Satisfaction Job.Satisfaction
## 0.76 0.50
## Sleep.Duration Dietary.Habits
## 0.56 0.77
## Have.you.ever.had.suicidal.thoughts Work.Study.Hours
## 0.69 0.76
## Financial.Stress Family.History.of.Mental.Illness
## 0.75 0.70
## Depression
## 0.61
cat("\n--- Bartlett’s Test ---\n")
##
## --- Bartlett’s Test ---
print(bartlett_res)
## $chisq
## [1] 39351.64
##
## $p.value
## [1] 0
##
## $df
## [1] 78
keep_vars <- names(kmo_res$MSAi)[ kmo_res$MSAi >= 0.5 ]
num_vars <- num_vars[, keep_vars]
cat("\nVariabel setelah seleksi MSA >= 0.5:\n")
##
## Variabel setelah seleksi MSA >= 0.5:
print(keep_vars)
## [1] "Age" "Academic.Pressure"
## [3] "Work.Pressure" "Study.Satisfaction"
## [5] "Job.Satisfaction" "Sleep.Duration"
## [7] "Dietary.Habits" "Have.you.ever.had.suicidal.thoughts"
## [9] "Work.Study.Hours" "Financial.Stress"
## [11] "Family.History.of.Mental.Illness" "Depression"
fa.parallel(na.omit(num_vars),
fm = "pa",
fa = "fa",
n.iter = 100,
main = "Parallel Analysis untuk FA")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Warning in cor(sampledata, use = use): the standard deviation is zero
## Parallel analysis suggests that the number of factors = 5 and the number of components = NA
nf <- 2
fa_res <- fa(na.omit(num_vars),
nfactors = nf,
fm = "pa",
rotate = "oblimin")
## Loading required namespace: GPArotation
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
cat("\n--- Hasil FA (principal axis + oblimin) ---\n")
##
## --- Hasil FA (principal axis + oblimin) ---
print(fa_res, digits = 2, cutoff = 0.30)
## Factor Analysis using method = pa
## Call: fa(r = na.omit(num_vars), nfactors = nf, rotate = "oblimin",
## fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 h2 u2 com
## Age -0.21 -0.01 0.04609 0.9539 1.0
## Academic.Pressure 0.47 -0.03 0.22001 0.7800 1.0
## Work.Pressure 0.00 0.78 0.60972 0.3903 1.0
## Study.Satisfaction -0.17 -0.03 0.02957 0.9704 1.1
## Job.Satisfaction 0.00 0.78 0.61501 0.3850 1.0
## Sleep.Duration -0.02 -0.01 0.00057 0.9994 1.1
## Dietary.Habits 0.21 0.00 0.04249 0.9575 1.0
## Have.you.ever.had.suicidal.thoughts 0.55 0.00 0.30139 0.6986 1.0
## Work.Study.Hours 0.21 -0.01 0.04311 0.9569 1.0
## Financial.Stress 0.37 0.01 0.13330 0.8667 1.0
## Family.History.of.Mental.Illness 0.05 -0.01 0.00260 0.9974 1.0
## Depression 1.00 0.00 1.00638 -0.0064 1.0
##
## PA1 PA2
## SS loadings 1.82 1.23
## Proportion Var 0.15 0.10
## Cumulative Var 0.15 0.25
## Proportion Explained 0.60 0.40
## Cumulative Proportion 0.60 1.00
##
## With factor correlations of
## PA1 PA2
## PA1 1.00 -0.01
## PA2 -0.01 1.00
##
## Mean item complexity = 1
## Test of the hypothesis that 2 factors are sufficient.
##
## df null model = 66 with the objective function = 1.4 with Chi Square = 39186.68
## df of the model are 43 and the objective function was 0.01
##
## The root mean square of the residuals (RMSR) is 0.01
## The df corrected root mean square of the residuals is 0.01
##
## The harmonic n.obs is 27898 with the empirical chi square 380.32 with prob < 1.4e-55
## The total n.obs was 27898 with Likelihood Chi Square = 248.62 with prob < 9.6e-31
##
## Tucker Lewis Index of factoring reliability = 0.992
## RMSEA index = 0.013 and the 90 % confidence intervals are 0.012 0.015
## BIC = -191.54
## Fit based upon off diagonal values = 1
fa.diagram(fa_res)
cat("\n--- Goodness‐of‐Fit FA ---\n")
##
## --- Goodness‐of‐Fit FA ---
cat("RMSEA =", round(fa_res$RMSEA[1], 3),
" (90% CI:", paste(round(fa_res$RMSEA[2:3], 3), collapse = "–"), ")\n")
## RMSEA = 0.013 (90% CI: 0.012–0.015 )
cat("TLI =", round(fa_res$TLI, 3), "\n")
## TLI = 0.992
cat("RMSR =", round(fa_res$rms, 3), "\n")
## RMSR = 0.01
load_df <- as.data.frame(unclass(fa_res$loadings))
load_df$Feature <- rownames(load_df)
selected_fa <- load_df %>%
filter(abs(PA1) >= 0.40 | abs(PA2) >= 0.40) %>%
pull(Feature)
cat("\nFitur terpilih dari FA (|loading| >= 0.4):\n")
##
## Fitur terpilih dari FA (|loading| >= 0.4):
print(selected_fa)
## [1] "Academic.Pressure" "Work.Pressure"
## [3] "Job.Satisfaction" "Have.you.ever.had.suicidal.thoughts"
## [5] "Depression"
num_pca <- num_vars %>%
mutate_all(~ ifelse(is.infinite(.), NA, .)) %>%
na.omit()
pca_res <- prcomp(num_pca, scale. = TRUE)
var_exp <- cumsum(pca_res$sdev^2 / sum(pca_res$sdev^2))
plot(var_exp, type = "b",
xlab = "Jumlah Komponen Principal",
ylab = "Varians Kumulatif Dijelaskan",
main = "Varians Kumulatif Dijelaskan oleh PCA")
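factoextra was installed at the top but never used; it can draw the equivalent scree plot directly from the prcomp object (a sketch, assuming pca_res as above):
library(factoextra)
# percentage of variance explained by each principal component, with labels
fviz_eig(pca_res, addlabels = TRUE)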
pca_load_df <- as.data.frame(pca_res$rotation[, 1:2])
pca_load_df$Feature <- rownames(pca_load_df)
selected_pca <- pca_load_df %>%
filter(abs(PC1) > 0.40 | abs(PC2) > 0.40) %>%
pull(Feature)
cat("\nFitur terpilih dari PCA (|loading| > 0.4 di PC1/PC2):\n")
##
## Fitur terpilih dari PCA (|loading| > 0.4 di PC1/PC2):
print(selected_pca)
## [1] "Academic.Pressure" "Work.Pressure"
## [3] "Job.Satisfaction" "Have.you.ever.had.suicidal.thoughts"
## [5] "Depression"
idx <- as.numeric(rownames(num_pca))
Depression_clean <- df_encoded$Depression[idx]
cors <- sapply(num_pca, function(x)
cor(x, Depression_clean, use = "pairwise.complete.obs"))
cors_sorted <- sort(cors, decreasing = TRUE)
cat("\nKorelasi fitur terhadap Depression:\n")
##
## Korelasi fitur terhadap Depression:
print(cors_sorted)
## Depression Have.you.ever.had.suicidal.thoughts
## 0.163506012 0.091111924
## Academic.Pressure Financial.Stress
## 0.080186422 0.067805055
## Dietary.Habits Work.Study.Hours
## 0.031904021 0.029979106
## Family.History.of.Mental.Illness Sleep.Duration
## 0.007323482 -0.004034219
## Job.Satisfaction Work.Pressure
## -0.011533817 -0.012325078
## Study.Satisfaction Age
## -0.035061812 -0.035495158
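The correlation of Depression with itself is reported above as 0.16 rather than 1, which signals that idx does not line up with the rows of df_encoded: a tibble carries no meaningful row names, so after na.omit() rownames(num_pca) is simply 1..n and no longer points at the original rows once the three incomplete rows are dropped. A sketch that avoids the re-indexing by correlating against the Depression column already inside num_pca (cors_aligned is an illustrative name):
cors_aligned <- sapply(num_pca[, setdiff(names(num_pca), "Depression")],
                       function(x) cor(x, num_pca$Depression, use = "pairwise.complete.obs"))
sort(cors_aligned, decreasing = TRUE)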
cor_df <- data.frame(Feature = names(cors_sorted),
Correlation = cors_sorted)
ggplot(cor_df, aes(x = reorder(Feature, Correlation), y = Correlation)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(title = "Korelasi Fitur Numerik terhadap Depression",
x = "Fitur", y = "Korelasi") +
theme_minimal() +
theme(text = element_text(size = 14))
manual_selected <- names(cors)[abs(cors) > 0.30]
cat("\nFitur terpilih dari korelasi manual (|r| > 0.3):\n")
##
## Fitur terpilih dari korelasi manual (|r| > 0.3):
print(manual_selected)
## character(0)
final_features <- unique(c(selected_fa,
selected_pca,
manual_selected))
cat("\nFitur final terpilih untuk analisis selanjutnya:\n")
##
## Fitur final terpilih untuk analisis selanjutnya:
print(final_features)
## [1] "Academic.Pressure" "Work.Pressure"
## [3] "Job.Satisfaction" "Have.you.ever.had.suicidal.thoughts"
## [5] "Depression"
if(!exists("final_features")) {
stop("Variabel final_features belum tersedia.")
}
if(!exists("df_encoded")) {
stop("Variabel df_encoded belum tersedia.")
}
data <- df_encoded
target_var <- "Depression"
if(!(target_var %in% names(data))) {
stop(paste0("Variabel target '", target_var, "' tidak ditemukan di data."))
}
features_exist <- intersect(final_features, names(data))
if(length(features_exist) < length(final_features)) {
warning("Beberapa fitur di final_features tidak ditemukan di df_encoded dan akan diabaikan:\n",
paste(setdiff(final_features, features_exist), collapse = ", "))
}
df_selected <- data[, c(features_exist, target_var), drop = FALSE]
is_cat <- sapply(df_selected[, features_exist, drop = FALSE],
function(col) is.factor(col) || is.character(col))
cat_features <- names(is_cat)[is_cat]
if(length(cat_features) > 0) {
for(feat in cat_features) {
df_selected[[feat]] <- as.integer(factor(df_selected[[feat]])) - 1
}
message("Fitur kategorikal di‐label encoding: ", paste(cat_features, collapse = ", "))
} else {
message("Tidak ada fitur kategorikal.")
}
## Tidak ada fitur kategorikal.
out_file <- "D:/Mahasiswa/Semester 4/Analisis Multivariat/project/selected_features_data.csv"
write.csv(df_selected, out_file, row.names = FALSE)
message("Data dengan fitur terpilih disimpan ke: ", out_file)
## Data dengan fitur terpilih disimpan ke: D:/Mahasiswa/Semester 4/Analisis Multivariat/project/selected_features_data.csv
Modelling
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(dplyr)
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(corrplot)
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
df <- read.csv("D:/Mahasiswa/Semester 4/Analisis Multivariat/project/selected_features_data.csv")
target_var <- "Depression.1"
df[[target_var]] <- as.factor(df[[target_var]])
df <- na.omit(df)
feature_cols <- setdiff(colnames(df), target_var)
check_constant_per_group <- function(df, feature_cols, target_var) {
constant_features <- c()
groups <- levels(df[[target_var]])
for (f in feature_cols) {
is_const <- TRUE
for (g in groups) {
vals <- df[df[[target_var]] == g, f]
if (length(unique(vals)) > 1) {
is_const <- FALSE
break
}
}
if (is_const) constant_features <- c(constant_features, f)
}
return(constant_features)
}
total_constant <- sapply(df[, feature_cols], function(x) length(unique(x)) == 1)
feature_cols <- feature_cols[!total_constant]
group_constant <- check_constant_per_group(df, feature_cols, target_var)
feature_cols <- setdiff(feature_cols, group_constant)
df_selected <- df[, c(feature_cols, target_var)]
set.seed(123)
train_index <- createDataPartition(df_selected[[target_var]], p = 0.8, list = FALSE)
train_data <- df_selected[train_index, ]
test_data <- df_selected[-train_index, ]
model_vif <- lm(as.formula(paste(target_var, "~ .")), data = train_data)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
vif_vals <- vif(model_vif)
## Warning in Ops.factor(r, 2): '^' not meaningful for factors
## Warning in cov2cor(v): diag(V) had non-positive or NA entries; the non-finite
## result may be dubious
print(vif_vals)
## Academic.Pressure Work.Pressure
## NaN NaN
## Job.Satisfaction Have.you.ever.had.suicidal.thoughts
## NaN NaN
high_vif <- names(vif_vals[vif_vals > 10])
feature_cols <- setdiff(feature_cols, high_vif)
df_selected <- df_selected[, c(feature_cols, target_var)]
train_data <- df_selected[train_index, ]
test_data <- df_selected[-train_index, ]
cor_matrix <- cor(train_data[, feature_cols])
corrplot(cor_matrix, method = "color", tl.cex = 0.6)
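The VIF values above are all NaN because lm() was fitted with a factor response (see the warnings), so the vif_vals > 10 filter effectively removes nothing. A sketch that computes VIF on a numeric 0/1 copy of the response instead (train_num is an illustrative name):
# refit the auxiliary linear model with the response treated as numeric 0/1
train_num <- train_data
train_num[[target_var]] <- as.numeric(as.character(train_num[[target_var]]))
vif(lm(as.formula(paste(target_var, "~ .")), data = train_num))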
cat("\n===== [1] Pembentukan Model (Estimasi Parameter) - Regresi Logistik =====\n")
##
## ===== [1] Pembentukan Model (Estimasi Parameter) - Regresi Logistik =====
logit_model <- glm(as.formula(paste(target_var, "~ .")), data = train_data, family = binomial())
summary(logit_model)
##
## Call:
## glm(formula = as.formula(paste(target_var, "~ .")), family = binomial(),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.72121 0.05743 -64.791 <2e-16 ***
## Academic.Pressure 0.82378 0.01463 56.318 <2e-16 ***
## Work.Pressure -5.27350 13.09207 -0.403 0.687
## Job.Satisfaction 13.04652 8.23370 1.585 0.113
## Have.you.ever.had.suicidal.thoughts 2.54364 0.03821 66.578 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 30288 on 22320 degrees of freedom
## Residual deviance: 19410 on 22316 degrees of freedom
## AIC: 19420
##
## Number of Fisher Scoring iterations: 5
cat("\n===== [2A] Uji Signifikansi Parsial (Wald Test) =====\n")
##
## ===== [2A] Uji Signifikansi Parsial (Wald Test) =====
print(coef(summary(logit_model))[, 4])
## (Intercept) Academic.Pressure
## 0.0000000 0.0000000
## Work.Pressure Job.Satisfaction
## 0.6870944 0.1130737
## Have.you.ever.had.suicidal.thoughts
## 0.0000000
cat("\n===== [2B] Uji Signifikansi Serentak (Likelihood Ratio Test) =====\n")
##
## ===== [2B] Uji Signifikansi Serentak (Likelihood Ratio Test) =====
null_model <- glm(as.formula(paste(target_var, "~ 1")), data = train_data, family = binomial())
lrt_result <- anova(null_model, logit_model, test = "Chisq")
print(lrt_result)
## Analysis of Deviance Table
##
## Model 1: Depression.1 ~ 1
## Model 2: Depression.1 ~ Academic.Pressure + Work.Pressure + Job.Satisfaction +
## Have.you.ever.had.suicidal.thoughts
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 22320 30288
## 2 22316 19410 4 10877 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n===== [3] Akurasi Model - Regresi Logistik =====\n")
##
## ===== [3] Akurasi Model - Regresi Logistik =====
logit_prob <- predict(logit_model, newdata = test_data, type = "response")
logit_pred <- ifelse(logit_prob > 0.5, levels(df_selected[[target_var]])[2], levels(df_selected[[target_var]])[1])
logit_pred <- factor(logit_pred, levels = levels(df_selected[[target_var]]))
logit_cm <- confusionMatrix(logit_pred, test_data[[target_var]])
print(logit_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1685 488
## 1 628 2779
##
## Accuracy : 0.8
## 95% CI : (0.7893, 0.8104)
## No Information Rate : 0.5855
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5843
##
## Mcnemar's Test P-Value : 3.171e-05
##
## Sensitivity : 0.7285
## Specificity : 0.8506
## Pos Pred Value : 0.7754
## Neg Pred Value : 0.8157
## Prevalence : 0.4145
## Detection Rate : 0.3020
## Detection Prevalence : 0.3894
## Balanced Accuracy : 0.7896
##
## 'Positive' Class : 0
##
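caret treats the first factor level, here 0, as the positive class by default, so the Sensitivity reported above refers to detecting the non-depressed class. If the depressed class is the one of interest, the positive level can be set explicitly (a sketch reusing logit_pred):
confusionMatrix(logit_pred, test_data[[target_var]], positive = "1")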
cat("\n===== [4] Interpretasi Odds Ratio =====\n")
##
## ===== [4] Interpretasi Odds Ratio =====
or <- exp(cbind(OddsRatio = coef(logit_model), confint(logit_model)))
## Waiting for profiling to be done...
print(round(or, 3))
## OddsRatio 2.5 % 97.5 %
## (Intercept) 0.024 0.022 2.700000e-02
## Academic.Pressure 2.279 2.215 2.346000e+00
## Work.Pressure 0.005 0.000 6.828217e+08
## Job.Satisfaction 463481.324 0.039 2.331603e+13
## Have.you.ever.had.suicidal.thoughts 12.726 11.811 1.372000e+01
for (f in names(coef(logit_model))[-1]) {
val <- round(or[f, 1], 2)
cat(paste0("Jika ", f, " meningkat 1 unit, maka odds terhadap 1 meningkat sebesar ", val, " kali lipat.\n"))
}
## Jika Academic.Pressure meningkat 1 unit, maka odds terhadap 1 meningkat sebesar 2.28 kali lipat.
## Jika Work.Pressure meningkat 1 unit, maka odds terhadap 1 meningkat sebesar 0.01 kali lipat.
## Jika Job.Satisfaction meningkat 1 unit, maka odds terhadap 1 meningkat sebesar 463481.32 kali lipat.
## Jika Have.you.ever.had.suicidal.thoughts meningkat 1 unit, maka odds terhadap 1 meningkat sebesar 12.73 kali lipat.
cat("\n===== AUC & ROC Curve - Regresi Logistik =====\n")
##
## ===== AUC & ROC Curve - Regresi Logistik =====
roc_obj <- roc(test_data[[target_var]], logit_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
cat("AUC:", auc(roc_obj), "\n")
## AUC: 0.8639032
plot(roc_obj, col = "blue", main = "ROC Curve Logistic Regression")
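Instead of the fixed 0.5 cutoff used above, pROC can report the probability threshold that maximizes Youden's J on this ROC curve (a sketch reusing roc_obj):
coords(roc_obj, x = "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))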
cat("\n===== LDA - Linear Discriminant Analysis =====\n")
##
## ===== LDA - Linear Discriminant Analysis =====
cat("\n===== [1A] Uji Normalitas Multivariat (Manual - Mahalanobis) =====\n")
##
## ===== [1A] Uji Normalitas Multivariat (Manual - Mahalanobis) =====
mahal_dist <- mahalanobis(
train_data[, feature_cols],
colMeans(train_data[, feature_cols]),
cov(train_data[, feature_cols])
)
alpha <- 0.001
df_mvn <- length(feature_cols)
threshold <- qchisq(1 - alpha, df = df_mvn)
outlier_index <- which(mahal_dist > threshold)
cat("Jumlah outlier multivariat:", length(outlier_index), "\n")
## Jumlah outlier multivariat: 8
cat("Persentase outlier:", round(length(outlier_index) / nrow(train_data) * 100, 2), "%\n")
## Persentase outlier: 0.04 %
if (!require("ggplot2")) install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
ggplot(data.frame(Index = 1:length(mahal_dist), MD = mahal_dist), aes(x = Index, y = MD)) +
geom_point(color = "steelblue") +
geom_hline(yintercept = threshold, color = "red", linetype = "dashed") +
labs(title = "Mahalanobis Distance untuk Deteksi Outlier",
subtitle = paste("Threshold =", round(threshold, 2)),
y = "Mahalanobis Distance") +
theme_minimal()
cat("\n===== [1B] Uji Homogenitas Kovarian (Box's M Test) =====\n")
##
## ===== [1B] Uji Homogenitas Kovarian (Box's M Test) =====
boxM_res <- boxM(train_data[, feature_cols], train_data[[target_var]])
print(boxM_res)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: train_data[, feature_cols]
## Chi-Sq (approx.) = 3869.2, df = 10, p-value < 2.2e-16
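Box's M strongly rejects homogeneity of the group covariance matrices, which is one of the LDA assumptions. Quadratic discriminant analysis relaxes this by estimating a separate covariance matrix per group; a sketch of fitting it on the same split as a robustness check (qda_model and qda_class are illustrative names):
qda_model <- qda(as.formula(paste(target_var, "~ .")), data = train_data)
qda_class <- factor(predict(qda_model, test_data)$class, levels = levels(test_data[[target_var]]))
confusionMatrix(qda_class, test_data[[target_var]])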
cat("\n===== [2] Estimasi Model & Koefisien =====\n")
##
## ===== [2] Estimasi Model & Koefisien =====
lda_model <- lda(as.formula(paste(target_var, "~ .")), data = train_data)
print(lda_model)
## Call:
## lda(as.formula(paste(target_var, "~ .")), data = train_data)
##
## Prior probabilities of groups:
## 0 1
## 0.4144976 0.5855024
##
## Group means:
## Academic.Pressure Work.Pressure Job.Satisfaction
## 0 2.366083 2.862218e-05 5.787469e-05
## 1 3.702196 1.013132e-05 4.097151e-05
## Have.you.ever.had.suicidal.thoughts
## 0 0.3182015
## 1 0.8543882
##
## Coefficients of linear discriminants:
## LD1
## Academic.Pressure 0.5272663
## Work.Pressure -3.6685356
## Job.Satisfaction 7.7929404
## Have.you.ever.had.suicidal.thoughts 1.9107315
cat("\n===== [3] Akurasi Model - Diskriminan =====\n")
##
## ===== [3] Akurasi Model - Diskriminan =====
lda_pred <- predict(lda_model, test_data)
lda_class <- factor(lda_pred$class, levels = levels(df_selected[[target_var]]))
lda_cm <- confusionMatrix(lda_class, test_data[[target_var]])
print(lda_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1685 488
## 1 628 2779
##
## Accuracy : 0.8
## 95% CI : (0.7893, 0.8104)
## No Information Rate : 0.5855
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5843
##
## Mcnemar's Test P-Value : 3.171e-05
##
## Sensitivity : 0.7285
## Specificity : 0.8506
## Pos Pred Value : 0.7754
## Neg Pred Value : 0.8157
## Prevalence : 0.4145
## Detection Rate : 0.3020
## Detection Prevalence : 0.3894
## Balanced Accuracy : 0.7896
##
## 'Positive' Class : 0
##
cat("\n===== [4] Interpretasi Model - Koefisien Fungsi Diskriminan =====\n")
##
## ===== [4] Interpretasi Model - Koefisien Fungsi Diskriminan =====
print(lda_model$scaling)
## LD1
## Academic.Pressure 0.5272663
## Work.Pressure -3.6685356
## Job.Satisfaction 7.7929404
## Have.you.ever.had.suicidal.thoughts 1.9107315
cat("\nVariabel dengan kontribusi terbesar:\n")
##
## Variabel dengan kontribusi terbesar:
loading_order <- order(abs(lda_model$scaling), decreasing = TRUE)
print(lda_model$scaling[loading_order, , drop = FALSE])
## LD1
## Job.Satisfaction 7.7929404
## Work.Pressure -3.6685356
## Have.you.ever.had.suicidal.thoughts 1.9107315
## Academic.Pressure 0.5272663
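Both confusion matrices are already in memory, so the two classifiers can be compared side by side (a sketch reusing logit_cm and lda_cm from above; model_comparison is an illustrative name):
model_comparison <- data.frame(
  Model = c("Logistic Regression", "LDA"),
  Accuracy = c(logit_cm$overall["Accuracy"], lda_cm$overall["Accuracy"]),
  Kappa = c(logit_cm$overall["Kappa"], lda_cm$overall["Kappa"])
)
print(model_comparison)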