# ---------------------------------------------------------
# 1. DATA LOADING & PRE-PROCESSING
# ---------------------------------------------------------
library(psych)
## Warning: package 'psych' was built under R version 4.5.3
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
# Load the dataset.
# The CSV location can be overridden through the STRESS_DATA_PATH environment
# variable so the script is not tied to a single machine. When the variable is
# unset or empty, the original hard-coded location is used.
data_path <- Sys.getenv("STRESS_DATA_PATH")
if (!nzchar(data_path)) {
  data_path <- "C:/Users/ASUS/OneDrive/New folder/university_student_stress_dataset.csv"
}
# stringsAsFactors = TRUE turns character columns into factors on read.
data <- read.csv(data_path, stringsAsFactors = TRUE)

# Continuous variables fed to PCA and FA.
# These items are key to measuring student pressure and lifestyle.
num_cols <- c(
  "Study_Hours", "Class_Attendance", "Exam_Frequency",
  "Assignment_Load", "Sleep_Hours", "Social_Media_Use",
  "Screen_Time", "Peer_Pressure", "Family_Support", "Anxiety_Level"
)

# Fail early and name the offending columns, rather than hitting a less
# informative error in the subsetting or scaling steps below.
missing_cols <- setdiff(num_cols, names(data))
if (length(missing_cols) > 0) {
  stop(
    "Columns missing from the dataset: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
data_num <- data[, num_cols]

# A column containing non-numeric entries is read as a factor (see
# stringsAsFactors above) and cannot be standardized.
non_numeric <- num_cols[!vapply(data_num, is.numeric, logical(1))]
if (length(non_numeric) > 0) {
  stop(
    "Columns expected to be numeric are not: ",
    paste(non_numeric, collapse = ", "),
    call. = FALSE
  )
}

# Standardization: each column is centered and divided by its standard
# deviation because the variables are on different scales (per the original
# author's note, e.g. attendance is 0-100 while sleep is 0-24).
scaled_data <- scale(data_num)
# ---------------------------------------------------------
# 2. ASSUMPTION CHECKING (KMO & BARTLETT'S)
# ---------------------------------------------------------
# Bartlett's test of sphericity. The null hypothesis is that the correlation
# matrix is an identity matrix (the variables are uncorrelated). Factoring is
# only justified when that hypothesis is rejected, i.e. when p < 0.05.
bartlett_test <- cortest.bartlett(cor(data_num), n = nrow(data_num))
print(bartlett_test)
# Surface a failed check instead of silently continuing to PCA/EFA.
# !isTRUE(...) also catches an NA p-value (e.g. from missing data in cor()).
if (!isTRUE(bartlett_test$p.value < 0.05)) {
  warning(
    "Bartlett's test is not significant (p = ",
    signif(bartlett_test$p.value, 3),
    "); the variables appear uncorrelated, so PCA/EFA results ",
    "may not be meaningful.",
    call. = FALSE
  )
}
## $chisq
## [1] 42.46741
##
## $p.value
## [1] 0.5798296
##
## $df
## [1] 45
# KMO measure of sampling adequacy (MSA). Kaiser's labels: 0.9+ "marvelous",
# 0.8s "meritorious", 0.7s "middling", 0.6s "mediocre", 0.5s "miserable",
# and below 0.5 "unacceptable". So 0.9 is the top label, not the minimum
# requirement; values below 0.5 indicate the data are unsuitable for factoring.
kmo_test <- KMO(cor(data_num))
print(kmo_test)
# Surface an inadequate overall MSA instead of silently continuing.
# !isTRUE(...) also catches an NA result.
if (!isTRUE(kmo_test$MSA >= 0.5)) {
  warning(
    "Overall KMO MSA is ", round(kmo_test$MSA, 2),
    " (< 0.5, 'unacceptable'); the data may not be suitable for ",
    "factor analysis.",
    call. = FALSE
  )
}
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(data_num))
## Overall MSA = 0.49
## MSA for each item =
## Study_Hours Class_Attendance Exam_Frequency Assignment_Load
## 0.50 0.49 0.50 0.50
## Sleep_Hours Social_Media_Use Screen_Time Peer_Pressure
## 0.48 0.50 0.49 0.49
## Family_Support Anxiety_Level
## 0.49 0.49
# ---------------------------------------------------------
# 3. TECHNIQUE 1: PRINCIPAL COMPONENT ANALYSIS (PCA)
# ---------------------------------------------------------
# Fit the PCA on the standardized matrix.
# NOTE(review): scaled_data is already the output of scale(), so the
# center/scale. arguments re-standardize standardized columns. This is
# presumably harmless; it is kept so the fitted object is unchanged.
pca_res <- prcomp(x = scaled_data, scale. = TRUE, center = TRUE)

# Scree plot with labelled bars, used to decide how many components to keep.
fviz_eig(
  pca_res,
  main = "Scree Plot of Variance",
  addlabels = TRUE
)

# Biplot of observations and variables (e.g. how Anxiety and Screen Time
# relate). Variables are coloured by "contrib" along a three-colour gradient,
# and repel = TRUE is set for the labels.
fviz_pca_biplot(
  pca_res,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)

# ---------------------------------------------------------
# 4. TECHNIQUE 2: EXPLORATORY FACTOR ANALYSIS (EFA)
# ---------------------------------------------------------
# Parallel analysis (maximum-likelihood factoring, factors only) to suggest
# how many factors the data support.
fa_parallel <- fa.parallel(data_num, fm = "ml", fa = "fa")

## Parallel analysis suggests that the number of factors = 0 and the number of components = NA
# Number of factors to extract. This is an analyst's choice; it is compared
# with the parallel-analysis suggestion below so that a disagreement is
# reported rather than silently ignored.
n_factors <- 3
if (!isTRUE(fa_parallel$nfact == n_factors)) {
  warning(
    "Parallel analysis suggests ", fa_parallel$nfact,
    " factor(s) but ", n_factors,
    " are being extracted; interpret the EFA solution with caution.",
    call. = FALSE
  )
}
# Running Factor Analysis with Varimax rotation for interpretability
fa_fit <- fa(data_num, nfactors = n_factors, rotate = "varimax", fm = "ml")
# Only loadings with absolute value above the 0.3 cutoff are printed.
print(fa_fit$loadings, cutoff = 0.3)
##
## Loadings:
## ML1 ML2 ML3
## Study_Hours
## Class_Attendance 0.942
## Exam_Frequency
## Assignment_Load
## Sleep_Hours
## Social_Media_Use
## Screen_Time 0.978
## Peer_Pressure
## Family_Support
## Anxiety_Level 0.992
##
## ML1 ML2 ML3
## SS loadings 1.011 0.993 0.929
## Proportion Var 0.101 0.099 0.093
## Cumulative Var 0.101 0.200 0.293
# Visualize latent factors
fa.diagram(fa_fit)

# ---------------------------------------------------------
# 5. TECHNIQUE 3: DISCRIMINANT ANALYSIS (LDA)
# ---------------------------------------------------------
# Fix the RNG state so the train/test split below is repeatable.
set.seed(2026)

# Row indices for the training partition: 70% of the rows, chosen with respect
# to Stress_Level. list = FALSE returns a matrix of indices rather than a list.
train_idx <- createDataPartition(
  y = data$Stress_Level,
  p = 0.7,
  list = FALSE
)
# Everything not selected for training becomes the hold-out set.
test_set <- data[-train_idx, ]
train_set <- data[train_idx, ]

# Linear discriminant model for Stress_Level (Low, Medium, High), fitted on
# the training rows only, using six of the numeric predictors.
lda_model <- lda(
  Stress_Level ~ Study_Hours + Assignment_Load + Sleep_Hours +
    Screen_Time + Anxiety_Level + Peer_Pressure,
  data = train_set
)

# Evaluate on the hold-out rows: predicted classes versus the true labels.
lda_pred <- predict(lda_model, newdata = test_set)
conf_matrix <- confusionMatrix(
  data = lda_pred$class,
  reference = test_set$Stress_Level
)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low Medium
## High 28 0 12
## Low 0 273 90
## Medium 69 99 327
##
## Overall Statistics
##
## Accuracy : 0.6993
## 95% CI : (0.6682, 0.7292)
## No Information Rate : 0.4777
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4673
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: High Class: Low Class: Medium
## Sensitivity 0.28866 0.7339 0.7622
## Specificity 0.98502 0.8289 0.6418
## Pos Pred Value 0.70000 0.7521 0.6606
## Neg Pred Value 0.91958 0.8150 0.7469
## Prevalence 0.10802 0.4143 0.4777
## Detection Rate 0.03118 0.3040 0.3641
## Detection Prevalence 0.04454 0.4042 0.5512
## Balanced Accuracy 0.63684 0.7814 0.7020