library(psych) # For KMO, Bartlett's, and Factor Analysis
## Warning: package 'psych' was built under R version 4.5.3
library(FactoMineR) # For PCA
## Warning: package 'FactoMineR' was built under R version 4.5.3
library(factoextra) # For elegant PCA visualization
## Warning: package 'factoextra' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(MASS) # For Discriminant Analysis (LDA)
## Warning: package 'MASS' was built under R version 4.5.3
library(caret) # For data splitting and confusion matrix
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(ggplot2) # For general plotting
library(reshape2) # For data reshaping in EDA
## Warning: package 'reshape2' was built under R version 4.5.3
# Load the dataset
# Prefer a copy of the CSV in the current working directory so the script can
# run on any machine; fall back to the original absolute path if it is absent.
data_file <- "university_student_stress_dataset.csv"
if (!file.exists(data_file)) {
  data_file <- "C:/Users/ASUS/OneDrive/New folder/university_student_stress_dataset.csv"
}
# Fail early with an actionable message instead of read.csv()'s generic error.
if (!file.exists(data_file)) {
  stop(
    "Dataset not found: place 'university_student_stress_dataset.csv' in the ",
    "working directory (", getwd(), ").",
    call. = FALSE
  )
}
# stringsAsFactors = TRUE makes the text columns load as factors.
data <- read.csv(data_file, stringsAsFactors = TRUE)
# Missing-value audit
# Count the NA entries in every column before any modelling is attempted.
print("Missing values per column:")
## [1] "Missing values per column:"
data |>
  is.na() |>
  colSums() |>
  print() # Every column reports 0, so no imputation or row removal is needed.
## Age Gender Study_Hours Class_Attendance
## 0 0 0 0
## Tuition Exam_Frequency Assignment_Load Sleep_Hours
## 0 0 0 0
## Physical_Exercise Social_Media_Use Screen_Time Family_Income_Level
## 0 0 0 0
## Peer_Pressure Family_Support Anxiety_Level University_Type
## 0 0 0 0
## Stress_Score Stress_Level
## 0 0
# Exploratory Data Analysis (EDA)
# Summary statistics for every column: quartiles and mean for numeric columns,
# level counts for factors. print() is explicit, matching the other output
# calls in this script, so the table is also shown when run via source().
# Points visible in the output below:
#   - Stress_Score ranges from -9 to 33.
#     NOTE(review): confirm that negative scores are valid for this scale.
#   - Stress_Level is unbalanced: High = 326, Low = 1243, Medium = 1431.
print(summary(data))
## Age Gender Study_Hours Class_Attendance Tuition
## Min. :19.00 Female:1538 Min. :0.000 Min. :40.00 No :1545
## 1st Qu.:20.00 Male :1462 1st Qu.:2.000 1st Qu.:54.00 Yes:1455
## Median :21.00 Median :4.000 Median :69.00
## Mean :21.52 Mean :4.489 Mean :68.87
## 3rd Qu.:23.00 3rd Qu.:7.000 3rd Qu.:84.00
## Max. :24.00 Max. :9.000 Max. :99.00
## Exam_Frequency Assignment_Load Sleep_Hours Physical_Exercise
## Min. :1.000 Min. :1.000 Min. :4.000 No :1551
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:5.000 Yes:1449
## Median :5.000 Median :5.000 Median :7.000
## Mean :5.026 Mean :4.996 Mean :6.519
## 3rd Qu.:7.000 3rd Qu.:7.000 3rd Qu.:8.000
## Max. :9.000 Max. :9.000 Max. :9.000
## Social_Media_Use Screen_Time Family_Income_Level Peer_Pressure
## Min. :0.000 Min. : 1.000 High : 608 Min. :1.000
## 1st Qu.:2.000 1st Qu.: 3.000 Low :1059 1st Qu.:3.000
## Median :3.000 Median : 6.000 Medium:1333 Median :5.000
## Mean :3.487 Mean : 5.918 Mean :4.984
## 3rd Qu.:5.000 3rd Qu.: 9.000 3rd Qu.:7.000
## Max. :7.000 Max. :11.000 Max. :9.000
## Family_Support Anxiety_Level University_Type Stress_Score
## Min. :1.000 Min. :1.000 National University:1000 Min. :-9.00
## 1st Qu.:3.000 1st Qu.:3.000 Private University : 997 1st Qu.: 7.00
## Median :5.000 Median :5.000 Public University :1003 Median :12.00
## Mean :5.027 Mean :5.001 Mean :11.92
## 3rd Qu.:7.000 3rd Qu.:7.000 3rd Qu.:17.00
## Max. :9.000 Max. :9.000 Max. :33.00
## Stress_Level
## High : 326
## Low :1243
## Medium:1431
##
##
##
# Outlier Examination
# The ten behavioural / psychological measures below feed the outlier check
# and every multivariate technique later in the script.
# NOTE(review): Age and Stress_Score are numeric too but are not selected;
# confirm that leaving them out is intentional.
num_cols <- c(
  "Study_Hours", "Class_Attendance", "Exam_Frequency",
  "Assignment_Load", "Sleep_Hours", "Social_Media_Use",
  "Screen_Time", "Peer_Pressure", "Family_Support", "Anxiety_Level"
)
num_data <- data[num_cols]
# One box per variable on a common z-score axis, so points beyond the
# whiskers can be compared across variables despite their different units.
boxplot(
  scale(num_data),
  main = "Standardized Boxplots for Outlier Detection",
  col = "lightblue",
  las = 2 # axis labels perpendicular to the axis so long names stay readable
)

# Standardization
# The selected variables are measured in different units (e.g. hours versus
# frequency counts), so each column is converted to a z-score (mean 0, sd 1).
# Without this, variables with larger numeric ranges would dominate the variance.
scaled_data <- scale(x = num_data)
# PRINCIPAL COMPONENT ANALYSIS (PCA)
# Purpose: reduce dimensionality and identify which of the selected variables
# account for most of the variance.
# NOTE(review): scaled_data is already standardized, so center/scale. below
# re-apply a transformation that leaves the values essentially unchanged.
pca_res <- prcomp(
  x = scaled_data,
  scale. = TRUE,
  center = TRUE
)
# Component retention aid: scree plot of the variance explained per component.
# (This guides how many components to keep; it is not a formal assumption test.)
# fviz_* return ggplot objects, which are drawn automatically only at an
# interactive / knitr top level. print() is explicit so the plots also
# appear when the script is run via source().
print(
  fviz_eig(pca_res, addlabels = TRUE, ylim = c(0, 50), main = "Scree Plot")
)

# Variables factor map: arrows and labels only, coloured by each variable's
# contribution to the displayed components, on a minimal theme without grid.
print(
  fviz_pca_var(
    pca_res,
    col.var = "contrib",
    gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
    repel = TRUE, # nudge labels apart so they do not overlap
    geom = c("arrow", "text")
  ) +
    theme_minimal() +
    theme(
      panel.grid = element_blank(),
      axis.line = element_line(colour = "grey70")
    ) +
    labs(title = "PCA: Variables Factor Map")
)

# FACTOR ANALYSIS (FA)
# Justification: FA identifies latent unobserved constructs (e.g., "Academic Load")
# that cannot be measured directly by single variables.
# Assumption checks: Bartlett's test of sphericity and the KMO measure.
# Both results are stored so that a failed check is flagged explicitly below
# instead of only being printed.
bartlett_res <- cortest.bartlett(cor(num_data), n = nrow(num_data))
print(bartlett_res)
## $chisq
## [1] 42.46741
##
## $p.value
## [1] 0.5798296
##
## $df
## [1] 45
kmo_res <- KMO(cor(num_data))
print(kmo_res)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(num_data))
## Overall MSA = 0.49
## MSA for each item =
## Study_Hours Class_Attendance Exam_Frequency Assignment_Load
## 0.50 0.49 0.50 0.50
## Sleep_Hours Social_Media_Use Screen_Time Peer_Pressure
## 0.48 0.50 0.49 0.49
## Family_Support Anxiety_Level
## 0.49 0.49
# Conventional thresholds: Bartlett's test should be significant (p < 0.05)
# and the overall KMO should be at least 0.5. In the output above neither
# holds (p = 0.58, MSA = 0.49), i.e. the variables are essentially
# uncorrelated, so the factor solution that follows needs great caution.
if (bartlett_res$p.value >= 0.05 || kmo_res$MSA < 0.5) {
  warning(
    "Factor analysis assumptions not met (Bartlett p = ",
    signif(bartlett_res$p.value, 3), ", overall KMO = ",
    round(kmo_res$MSA, 2), "); interpret the factor solution with caution.",
    call. = FALSE
  )
}
# Parallel analysis: choose how many factors to retain, using
# maximum-likelihood factoring and evaluating factors only (fa = "fa").
# The run recorded below suggested 0 factors for these variables.
fa.parallel(
  x = num_data,
  fa = "fa",
  fm = "ml",
  main = "Parallel Analysis Scree Plot"
)

## Parallel analysis suggests that the number of factors = 0 and the number of components = NA
# Fit a three-factor maximum-likelihood model; the varimax (orthogonal)
# rotation is used to make the loading pattern easier to read.
# NOTE(review): nfactors = 3 is hard-coded although the parallel analysis
# above suggested 0 factors; confirm the choice or justify it in the report.
fa_fit <- fa(
  r = num_data,
  nfactors = 3,
  fm = "ml",
  rotate = "varimax"
)
# Show only loadings whose absolute value is at least 0.3. In the output below
# each factor is carried by a single variable and the three factors together
# explain 29.3% of the variance.
print(fa_fit[["loadings"]], cutoff = 0.3)
##
## Loadings:
## ML1 ML2 ML3
## Study_Hours
## Class_Attendance 0.942
## Exam_Frequency
## Assignment_Load
## Sleep_Hours
## Social_Media_Use
## Screen_Time 0.978
## Peer_Pressure
## Family_Support
## Anxiety_Level 0.992
##
## ML1 ML2 ML3
## SS loadings 1.011 0.993 0.929
## Proportion Var 0.101 0.099 0.093
## Cumulative Var 0.101 0.200 0.293
# Path diagram linking each variable to the factor it loads on
fa.diagram(
  fa_fit,
  main = "Factor Analysis Structure Diagram"
)

# LINEAR DISCRIMINANT ANALYSIS (LDA)
# Purpose: predict the categorical 'Stress_Level' (Low, Medium, High) from
# student behaviours and metrics.
# Fixed seed so the random train/test partition is reproducible.
set.seed(2026)
# 70% training / 30% testing split on Stress_Level
train_idx <- createDataPartition(
  y = data$Stress_Level,
  p = 0.7,
  list = FALSE # return row indices as a matrix rather than a list
)
train_set <- data[train_idx, ]
test_set <- data[-train_idx, ]
# Fit the LDA classifier on six of the numeric predictors
lda_model <- lda(
  Stress_Level ~ Study_Hours + Assignment_Load + Sleep_Hours +
    Screen_Time + Anxiety_Level + Peer_Pressure,
  data = train_set
)
# Priors, group means and discriminant coefficients. In the output below LD1
# carries 99.91% of the between-group separation.
print(lda_model)
## Call:
## lda(Stress_Level ~ Study_Hours + Assignment_Load + Sleep_Hours +
## Screen_Time + Anxiety_Level + Peer_Pressure, data = train_set)
##
## Prior probabilities of groups:
## High Low Medium
## 0.1089439 0.4143673 0.4766889
##
## Group means:
## Study_Hours Assignment_Load Sleep_Hours Screen_Time Anxiety_Level
## High 4.397380 6.620087 5.947598 8.698690 6.388646
## Low 4.574053 4.021814 6.934558 4.383467 4.179104
## Medium 4.482036 5.513972 6.350299 6.601796 5.449102
## Peer_Pressure
## High 4.860262
## Low 5.043628
## Medium 4.935130
##
## Coefficients of linear discriminants:
## LD1 LD2
## Study_Hours -0.008654337 0.005799269
## Assignment_Load -0.262087843 0.161292203
## Sleep_Hours 0.253490343 -0.216816454
## Screen_Time -0.285419784 -0.232969623
## Anxiety_Level -0.237273388 0.145206399
## Peer_Pressure 0.015753498 -0.030925385
##
## Proportion of trace:
## LD1 LD2
## 0.9991 0.0009
# Hold-out evaluation: classify the test rows and cross-tabulate the predicted
# class against the observed Stress_Level.
lda_pred <- predict(lda_model, newdata = test_set)
conf_matrix <- confusionMatrix(
  data = lda_pred$class,
  reference = test_set$Stress_Level
)
# In the output below overall accuracy is about 0.70, but sensitivity for the
# High class is only 0.29: 69 of the 97 High cases are predicted as Medium.
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low Medium
## High 28 0 12
## Low 0 273 90
## Medium 69 99 327
##
## Overall Statistics
##
## Accuracy : 0.6993
## 95% CI : (0.6682, 0.7292)
## No Information Rate : 0.4777
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4673
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: High Class: Low Class: Medium
## Sensitivity 0.28866 0.7339 0.7622
## Specificity 0.98502 0.8289 0.6418
## Pos Pred Value 0.70000 0.7521 0.6606
## Neg Pred Value 0.91958 0.8150 0.7469
## Prevalence 0.10802 0.4143 0.4777
## Detection Rate 0.03118 0.3040 0.3641
## Detection Prevalence 0.04454 0.4042 0.5512
## Balanced Accuracy 0.63684 0.7814 0.7020
# Plot the training observations in discriminant space, coloured by the
# integer code of their Stress_Level (factor levels: High, Low, Medium).
plot(
  x = lda_model,
  main = "LDA: Stress Level Group Separation",
  col = as.integer(train_set$Stress_Level)
)
