library(psych)      # For KMO, Bartlett's, and Factor Analysis
## Warning: package 'psych' was built under R version 4.5.3
library(FactoMineR) # For PCA
## Warning: package 'FactoMineR' was built under R version 4.5.3
library(factoextra) # For elegant PCA visualization
## Warning: package 'factoextra' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(MASS)       # For Discriminant Analysis (LDA)
## Warning: package 'MASS' was built under R version 4.5.3
library(caret)      # For data splitting and confusion matrix
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(ggplot2)    # For general plotting
library(reshape2)   # For data reshaping in EDA
## Warning: package 'reshape2' was built under R version 4.5.3
# Load the dataset
# The CSV is looked up in the current working directory first so the script
# runs on any machine. The original author's absolute location is kept only
# as a fallback so that existing setups keep working unchanged.
data_file <- "university_student_stress_dataset.csv"
if (!file.exists(data_file)) {
  data_file <- file.path("C:/Users/ASUS/OneDrive/New folder", data_file)
}
if (!file.exists(data_file)) {
  stop(
    "Cannot find 'university_student_stress_dataset.csv'. ",
    "Place it in the working directory: ", getwd(),
    call. = FALSE
  )
}
# stringsAsFactors = TRUE reads the text columns in as factors
data <- read.csv(data_file, stringsAsFactors = TRUE)


# Missing-value audit: count the NA entries in every column of `data`
print("Missing values per column:")
## [1] "Missing values per column:"
# The recorded output below shows 0 missing values in every column.
is.na(data) |>
  colSums() |>
  print()
##                 Age              Gender         Study_Hours    Class_Attendance 
##                   0                   0                   0                   0 
##             Tuition      Exam_Frequency     Assignment_Load         Sleep_Hours 
##                   0                   0                   0                   0 
##   Physical_Exercise    Social_Media_Use         Screen_Time Family_Income_Level 
##                   0                   0                   0                   0 
##       Peer_Pressure      Family_Support       Anxiety_Level     University_Type 
##                   0                   0                   0                   0 
##        Stress_Score        Stress_Level 
##                   0                   0
# Exploratory Data Analysis (EDA)
# Per-column overview: quartiles and mean for numeric columns,
# level counts for factor columns
print(summary(data))
##       Age           Gender      Study_Hours    Class_Attendance Tuition   
##  Min.   :19.00   Female:1538   Min.   :0.000   Min.   :40.00    No :1545  
##  1st Qu.:20.00   Male  :1462   1st Qu.:2.000   1st Qu.:54.00    Yes:1455  
##  Median :21.00                 Median :4.000   Median :69.00              
##  Mean   :21.52                 Mean   :4.489   Mean   :68.87              
##  3rd Qu.:23.00                 3rd Qu.:7.000   3rd Qu.:84.00              
##  Max.   :24.00                 Max.   :9.000   Max.   :99.00              
##  Exam_Frequency  Assignment_Load  Sleep_Hours    Physical_Exercise
##  Min.   :1.000   Min.   :1.000   Min.   :4.000   No :1551         
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:5.000   Yes:1449         
##  Median :5.000   Median :5.000   Median :7.000                    
##  Mean   :5.026   Mean   :4.996   Mean   :6.519                    
##  3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:8.000                    
##  Max.   :9.000   Max.   :9.000   Max.   :9.000                    
##  Social_Media_Use  Screen_Time     Family_Income_Level Peer_Pressure  
##  Min.   :0.000    Min.   : 1.000   High  : 608         Min.   :1.000  
##  1st Qu.:2.000    1st Qu.: 3.000   Low   :1059         1st Qu.:3.000  
##  Median :3.000    Median : 6.000   Medium:1333         Median :5.000  
##  Mean   :3.487    Mean   : 5.918                       Mean   :4.984  
##  3rd Qu.:5.000    3rd Qu.: 9.000                       3rd Qu.:7.000  
##  Max.   :7.000    Max.   :11.000                       Max.   :9.000  
##  Family_Support  Anxiety_Level              University_Type  Stress_Score  
##  Min.   :1.000   Min.   :1.000   National University:1000   Min.   :-9.00  
##  1st Qu.:3.000   1st Qu.:3.000   Private University : 997   1st Qu.: 7.00  
##  Median :5.000   Median :5.000   Public University  :1003   Median :12.00  
##  Mean   :5.027   Mean   :5.001                              Mean   :11.92  
##  3rd Qu.:7.000   3rd Qu.:7.000                              3rd Qu.:17.00  
##  Max.   :9.000   Max.   :9.000                              Max.   :33.00  
##  Stress_Level 
##  High  : 326  
##  Low   :1243  
##  Medium:1431  
##               
##               
## 
# Outlier Examination
# Numeric columns used for outlier screening and for the multivariate
# techniques that follow
num_cols <- c(
  "Study_Hours", "Class_Attendance", "Exam_Frequency",
  "Assignment_Load", "Sleep_Hours", "Social_Media_Use",
  "Screen_Time", "Peer_Pressure", "Family_Support", "Anxiety_Level"
)
num_data <- data[num_cols]

# Standardization justification:
# Z-scores are used because the variables are on different units
# (e.g., hours vs frequency); this keeps variables with larger scales
# from dominating the variance.
scaled_data <- scale(num_data)

# Boxplots of the z-scores to visualize univariate outliers on a common axis
boxplot(
  scaled_data,
  main = "Standardized Boxplots for Outlier Detection",
  col = "lightblue",
  las = 2
)
# PRINCIPAL COMPONENT ANALYSIS (PCA)

# Justification: PCA is used for dimensionality reduction and for identifying
# the variables that explain the most variance in student stress.

# prcomp() standardizes internally (center = TRUE, scale. = TRUE), so it is
# given the raw numeric columns. Feeding it an already z-scored matrix would
# standardize twice and leave pca_res$center / pca_res$scale holding ~0 and 1
# instead of the real column means and standard deviations, which makes
# predict(pca_res, newdata) wrong for raw new observations.
pca_res <- prcomp(num_data, center = TRUE, scale. = TRUE)

# Scree plot: percentage of variance explained by each component
fviz_eig(pca_res, addlabels = TRUE, ylim = c(0, 50), main = "Scree Plot")

# Variables factor map: arrows and labels only, coloured by each variable's
# contribution to the displayed components
fviz_pca_var(pca_res,
             col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE, # nudge labels apart so they do not overlap
             geom = c("arrow", "text")) +
  theme_minimal() +
  theme(panel.grid = element_blank(), axis.line = element_line(colour = "grey70")) +
  labs(title = "PCA: Variables Factor Map")

# FACTOR ANALYSIS (FA)

# Justification: FA identifies latent, unobserved constructs (e.g., "Academic Load")
# that cannot be measured directly by single variables.

# Assumption Check: KMO and Bartlett's Test
# FA presupposes that the variables are correlated. Conventional rules of
# thumb: Bartlett's test of sphericity should be significant (p < 0.05) and
# the overall KMO measure of sampling adequacy should be at least 0.5.
# Both results are stored and checked so that a failed assumption is reported
# explicitly instead of being printed and then ignored.
bartlett_res <- cortest.bartlett(cor(num_data), n = nrow(num_data))
kmo_res <- KMO(cor(num_data))

if (!isTRUE(bartlett_res$p.value < 0.05)) {
  warning(
    "Bartlett's test is not significant (p = ",
    format(bartlett_res$p.value, digits = 3),
    "): the correlation matrix is not distinguishable from identity, ",
    "so factor analysis results should not be interpreted.",
    call. = FALSE
  )
}
if (!isTRUE(kmo_res$MSA >= 0.5)) {
  warning(
    "Overall KMO = ", format(kmo_res$MSA, digits = 2),
    " is below 0.5: sampling adequacy is unacceptable for factor analysis.",
    call. = FALSE
  )
}

print(bartlett_res)
## $chisq
## [1] 42.46741
## 
## $p.value
## [1] 0.5798296
## 
## $df
## [1] 45
print(kmo_res)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(num_data))
## Overall MSA =  0.49
## MSA for each item = 
##      Study_Hours Class_Attendance   Exam_Frequency  Assignment_Load 
##             0.50             0.49             0.50             0.50 
##      Sleep_Hours Social_Media_Use      Screen_Time    Peer_Pressure 
##             0.48             0.50             0.49             0.49 
##   Family_Support    Anxiety_Level 
##             0.49             0.49
# Determine the number of factors (Parallel Analysis)
# fa.parallel() prints its suggestion and returns it invisibly; the result is
# kept so the suggested factor count can be compared with the count fitted.
pa_res <- fa.parallel(num_data, fm = "ml", fa = "fa", main = "Parallel Analysis Scree Plot")

## Parallel analysis suggests that the number of factors =  0  and the number of components =  NA
# Number of factors to extract. Kept at 3 so the fitted model is unchanged,
# but flagged when parallel analysis supports fewer factors than that.
n_factors <- 3
if (!isTRUE(pa_res$nfact >= n_factors)) {
  warning(
    "Parallel analysis suggests ", pa_res$nfact, " factor(s) but ",
    n_factors, " are being extracted; treat the loadings as exploratory only.",
    call. = FALSE
  )
}

# Perform FA with Varimax rotation (maximum-likelihood extraction).
# NOTE(review): in the recorded loadings below every factor is carried by a
# single variable and cumulative variance is under 30%, which is consistent
# with the failed diagnostics rather than with real latent constructs.
fa_fit <- fa(num_data, nfactors = n_factors, rotate = "varimax", fm = "ml")
print(fa_fit$loadings, cutoff = 0.3)
## 
## Loadings:
##                  ML1    ML2    ML3   
## Study_Hours                          
## Class_Attendance                0.942
## Exam_Frequency                       
## Assignment_Load                      
## Sleep_Hours                          
## Social_Media_Use                     
## Screen_Time       0.978              
## Peer_Pressure                        
## Family_Support                       
## Anxiety_Level            0.992       
## 
##                  ML1   ML2   ML3
## SS loadings    1.011 0.993 0.929
## Proportion Var 0.101 0.099 0.093
## Cumulative Var 0.101 0.200 0.293
# Visualize factor loadings
fa.diagram(fa_fit, main = "Factor Analysis Structure Diagram")

#  LINEAR DISCRIMINANT ANALYSIS (LDA) 

# Justification: LDA is used to predict the categorical 'Stress_Level' (Low, Medium, High) 
# based on student behaviors and metrics.

# Fixed seed so the random train/test partition below is reproducible.
# It must stay ahead of createDataPartition().
set.seed(2026) 

# Split data into Training (70%) and Testing (30%)
# The partition is drawn on Stress_Level; list = FALSE requests row indices
# that can be used directly for subsetting (negated for the test rows).
train_idx <- createDataPartition(data$Stress_Level, p = 0.7, list = FALSE)
train_set <- data[train_idx, ]
test_set <- data[-train_idx, ]

# Build the LDA model
# Only six of the numeric predictors enter the model; it is fitted on the
# training rows only so the test rows stay unseen.
lda_model <- lda(Stress_Level ~ Study_Hours + Assignment_Load + Sleep_Hours + 
                   Screen_Time + Anxiety_Level + Peer_Pressure, 
                 data = train_set)

# Prints priors, group means, discriminant coefficients and proportion of trace
print(lda_model)
## Call:
## lda(Stress_Level ~ Study_Hours + Assignment_Load + Sleep_Hours + 
##     Screen_Time + Anxiety_Level + Peer_Pressure, data = train_set)
## 
## Prior probabilities of groups:
##      High       Low    Medium 
## 0.1089439 0.4143673 0.4766889 
## 
## Group means:
##        Study_Hours Assignment_Load Sleep_Hours Screen_Time Anxiety_Level
## High      4.397380        6.620087    5.947598    8.698690      6.388646
## Low       4.574053        4.021814    6.934558    4.383467      4.179104
## Medium    4.482036        5.513972    6.350299    6.601796      5.449102
##        Peer_Pressure
## High        4.860262
## Low         5.043628
## Medium      4.935130
## 
## Coefficients of linear discriminants:
##                          LD1          LD2
## Study_Hours     -0.008654337  0.005799269
## Assignment_Load -0.262087843  0.161292203
## Sleep_Hours      0.253490343 -0.216816454
## Screen_Time     -0.285419784 -0.232969623
## Anxiety_Level   -0.237273388  0.145206399
## Peer_Pressure    0.015753498 -0.030925385
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9991 0.0009
# Model Prediction & Accuracy Check
# Classify the held-out rows, then cross-tabulate predicted vs. actual
# Stress_Level and report accuracy statistics
lda_pred <- predict(lda_model, newdata = test_set)
conf_matrix <- confusionMatrix(
  data = lda_pred$class,
  reference = test_set$Stress_Level
)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low Medium
##     High     28   0     12
##     Low       0 273     90
##     Medium   69  99    327
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6993          
##                  95% CI : (0.6682, 0.7292)
##     No Information Rate : 0.4777          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4673          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: High Class: Low Class: Medium
## Sensitivity              0.28866     0.7339        0.7622
## Specificity              0.98502     0.8289        0.6418
## Pos Pred Value           0.70000     0.7521        0.6606
## Neg Pred Value           0.91958     0.8150        0.7469
## Prevalence               0.10802     0.4143        0.4777
## Detection Rate           0.03118     0.3040        0.3641
## Detection Prevalence     0.04454     0.4042        0.5512
## Balanced Accuracy        0.63684     0.7814        0.7020
# Visualize separation of stress levels
# The colour vector is the integer level code of train_set$Stress_Level
plot(
  lda_model,
  main = "LDA: Stress Level Group Separation",
  col = as.integer(train_set$Stress_Level)
)