STA4503_Mini Project

# ---------------------------------------------------------
# 1. DATA LOADING & PRE-PROCESSING
# ---------------------------------------------------------
library(psych)

## Warning: package 'psych' was built under R version 4.5.3

library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 4.5.3

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.5.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.5.3

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Welcome to factoextra!

## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

library(MASS)

## Warning: package 'MASS' was built under R version 4.5.3

library(caret)

## Warning: package 'caret' was built under R version 4.5.3

## Loading required package: lattice

# Read the survey data; character columns are converted to factors on import
data <- read.csv(
  file = "C:/Users/ASUS/OneDrive/New folder/university_student_stress_dataset.csv",
  stringsAsFactors = TRUE
)

# Continuous indicators of student pressure and lifestyle, used as the
# input variables for both PCA and factor analysis
num_cols <- c(
  "Study_Hours",
  "Class_Attendance",
  "Exam_Frequency",
  "Assignment_Load",
  "Sleep_Hours",
  "Social_Media_Use",
  "Screen_Time",
  "Peer_Pressure",
  "Family_Support",
  "Anxiety_Level"
)

# Keep only those columns (result is still a data frame)
data_num <- data[num_cols]

# Standardize each column to mean 0 / sd 1, because the variables are
# recorded in different units (e.g., attendance vs. hours of sleep)
scaled_data <- scale(data_num, center = TRUE, scale = TRUE)

# ---------------------------------------------------------
# 2. ASSUMPTION CHECKING (KMO & BARTLETT'S)
# ---------------------------------------------------------
# Bartlett's test of sphericity.
# H0: the correlation matrix is an identity matrix (the variables are
# uncorrelated). PCA / factor analysis are only worthwhile when H0 is
# rejected, i.e. when p < 0.05.
bartlett_test <- cortest.bartlett(cor(data_num), n = nrow(data_num))
print(bartlett_test)

# Make a failed check visible instead of silently continuing. !isTRUE() also
# catches an NA p-value (e.g. when the correlation matrix contains NA).
if (!isTRUE(bartlett_test$p.value < 0.05)) {
  warning(
    "Bartlett's test is not significant (p = ",
    format(bartlett_test$p.value, digits = 3),
    "): the correlation matrix cannot be distinguished from an identity ",
    "matrix, so PCA/EFA results should be interpreted with caution.",
    call. = FALSE
  )
}

## $chisq
## [1] 42.46741
## 
## $p.value
## [1] 0.5798296
## 
## $df
## [1] 45

# Kaiser-Meyer-Olkin measure of sampling adequacy (MSA).
# Kaiser's (1974) bands: >= 0.9 marvelous, 0.8s meritorious, 0.7s middling,
# 0.6s mediocre, 0.5s miserable, < 0.5 unacceptable. A value of at least 0.6
# is the commonly recommended minimum before running a factor analysis.
kmo_min <- 0.6
kmo_test <- KMO(cor(data_num))
print(kmo_test)

# Flag inadequate sampling adequacy explicitly. !isTRUE() also covers an NA
# result so the check itself can never abort the script.
if (!isTRUE(kmo_test$MSA >= kmo_min)) {
  warning(
    "Overall KMO MSA = ", format(kmo_test$MSA, digits = 2),
    " is below the recommended minimum of ", kmo_min,
    ": the variables share too little common variance for a reliable ",
    "factor analysis.",
    call. = FALSE
  )
}

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(data_num))
## Overall MSA =  0.49
## MSA for each item = 
##      Study_Hours Class_Attendance   Exam_Frequency  Assignment_Load 
##             0.50             0.49             0.50             0.50 
##      Sleep_Hours Social_Media_Use      Screen_Time    Peer_Pressure 
##             0.48             0.50             0.49             0.49 
##   Family_Support    Anxiety_Level 
##             0.49             0.49

# ---------------------------------------------------------
# 3. TECHNIQUE 1: PRINCIPAL COMPONENT ANALYSIS (PCA)
# ---------------------------------------------------------
# Fit the PCA on the standardized matrix. scaled_data is already the output
# of scale(), so centering/scaling again here is a redundant re-standardization.
pca_res <- prcomp(
  scaled_data,
  center = TRUE,
  scale. = TRUE
)

# Scree plot: share of variance per component, used to pick how many to keep
fviz_eig(
  pca_res,
  addlabels = TRUE,
  main = "Scree Plot of Variance"
)

# Biplot of observations and variable arrows; arrows are coloured by each
# variable's contribution using a three-colour gradient
contrib_palette <- c("#00AFBB", "#E7B800", "#FC4E07")
fviz_pca_biplot(
  pca_res,
  repel = TRUE,
  col.var = "contrib",
  gradient.cols = contrib_palette
)

# ---------------------------------------------------------
# 4. TECHNIQUE 2: EXPLORATORY FACTOR ANALYSIS (EFA)
# ---------------------------------------------------------
# Parallel analysis (maximum-likelihood factoring, factor solution only) to
# estimate how many factors the data support
fa_parallel <- psych::fa.parallel(
  data_num,
  fm = "ml",
  fa = "fa"
)

## Parallel analysis suggests that the number of factors =  0  and the number of components =  NA

# Exploratory factor analysis: maximum-likelihood extraction with varimax
# rotation for interpretability.
# The number of factors is chosen by the analyst; it is kept in one named
# constant so it can be compared against the parallel-analysis estimate.
n_factors <- 3

# Surface any disagreement with the parallel analysis instead of ignoring it.
# isTRUE() keeps the check silent if the estimate is missing (NULL/NA).
n_suggested <- fa_parallel$nfact
if (isTRUE(n_suggested != n_factors)) {
  warning(
    "Extracting ", n_factors, " factors although parallel analysis ",
    "suggested ", n_suggested, "; interpret the loadings with caution.",
    call. = FALSE
  )
}

fa_fit <- fa(data_num, nfactors = n_factors, rotate = "varimax", fm = "ml")

# Print the loading matrix, hiding loadings below 0.3 in absolute value
print(fa_fit$loadings, cutoff = 0.3)

## 
## Loadings:
##                  ML1    ML2    ML3   
## Study_Hours                          
## Class_Attendance                0.942
## Exam_Frequency                       
## Assignment_Load                      
## Sleep_Hours                          
## Social_Media_Use                     
## Screen_Time       0.978              
## Peer_Pressure                        
## Family_Support                       
## Anxiety_Level            0.992       
## 
##                  ML1   ML2   ML3
## SS loadings    1.011 0.993 0.929
## Proportion Var 0.101 0.099 0.093
## Cumulative Var 0.101 0.200 0.293

# Path diagram of the fitted factor model (factors and the items loading on them)
psych::fa.diagram(fa_fit)

# ---------------------------------------------------------
# 5. TECHNIQUE 3: DISCRIMINANT ANALYSIS (LDA)
# ---------------------------------------------------------
# Fix the RNG so the train/test split is reproducible
set.seed(2026)

# Stratified 70/30 split on the outcome; list = FALSE returns row indices
train_idx <- createDataPartition(
  y = data[["Stress_Level"]],
  p = 0.7,
  list = FALSE
)
train_set <- data[train_idx, ]
test_set <- data[-train_idx, ]

# Linear discriminant model: Stress_Level explained by six of the
# continuous workload / lifestyle variables, fitted on the training rows only
lda_model <- lda(
  Stress_Level ~
    Study_Hours +
    Assignment_Load +
    Sleep_Hours +
    Screen_Time +
    Anxiety_Level +
    Peer_Pressure,
  data = train_set
)

# Evaluate on the held-out rows: predicted class vs. observed class
lda_pred <- predict(lda_model, newdata = test_set)
conf_matrix <- confusionMatrix(
  data = lda_pred$class,
  reference = test_set$Stress_Level
)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low Medium
##     High     28   0     12
##     Low       0 273     90
##     Medium   69  99    327
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6993          
##                  95% CI : (0.6682, 0.7292)
##     No Information Rate : 0.4777          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4673          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: High Class: Low Class: Medium
## Sensitivity              0.28866     0.7339        0.7622
## Specificity              0.98502     0.8289        0.6418
## Pos Pred Value           0.70000     0.7521        0.6606
## Neg Pred Value           0.91958     0.8150        0.7469
## Prevalence               0.10802     0.4143        0.4777
## Detection Rate           0.03118     0.3040        0.3641
## Detection Prevalence     0.04454     0.4042        0.5512
## Balanced Accuracy        0.63684     0.7814        0.7020

STA4503_Mini Project

Lakshan Dulanga Samaraweera

2026-04-11