library(psych)
## Warning: package 'psych' was built under R version 4.5.3
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
# Load data
# Blank CSV fields are read as NA (via na.strings). With the default setting a
# blank in a text column becomes an empty-string factor level rather than NA,
# and na.omit() below would silently keep those rows.
# NOTE(review): absolute, machine-specific path - consider a project-relative
# path so the script runs on other machines.
data <- read.csv(
  "C:/Users/ASUS/OneDrive/Desktop/Research work/StudentPerformanceFactors.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "NA")
)
# Handle missing values (the variables expected to contain them are
# Teacher_Quality, Parental_Education_Level and Distance_from_Home).
# Rows with any NA are dropped (listwise deletion); no imputation is performed.
# NOTE(review): printed results recorded later in this file may predate the
# na.strings setting above and should be regenerated after re-running.
data_clean <- na.omit(data)
# Subset the numeric variables used for PCA and Factor Analysis
# (numeric factors influencing student performance, plus the exam score itself)
cont_vars <- data_clean[, c("Hours_Studied", "Attendance", "Sleep_Hours",
                            "Previous_Scores", "Tutoring_Sessions",
                            "Physical_Activity", "Exam_Score")]
# Standardize each column to mean 0 / SD 1 (PCA and FA are scale-sensitive)
scaled_data <- scale(cont_vars)
# ---------------------------------------------------------
# 2. PRINCIPAL COMPONENT ANALYSIS (PCA)
# ---------------------------------------------------------
# Justification: Dimensionality reduction to identify the primary
# sources of variance in student metrics.
# Fit the PCA on the standardized matrix.
# scaled_data has already been centred and scaled, so center/scale. = TRUE
# standardize it a second time (redundant, but it does not change the result).
pca_result <- prcomp(
  x = scaled_data,
  center = TRUE,
  scale. = TRUE
)
# Variance explained: standard deviation, proportion of variance and
# cumulative proportion for each principal component
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.3285 1.0246 1.0108 1.0083 0.9852 0.9750 0.47471
## Proportion of Variance 0.2521 0.1500 0.1460 0.1452 0.1387 0.1358 0.03219
## Cumulative Proportion 0.2521 0.4021 0.5481 0.6933 0.8320 0.9678 1.00000
# Scree plot of the variance explained by each component, with value labels
fviz_eig(
  pca_result,
  addlabels = TRUE,
  ylim = c(0, 50)
)

# Biplot (observations and variable arrows) for interpretation
fviz_pca_biplot(
  pca_result,
  repel = TRUE,
  col.var = "contrib", # colour variables by their contribution to the PCs
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

# ---------------------------------------------------------
# 3. FACTOR ANALYSIS (FA)
# ---------------------------------------------------------
# Justification: Identifying unobserved (latent) constructs such as
# "Academic Engagement" vs "Lifestyle Balance".
# Factorability checks: Bartlett's test of sphericity (here) and KMO (next call)
# Bartlett's test is run on the correlation matrix of the standardized
# variables, with the number of observations as the sample size.
cortest.bartlett(
  R = cor(scaled_data),
  n = nrow(scaled_data)
)
## $chisq
## [1] 6045.579
##
## $p.value
## [1] 0
##
## $df
## [1] 21
# Kaiser-Meyer-Olkin measure of sampling adequacy on the same correlation matrix
# NOTE(review): the recorded overall MSA (0.33) is below the 0.5 level usually
# treated as the minimum for factor analysis - interpret the factor solution
# with caution.
KMO(r = cor(scaled_data))
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(scaled_data))
## Overall MSA = 0.33
## MSA for each item =
## Hours_Studied Attendance Sleep_Hours Previous_Scores
## 0.27 0.33 0.61 0.19
## Tutoring_Sessions Physical_Activity Exam_Score
## 0.19 0.17 0.39
# Determine number of factors (Parallel Analysis)
# Parallel analysis compares the observed eigenvalues with those obtained from
# random (simulated / resampled) data, so its suggestion can vary between runs.
# The seed is fixed to make the result reproducible; the RNG is reseeded before
# the train/test split later in the script, so that split is unaffected.
set.seed(123)
fa.parallel(scaled_data, fm = "ml", fa = "fa")

## Parallel analysis suggests that the number of factors = 4 and the number of components = NA
# Two-factor maximum-likelihood solution with an orthogonal varimax rotation
# (chosen for interpretability).
# NOTE(review): the parallel analysis recorded above suggested 4 factors, while
# 2 are extracted here - presumably to match the two hypothesised constructs;
# confirm the choice is intentional.
fa_fit <- fa(
  r = scaled_data,
  nfactors = 2,
  rotate = "varimax",
  fm = "ml"
)
# Print the loading matrix, blanking entries whose absolute value is below 0.3
print(fa_fit[["loadings"]], cutoff = 0.3)
##
## Loadings:
## ML1 ML2
## Hours_Studied 0.546
## Attendance 0.974
## Sleep_Hours
## Previous_Scores
## Tutoring_Sessions
## Physical_Activity
## Exam_Score 0.744 0.664
##
## ML1 ML2
## SS loadings 1.519 0.871
## Proportion Var 0.217 0.124
## Cumulative Var 0.217 0.341
# Draw the factor solution as a path diagram (factors linked to their variables)
fa.diagram(fa.results = fa_fit)

# ---------------------------------------------------------
# 4. LINEAR DISCRIMINANT ANALYSIS (LDA)
# ---------------------------------------------------------
# Justification: To predict "Motivation_Level" based on student metrics.
# Outcome to predict: Motivation_Level (levels High / Low / Medium).
# Split the cleaned data 70/30 into training and test rows, sampling within
# the outcome's classes; the seed is fixed so the split is reproducible.
set.seed(123)
train_index <- createDataPartition(
  y = data_clean[["Motivation_Level"]],
  p = 0.7,
  list = FALSE
)
train_data <- data_clean[train_index, ]
test_data <- data_clean[-train_index, ]
# Fit the LDA classifier on the training rows
lda_model <- lda(
  Motivation_Level ~ Hours_Studied + Attendance + Sleep_Hours +
    Previous_Scores + Tutoring_Sessions + Exam_Score,
  data = train_data
)
# Plot the fitted model's discriminant scores for the training data.
# NOTE(review): this is a visual look at class separation, not a formal check
# of LDA's assumptions and not a partition-boundary plot.
plot(lda_model)

# Predict the held-out rows and compare predicted with observed classes
lda_pred <- predict(lda_model, newdata = test_data)
confusionMatrix(
  data = lda_pred[["class"]],
  reference = test_data[["Motivation_Level"]]
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low Medium
## High 3 5 7
## Low 0 0 0
## Medium 392 576 998
##
## Overall Statistics
##
## Accuracy : 0.5053
## 95% CI : (0.483, 0.5275)
## No Information Rate : 0.5073
## P-Value [Acc > NIR] : 0.5802
##
## Kappa : 6e-04
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Low Class: Medium
## Sensitivity 0.007595 0.0000 0.993035
## Specificity 0.992434 1.0000 0.008197
## Pos Pred Value 0.200000 NaN 0.507630
## Neg Pred Value 0.800610 0.7067 0.533333
## Prevalence 0.199394 0.2933 0.507320
## Detection Rate 0.001514 0.0000 0.503786
## Detection Prevalence 0.007572 0.0000 0.992428
## Balanced Accuracy 0.500014 0.5000 0.500616