library(psych)
## Warning: package 'psych' was built under R version 4.5.3
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
# Load data
# na.strings includes "" so that blank cells in the categorical columns (read
# as factors) become NA. With the default na.strings = "NA", read.csv() only
# turns blank fields into NA for logical/numeric columns; blank categorical
# cells would instead survive as a "" factor level and na.omit() below would
# not drop those rows.
# NOTE(review): assumes missing entries are stored as blank cells in the CSV;
# verify against the file.
# NOTE(review): any `##` results pasted elsewhere in this script may predate
# this change; re-run the script to refresh them.
data <- read.csv("C:/Users/ASUS/OneDrive/Desktop/Research work/StudentPerformanceFactors.csv",
                 stringsAsFactors = TRUE, na.strings = c("NA", ""))

# Handle missing values (variables with NA: Teacher_Quality,
# Parental_Education_Level, Distance_from_Home).
# Rows containing any NA are dropped (listwise deletion) rather than imputed,
# so every later analysis runs on complete cases only.
data_clean <- na.omit(data)

# Numeric student-performance metrics used as input to PCA and Factor Analysis
cont_var_names <- c(
  "Hours_Studied", "Attendance", "Sleep_Hours", "Previous_Scores",
  "Tutoring_Sessions", "Physical_Activity", "Exam_Score"
)
cont_vars <- data_clean[, cont_var_names]

# Standardize every column with scale()'s defaults (centre, then divide by the
# standard deviation), because PCA and FA are sensitive to measurement scale
scaled_data <- scale(cont_vars)

# ---------------------------------------------------------
# 2. PRINCIPAL COMPONENT ANALYSIS (PCA)
# ---------------------------------------------------------
# Purpose: reduce dimensionality and identify the main sources of variance
# among the student metrics.

# Fit the PCA on the standardized matrix. center/scale. are requested again
# here even though scaled_data has already been through scale().
pca_result <- prcomp(x = scaled_data, center = TRUE, scale. = TRUE)

# Per-component standard deviation and (cumulative) proportion of variance
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6     PC7
## Standard deviation     1.3285 1.0246 1.0108 1.0083 0.9852 0.9750 0.47471
## Proportion of Variance 0.2521 0.1500 0.1460 0.1452 0.1387 0.1358 0.03219
## Cumulative Proportion  0.2521 0.4021 0.5481 0.6933 0.8320 0.9678 1.00000

# Scree plot with labelled bars; y-axis fixed to 0-50
fviz_eig(
  pca_result,
  addlabels = TRUE,
  ylim = c(0, 50)
)

# Biplot for interpretation; variables are coloured by their contribution
# to the components using a three-colour gradient
fviz_pca_biplot(
  pca_result,
  repel = TRUE,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

# ---------------------------------------------------------
# 3. FACTOR ANALYSIS (FA)
# ---------------------------------------------------------
# Purpose: look for unobserved (latent) constructs behind the metrics, such as
# "Academic Engagement" vs "Lifestyle Balance".

# Factorability checks on the correlation matrix of the standardized data:
# Bartlett's test first, then the Kaiser-Meyer-Olkin (KMO) measure
cortest.bartlett(R = cor(scaled_data), n = nrow(scaled_data))
## $chisq
## [1] 6045.579
## 
## $p.value
## [1] 0
## 
## $df
## [1] 21
KMO(r = cor(scaled_data))
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(scaled_data))
## Overall MSA =  0.33
## MSA for each item = 
##     Hours_Studied        Attendance       Sleep_Hours   Previous_Scores 
##              0.27              0.33              0.61              0.19 
## Tutoring_Sessions Physical_Activity        Exam_Score 
##              0.19              0.17              0.39
# NOTE(review): the recorded overall MSA (0.33) is below the 0.5 level that is
# commonly treated as the minimum for factor analysis; confirm FA is
# appropriate for these variables.

# Parallel analysis (maximum-likelihood factoring, factors only) to suggest
# how many factors to retain
fa.parallel(x = scaled_data, fm = "ml", fa = "fa")

## Parallel analysis suggests that the number of factors =  4  and the number of components =  NA
# Fit a 2-factor maximum-likelihood model with varimax rotation for
# interpretability.
# NOTE(review): the recorded parallel analysis suggested 4 factors while 2 are
# extracted here, presumably to match the two hypothesised constructs; confirm
# this is intentional.
fa_fit <- fa(
  r = scaled_data,
  nfactors = 2,
  fm = "ml",
  rotate = "varimax"
)

# Show the loadings, hiding entries with absolute value below 0.3
print(fa_fit[["loadings"]], cutoff = 0.3)
## 
## Loadings:
##                   ML1    ML2   
## Hours_Studied             0.546
## Attendance         0.974       
## Sleep_Hours                    
## Previous_Scores                
## Tutoring_Sessions              
## Physical_Activity              
## Exam_Score         0.744  0.664
## 
##                  ML1   ML2
## SS loadings    1.519 0.871
## Proportion Var 0.217 0.124
## Cumulative Var 0.217 0.341
# Path diagram of the factor loadings
fa.diagram(fa.results = fa_fit)

# ---------------------------------------------------------
# 4. LINEAR DISCRIMINANT ANALYSIS (LDA)
# ---------------------------------------------------------
# Purpose: predict "Motivation_Level" from the numeric student metrics.

# Target variable: 'Motivation_Level' (levels High, Low, Medium).
# Seeded 70/30 train/test split created with caret on the target variable.
set.seed(123)
train_index <- createDataPartition(
  y = data_clean$Motivation_Level,
  p = 0.7,
  list = FALSE
)
train_data <- data_clean[train_index, ]
test_data <- data_clean[-train_index, ]

# Fit the LDA classifier on the training rows.
# NOTE(review): Physical_Activity is among the continuous variables used for
# PCA/FA but is not a predictor here; confirm the omission is intentional.
lda_model <- lda(
  formula = Motivation_Level ~ Hours_Studied + Attendance + Sleep_Hours +
    Previous_Scores + Tutoring_Sessions + Exam_Score,
  data = train_data
)

# Plot the training data on the fitted linear discriminants to inspect
# how well the classes separate
plot(lda_model)

# Predict on the held-out rows and compare predicted vs actual classes
lda_pred <- predict(lda_model, newdata = test_data)
confusionMatrix(
  data = lda_pred$class,
  reference = test_data$Motivation_Level
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low Medium
##     High      3   5      7
##     Low       0   0      0
##     Medium  392 576    998
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5053         
##                  95% CI : (0.483, 0.5275)
##     No Information Rate : 0.5073         
##     P-Value [Acc > NIR] : 0.5802         
##                                          
##                   Kappa : 6e-04          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
## 
## Statistics by Class:
## 
##                      Class: High Class: Low Class: Medium
## Sensitivity             0.007595     0.0000      0.993035
## Specificity             0.992434     1.0000      0.008197
## Pos Pred Value          0.200000        NaN      0.507630
## Neg Pred Value          0.800610     0.7067      0.533333
## Prevalence              0.199394     0.2933      0.507320
## Detection Rate          0.001514     0.0000      0.503786
## Detection Prevalence    0.007572     0.0000      0.992428
## Balanced Accuracy       0.500014     0.5000      0.500616
# NOTE(review): in the recorded run, accuracy (0.5053) is below the
# no-information rate (0.5073) and almost every case is predicted "Medium",
# so these predictors do not appear to separate the motivation classes.