library(dplyr)
library(ggplot2)
library(e1071)
library(corrplot)
library(caret)
library(visdat)
library(car)
library(PRROC)
library(rpart)
library(rpart.plot)
library(pROC)
library(randomForest)
library(reshape2)
library(scales)
# Read the dataset from the working directory
stu_df <- read.csv("student_habits_performance.csv")
# Inspect variable names, types, and a preview of values
str(stu_df)
## 'data.frame': 1000 obs. of 16 variables:
## $ student_id : chr "S1000" "S1001" "S1002" "S1003" ...
## $ age : int 23 20 21 23 19 24 21 21 23 18 ...
## $ gender : chr "Female" "Female" "Male" "Female" ...
## $ study_hours_per_day : num 0 6.9 1.4 1 5 7.2 5.6 4.3 4.4 4.8 ...
## $ social_media_hours : num 1.2 2.8 3.1 3.9 4.4 1.3 1.5 1 2.2 3.1 ...
## $ netflix_hours : num 1.1 2.3 1.3 1 0.5 0 1.4 2 1.7 1.3 ...
## $ part_time_job : chr "No" "No" "No" "No" ...
## $ attendance_percentage : num 85 97.3 94.8 71 90.9 82.9 85.8 77.7 100 95.4 ...
## $ sleep_hours : num 8 4.6 8 9.2 4.9 7.4 6.5 4.6 7.1 7.5 ...
## $ diet_quality : chr "Fair" "Good" "Poor" "Poor" ...
## $ exercise_frequency : int 6 6 1 4 3 1 2 0 3 5 ...
## $ parental_education_level : chr "Master" "High School" "High School" "Master" ...
## $ internet_quality : chr "Average" "Average" "Poor" "Good" ...
## $ mental_health_rating : int 8 8 1 1 1 4 4 8 1 10 ...
## $ extracurricular_participation: chr "Yes" "No" "No" "Yes" ...
## $ exam_score : num 56.2 100 34.3 26.8 66.4 100 89.8 72.6 78.9 100 ...
# Summary statistics for all variables
summary(stu_df)
## student_id age gender study_hours_per_day
## Length:1000 Min. :17.00 Length:1000 Min. :0.00
## Class :character 1st Qu.:18.75 Class :character 1st Qu.:2.60
## Mode :character Median :20.00 Mode :character Median :3.50
## Mean :20.50 Mean :3.55
## 3rd Qu.:23.00 3rd Qu.:4.50
## Max. :24.00 Max. :8.30
## social_media_hours netflix_hours part_time_job attendance_percentage
## Min. :0.000 Min. :0.000 Length:1000 Min. : 56.00
## 1st Qu.:1.700 1st Qu.:1.000 Class :character 1st Qu.: 78.00
## Median :2.500 Median :1.800 Mode :character Median : 84.40
## Mean :2.506 Mean :1.820 Mean : 84.13
## 3rd Qu.:3.300 3rd Qu.:2.525 3rd Qu.: 91.03
## Max. :7.200 Max. :5.400 Max. :100.00
## sleep_hours diet_quality exercise_frequency parental_education_level
## Min. : 3.20 Length:1000 Min. :0.000 Length:1000
## 1st Qu.: 5.60 Class :character 1st Qu.:1.000 Class :character
## Median : 6.50 Mode :character Median :3.000 Mode :character
## Mean : 6.47 Mean :3.042
## 3rd Qu.: 7.30 3rd Qu.:5.000
## Max. :10.00 Max. :6.000
## internet_quality mental_health_rating extracurricular_participation
## Length:1000 Min. : 1.000 Length:1000
## Class :character 1st Qu.: 3.000 Class :character
## Mode :character Median : 5.000 Mode :character
## Mean : 5.438
## 3rd Qu.: 8.000
## Max. :10.000
## exam_score
## Min. : 18.40
## 1st Qu.: 58.48
## Median : 70.50
## Mean : 69.60
## 3rd Qu.: 81.33
## Max. :100.00
The dataset contains 1,000 students aged 17-24. Students study an average of 3.5 hours per day, spend ~2.5 hours on social media, and sleep ~6.5 hours. Most students do not hold a part-time job. Exam scores range from 18 to 100 with a mean of ~69.6.
# Preview the first 6 rows
head(stu_df)
## student_id age gender study_hours_per_day social_media_hours netflix_hours
## 1 S1000 23 Female 0.0 1.2 1.1
## 2 S1001 20 Female 6.9 2.8 2.3
## 3 S1002 21 Male 1.4 3.1 1.3
## 4 S1003 23 Female 1.0 3.9 1.0
## 5 S1004 19 Female 5.0 4.4 0.5
## 6 S1005 24 Male 7.2 1.3 0.0
## part_time_job attendance_percentage sleep_hours diet_quality
## 1 No 85.0 8.0 Fair
## 2 No 97.3 4.6 Good
## 3 No 94.8 8.0 Poor
## 4 No 71.0 9.2 Poor
## 5 No 90.9 4.9 Fair
## 6 No 82.9 7.4 Fair
## exercise_frequency parental_education_level internet_quality
## 1 6 Master Average
## 2 6 High School Average
## 3 1 High School Poor
## 4 4 Master Good
## 5 3 Master Good
## 6 1 Master Average
## mental_health_rating extracurricular_participation exam_score
## 1 8 Yes 56.2
## 2 8 No 100.0
## 3 1 No 34.3
## 4 1 Yes 26.8
## 5 1 No 66.4
## 6 4 No 100.0
vis_dat(stu_df)
anyNA(stu_df)
## [1] FALSE
No missing values were found. The dataset is clean and ready for preprocessing.
stu_df$gender <- as.factor(stu_df$gender)
stu_df$part_time_job <- as.factor(stu_df$part_time_job)
stu_df$diet_quality <- as.factor(stu_df$diet_quality)
stu_df$parental_education_level <- as.factor(stu_df$parental_education_level)
stu_df$internet_quality <- as.factor(stu_df$internet_quality)
stu_df$extracurricular_participation <- as.factor(stu_df$extracurricular_participation)
str(stu_df)
## 'data.frame': 1000 obs. of 16 variables:
## $ student_id : chr "S1000" "S1001" "S1002" "S1003" ...
## $ age : int 23 20 21 23 19 24 21 21 23 18 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 1 1 2 1 1 2 1 1 1 1 ...
## $ study_hours_per_day : num 0 6.9 1.4 1 5 7.2 5.6 4.3 4.4 4.8 ...
## $ social_media_hours : num 1.2 2.8 3.1 3.9 4.4 1.3 1.5 1 2.2 3.1 ...
## $ netflix_hours : num 1.1 2.3 1.3 1 0.5 0 1.4 2 1.7 1.3 ...
## $ part_time_job : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...
## $ attendance_percentage : num 85 97.3 94.8 71 90.9 82.9 85.8 77.7 100 95.4 ...
## $ sleep_hours : num 8 4.6 8 9.2 4.9 7.4 6.5 4.6 7.1 7.5 ...
## $ diet_quality : Factor w/ 3 levels "Fair","Good",..: 1 2 3 3 1 1 2 1 2 2 ...
## $ exercise_frequency : int 6 6 1 4 3 1 2 0 3 5 ...
## $ parental_education_level : Factor w/ 4 levels "Bachelor","High School",..: 3 2 2 3 3 3 3 1 1 1 ...
## $ internet_quality : Factor w/ 3 levels "Average","Good",..: 1 1 3 2 2 1 3 1 2 2 ...
## $ mental_health_rating : int 8 8 1 1 1 4 4 8 1 10 ...
## $ extracurricular_participation: Factor w/ 2 levels "No","Yes": 2 1 1 2 1 1 1 1 1 2 ...
## $ exam_score : num 56.2 100 34.3 26.8 66.4 100 89.8 72.6 78.9 100 ...
# student_id is just a row identifier with no predictive information
stu_df <- stu_df |> dplyr::select(-student_id)
# Loop through every numeric column and plot a histogram with a density overlay
# This lets us visually detect skewness and extreme outliers
for (col in names(stu_df)) {
if (is.numeric(stu_df[[col]])) {
hist(stu_df[[col]],
main = paste("Histogram of", col),
xlab = col,
col = "lightblue",
border = "black",
probability = TRUE)
# Red density curve shows the overall distribution shape
lines(density(stu_df[[col]], na.rm = TRUE), col = "red", lwd = 2)
}
}
Distributions show mild skewness for some variables, but no extreme outliers require removal. All observations are retained.
for (col in colnames(stu_df)) {
if (is.factor(stu_df[[col]])) {
print(
ggplot(stu_df, aes(x = .data[[col]])) +
geom_bar(fill = "lightblue") +
geom_text(stat = "count",
aes(label = after_stat(count)),
vjust = -0.5,
color = "black") +
labs(title = paste("Distribution of", col), x = col, y = "Count") +
theme_minimal()
)
}
}
# Compute Pearson correlation between all numeric predictors.
cor_matrix <- cor(stu_df[, sapply(stu_df, is.numeric)])
corrplot(cor_matrix,
method = "color",
addCoef.col = "lightgrey",
tl.col = "black",
number.cex = 0.7)
study_hours_per_day shows the strongest correlation with
exam_score. Note that exam_score will be
removed before modeling — its correlations here are shown for
exploratory context only.
# Students scoring >= 70 are labelled "pass", below 70 are "fail"
threshold <- 70
stu_df$pass_fail <- ifelse(stu_df$exam_score >= threshold, "pass", "fail")
stu_df$pass_fail <- as.factor(stu_df$pass_fail)
str(stu_df)
## 'data.frame': 1000 obs. of 16 variables:
## $ age : int 23 20 21 23 19 24 21 21 23 18 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 1 1 2 1 1 2 1 1 1 1 ...
## $ study_hours_per_day : num 0 6.9 1.4 1 5 7.2 5.6 4.3 4.4 4.8 ...
## $ social_media_hours : num 1.2 2.8 3.1 3.9 4.4 1.3 1.5 1 2.2 3.1 ...
## $ netflix_hours : num 1.1 2.3 1.3 1 0.5 0 1.4 2 1.7 1.3 ...
## $ part_time_job : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...
## $ attendance_percentage : num 85 97.3 94.8 71 90.9 82.9 85.8 77.7 100 95.4 ...
## $ sleep_hours : num 8 4.6 8 9.2 4.9 7.4 6.5 4.6 7.1 7.5 ...
## $ diet_quality : Factor w/ 3 levels "Fair","Good",..: 1 2 3 3 1 1 2 1 2 2 ...
## $ exercise_frequency : int 6 6 1 4 3 1 2 0 3 5 ...
## $ parental_education_level : Factor w/ 4 levels "Bachelor","High School",..: 3 2 2 3 3 3 3 1 1 1 ...
## $ internet_quality : Factor w/ 3 levels "Average","Good",..: 1 1 3 2 2 1 3 1 2 2 ...
## $ mental_health_rating : int 8 8 1 1 1 4 4 8 1 10 ...
## $ extracurricular_participation: Factor w/ 2 levels "No","Yes": 2 1 1 2 1 1 1 1 1 2 ...
## $ exam_score : num 56.2 100 34.3 26.8 66.4 100 89.8 72.6 78.9 100 ...
## $ pass_fail : Factor w/ 2 levels "fail","pass": 1 2 1 1 1 2 2 2 2 2 ...
# Check the class balance
stu_df |>
ggplot(aes(x = pass_fail, fill = pass_fail)) +
geom_bar() +
scale_fill_manual(values = c("fail" = "lightgreen", "pass" = "lightblue")) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = "Pass/Fail Distribution", x = "Outcome", y = "Count") +
theme_minimal() +
theme(legend.position = "none")
The dataset is well-balanced: 511 pass and 489 fail. This is good for classification modeling as neither class dominates.
ggplot(stu_df, aes(x = study_hours_per_day, fill = pass_fail)) +
geom_histogram(binwidth = 0.5, alpha = 0.7, position = "dodge") +
scale_fill_manual(values = c("fail" = "lightgreen", "pass" = "lightblue")) +
labs(title = "Distribution of Study Hours per Day by Pass/Fail",
x = "Study Hours per Day",
y = "Frequency") +
theme_minimal()
ggplot(stu_df, aes(x = study_hours_per_day, y = exam_score, color = pass_fail)) +
geom_point(alpha = 0.6) +
scale_color_manual(values = c("fail" = "lightgreen", "pass" = "lightblue")) +
labs(title = "Study Hours vs Exam Score",
x = "Study Hours per Day",
y = "Exam Score") +
theme_minimal()
Students who study more tend to pass. Most passing students study 3+ hours per day while failing students cluster below 3 hours. Study hours is expected to be the strongest predictor in our models.
ggplot(stu_df, aes(x = pass_fail, y = mental_health_rating, fill = pass_fail)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("fail" = "lightgreen", "pass" = "lightblue")) +
labs(title = "Mental Health Rating by Pass/Fail",
x = "Outcome",
y = "Mental Health Rating (1-10)") +
theme_minimal() +
theme(legend.position = "none")
Passing students have a noticeably higher median mental health rating (7) compared to failing students (4). Mental well being plays a meaningful role in academic outcomes.
ggplot(stu_df, aes(x = part_time_job, y = exam_score, fill = part_time_job)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("No" = "lightblue", "Yes" = "lightgreen")) +
labs(title = "Exam Scores by Part-Time Job Status",
x = "Has Part-Time Job",
y = "Exam Score") +
theme_minimal() +
theme(legend.position = "none")
The medians are very close (~70.6 no job vs ~69.2 with job), suggesting part-time work has a smaller effect on academic performance than commonly assumed.
ggplot(stu_df, aes(x = attendance_percentage, y = exam_score, color = pass_fail)) +
geom_point(alpha = 0.5) +
scale_color_manual(values = c("fail" = "lightgreen", "pass" = "lightblue")) +
labs(title = "Attendance Percentage vs Exam Score",
x = "Attendance Percentage",
y = "Exam Score") +
theme_minimal()
Unlike study hours, attendance shows a weaker and more scattered relationship with exam scores. Both passing and failing students span the full attendance range, confirming that showing up alone is not enough — study habits matter more.
stu_df <- stu_df |> dplyr::select(-exam_score)
# 70% training, 30% test
set.seed(28)
train_index <- createDataPartition(stu_df$pass_fail, p = 0.7, list = FALSE)
stu_train <- stu_df[train_index, ]
stu_test <- stu_df[-train_index, ]
cat("Training set size:", nrow(stu_train), "\n")
## Training set size: 701
cat("Test set size: ", nrow(stu_test), "\n")
## Test set size: 299
# ------------------------------------------------------------------
# plot_confusion_heatmap()
# ------------------------------------------------------------------
plot_confusion_heatmap <- function(pred_classes, true_classes, model_name) {
cm <- confusionMatrix(pred_classes, true_classes)
cm_tbl <- as.data.frame(cm$table)
ggplot(cm_tbl, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 6, fontface = "bold") +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = paste("Confusion Matrix Heatmap -", model_name),
x = "Actual", y = "Predicted", fill = "Count") +
theme_minimal(base_size = 13)
}
# ------------------------------------------------------------------
# plot_actual_vs_predicted()
# ------------------------------------------------------------------
plot_actual_vs_predicted <- function(pred_classes, true_classes, model_name) {
df <- data.frame(
Actual = true_classes,
Predicted = pred_classes,
Correct = ifelse(pred_classes == true_classes, "Correct", "Incorrect")
)
ggplot(df, aes(x = Actual, y = Predicted, color = Correct)) +
geom_jitter(width = 0.2, height = 0.2, alpha = 0.6, size = 2) +
scale_color_manual(values = c("Correct" = "steelblue", "Incorrect" = "tomato")) +
labs(title = paste("Actual vs Predicted -", model_name),
x = "Actual Class", y = "Predicted Class", color = "Result") +
theme_minimal(base_size = 13)
}
# ------------------------------------------------------------------
# plot_prob_distribution()
# ------------------------------------------------------------------
plot_prob_distribution <- function(probs, true_classes, model_name) {
data.frame(prob = probs, actual = true_classes) |>
ggplot(aes(x = prob, fill = actual)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("fail" = "lightgreen", "pass" = "lightblue")) +
geom_vline(xintercept = 0.5, linetype = "dashed", color = "red") +
labs(title = paste("Predicted Probability Distribution -", model_name),
x = "Predicted Probability of Pass", y = "Density", fill = "Actual") +
theme_minimal()
}
# ------------------------------------------------------------------
# get_metrics()
# Extracts key metrics from a confusionMatrix object into a named vector.
# ------------------------------------------------------------------
get_metrics <- function(pred, truth) {
cm <- confusionMatrix(pred, truth)
c(Accuracy = round(cm$overall["Accuracy"], 4),
Sensitivity = round(cm$byClass["Sensitivity"], 4),
Specificity = round(cm$byClass["Specificity"], 4),
Kappa = round(cm$overall["Kappa"], 4),
Balanced_Accuracy = round(cm$byClass["Balanced Accuracy"], 4))
}
We use all available predictors (~ .)
in every model. After removing student_id and
exam_score, all remaining columns are used — including
categorical variables like diet_quality,
internet_quality, parental_education_level,
and part_time_job. R automatically handles dummy encoding
for factors.
10-fold cross-validation is applied to every model
using trainControl. This gives a more reliable performance
estimate by averaging results across 10 different training splits.
# Shared CV control object used by all models
# method = "cv" means k-fold cross-validation
# number = 10 means 10 folds
set.seed(28)
cv_ctrl <- trainControl(method = "cv", number = 10)
stu_glm_full <- glm(pass_fail ~ .,
data = stu_train,
family = binomial)
summary(stu_glm_full)
##
## Call:
## glm(formula = pass_fail ~ ., family = binomial, data = stu_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.68860 2.72102 -7.971 1.58e-15 ***
## age -0.02377 0.06312 -0.377 0.706478
## genderMale 0.09222 0.30181 0.306 0.759945
## genderOther -0.81191 0.62196 -1.305 0.191753
## study_hours_per_day 3.40461 0.30855 11.034 < 2e-16 ***
## social_media_hours -1.03316 0.15340 -6.735 1.64e-11 ***
## netflix_hours -0.92704 0.15672 -5.915 3.31e-09 ***
## part_time_jobYes 0.08248 0.33393 0.247 0.804904
## attendance_percentage 0.05615 0.01509 3.722 0.000198 ***
## sleep_hours 0.71841 0.12755 5.632 1.78e-08 ***
## diet_qualityGood -0.60894 0.32777 -1.858 0.063195 .
## diet_qualityPoor -0.50223 0.39910 -1.258 0.208252
## exercise_frequency 0.52277 0.08703 6.007 1.89e-09 ***
## parental_education_levelHigh School -0.19963 0.33361 -0.598 0.549568
## parental_education_levelMaster -0.34232 0.43039 -0.795 0.426391
## parental_education_levelNone -0.80168 0.54592 -1.468 0.141972
## internet_qualityGood 0.14929 0.32366 0.461 0.644602
## internet_qualityPoor 0.19007 0.44255 0.429 0.667569
## mental_health_rating 0.74413 0.07660 9.715 < 2e-16 ***
## extracurricular_participationYes -0.11368 0.31390 -0.362 0.717243
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 971.47 on 700 degrees of freedom
## Residual deviance: 318.98 on 681 degrees of freedom
## AIC: 358.98
##
## Number of Fisher Scoring iterations: 7
knitr::kable(vif(stu_glm_full), caption = "VIF - Full Logistic Regression")
| GVIF | Df | GVIF^(1/(2*Df)) | |
|---|---|---|---|
| age | 1.032063 | 1 | 1.015905 |
| gender | 1.171575 | 2 | 1.040381 |
| study_hours_per_day | 3.299033 | 1 | 1.816324 |
| social_media_hours | 1.503924 | 1 | 1.226346 |
| netflix_hours | 1.323892 | 1 | 1.150605 |
| part_time_job | 1.033277 | 1 | 1.016502 |
| attendance_percentage | 1.158266 | 1 | 1.076228 |
| sleep_hours | 1.221159 | 1 | 1.105061 |
| diet_quality | 1.137166 | 2 | 1.032657 |
| exercise_frequency | 1.460380 | 1 | 1.208462 |
| parental_education_level | 1.163984 | 3 | 1.025631 |
| internet_quality | 1.155062 | 2 | 1.036696 |
| mental_health_rating | 2.418803 | 1 | 1.555250 |
| extracurricular_participation | 1.055718 | 1 | 1.027481 |
# Predict probabilities on the test set
pred_glm_full_prob <- predict(stu_glm_full, newdata = stu_test, type = "response")
# Convert to class labels using 0.5 threshold
# P(pass) > 0.5 -> predict "pass"; otherwise predict "fail"
# 0.5 is the standard default decision boundary
pred_glm_full_class <- as.factor(ifelse(pred_glm_full_prob > 0.5, "pass", "fail"))
confusionMatrix(pred_glm_full_class, stu_test$pass_fail)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fail pass
## fail 135 10
## pass 11 143
##
## Accuracy : 0.9298
## 95% CI : (0.8946, 0.956)
## No Information Rate : 0.5117
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8594
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9247
## Specificity : 0.9346
## Pos Pred Value : 0.9310
## Neg Pred Value : 0.9286
## Prevalence : 0.4883
## Detection Rate : 0.4515
## Detection Prevalence : 0.4849
## Balanced Accuracy : 0.9296
##
## 'Positive' Class : fail
##
# Precision-Recall curve
glm_full_pr <- pr.curve(
scores.class0 = pred_glm_full_prob[stu_test$pass_fail == "pass"],
scores.class1 = pred_glm_full_prob[stu_test$pass_fail == "fail"],
curve = TRUE
)
plot(glm_full_pr, main = "Precision-Recall Curve - Full Logistic Regression")
# Predicted probability distribution
plot_prob_distribution(pred_glm_full_prob, stu_test$pass_fail,
"Full Logistic Regression")
plot_confusion_heatmap(pred_glm_full_class, stu_test$pass_fail,
"Full Logistic Regression")
plot_actual_vs_predicted(pred_glm_full_class, stu_test$pass_fail,
"Full Logistic Regression")
# 10-fold CV for full logistic regression
cv_glm_full <- train(pass_fail ~ .,
data = stu_train,
method = "glm",
family = "binomial",
trControl = cv_ctrl)
cat("Full GLM - CV Accuracy:", round(mean(cv_glm_full$resample$Accuracy), 4), "\n")
## Full GLM - CV Accuracy: 0.8745
cat("Full GLM - CV SD: ", round(sd(cv_glm_full$resample$Accuracy), 4), "\n")
## Full GLM - CV SD: 0.0183
stu_glm_step <- step(stu_glm_full, direction = "both", trace = 0)
summary(stu_glm_step)
##
## Call:
## glm(formula = pass_fail ~ study_hours_per_day + social_media_hours +
## netflix_hours + attendance_percentage + sleep_hours + exercise_frequency +
## mental_health_rating, family = binomial, data = stu_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.97634 2.33188 -9.424 < 2e-16 ***
## study_hours_per_day 3.27816 0.29011 11.300 < 2e-16 ***
## social_media_hours -1.00647 0.14779 -6.810 9.74e-12 ***
## netflix_hours -0.87977 0.14894 -5.907 3.49e-09 ***
## attendance_percentage 0.05637 0.01471 3.833 0.000127 ***
## sleep_hours 0.68576 0.12376 5.541 3.01e-08 ***
## exercise_frequency 0.52407 0.08470 6.188 6.11e-10 ***
## mental_health_rating 0.70999 0.07145 9.937 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 971.47 on 700 degrees of freedom
## Residual deviance: 327.61 on 693 degrees of freedom
## AIC: 343.61
##
## Number of Fisher Scoring iterations: 7
knitr::kable(vif(stu_glm_step), caption = "VIF - Stepwise Logistic Regression")
| x | |
|---|---|
| study_hours_per_day | 3.019916 |
| social_media_hours | 1.442074 |
| netflix_hours | 1.252041 |
| attendance_percentage | 1.126974 |
| sleep_hours | 1.176234 |
| exercise_frequency | 1.419682 |
| mental_health_rating | 2.177391 |
# Compare which predictors were kept vs removed
cat("Full model predictors: ", length(coef(stu_glm_full)) - 1, "\n")
## Full model predictors: 19
cat("Stepwise model predictors: ", length(coef(stu_glm_step)) - 1, "\n")
## Stepwise model predictors: 7
cat("\nVariables retained by stepwise:\n")
##
## Variables retained by stepwise:
print(names(coef(stu_glm_step))[-1])
## [1] "study_hours_per_day" "social_media_hours" "netflix_hours"
## [4] "attendance_percentage" "sleep_hours" "exercise_frequency"
## [7] "mental_health_rating"
pred_glm_step_prob <- predict(stu_glm_step, newdata = stu_test, type = "response")
pred_glm_step_class <- as.factor(ifelse(pred_glm_step_prob > 0.5, "pass", "fail"))
confusionMatrix(pred_glm_step_class, stu_test$pass_fail)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fail pass
## fail 136 10
## pass 10 143
##
## Accuracy : 0.9331
## 95% CI : (0.8986, 0.9587)
## No Information Rate : 0.5117
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8661
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9315
## Specificity : 0.9346
## Pos Pred Value : 0.9315
## Neg Pred Value : 0.9346
## Prevalence : 0.4883
## Detection Rate : 0.4548
## Detection Prevalence : 0.4883
## Balanced Accuracy : 0.9331
##
## 'Positive' Class : fail
##
glm_step_pr <- pr.curve(
scores.class0 = pred_glm_step_prob[stu_test$pass_fail == "pass"],
scores.class1 = pred_glm_step_prob[stu_test$pass_fail == "fail"],
curve = TRUE
)
plot(glm_step_pr, main = "Precision-Recall Curve - Stepwise Logistic Regression")
plot_prob_distribution(pred_glm_step_prob, stu_test$pass_fail,
"Stepwise Logistic Regression")
plot_confusion_heatmap(pred_glm_step_class, stu_test$pass_fail,
"Stepwise Logistic Regression")
plot_actual_vs_predicted(pred_glm_step_class, stu_test$pass_fail,
"Stepwise Logistic Regression")
# Re-train using the formula selected by step() for CV
step_formula <- formula(stu_glm_step)
cv_glm_step <- train(step_formula,
data = stu_train,
method = "glm",
family = "binomial",
trControl = cv_ctrl)
cat("Stepwise GLM - CV Accuracy:", round(mean(cv_glm_step$resample$Accuracy), 4), "\n")
## Stepwise GLM - CV Accuracy: 0.8902
cat("Stepwise GLM - CV SD: ", round(sd(cv_glm_step$resample$Accuracy), 4), "\n")
## Stepwise GLM - CV SD: 0.0265
Comparing the full and stepwise models tells us which variables are genuinely informative. If stepwise drops a predictor, that variable did not meaningfully improve the model beyond what the others already captured.
# Grow a full decision tree with no depth or complexity restrictions
# method = "class" specifies a classification tree
# The tree may overfit the training data at full depth
stu_dt <- rpart(pass_fail ~ .,
data = stu_train,
method = "class")
rpart.plot(stu_dt,
main = "Unpruned Decision Tree - Pass/Fail",
type = 4,
extra = 101,
cex = 0.8)
The unpruned tree uses study hours as the primary split, followed by mental health and attendance. The fully grown tree captures complex interactions but may overfit the training data.
pred_dt_class <- predict(stu_dt, newdata = stu_test, type = "class")
confusionMatrix(pred_dt_class, stu_test$pass_fail)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fail pass
## fail 116 16
## pass 30 137
##
## Accuracy : 0.8462
## 95% CI : (0.8002, 0.8851)
## No Information Rate : 0.5117
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6915
##
## Mcnemar's Test P-Value : 0.05527
##
## Sensitivity : 0.7945
## Specificity : 0.8954
## Pos Pred Value : 0.8788
## Neg Pred Value : 0.8204
## Prevalence : 0.4883
## Detection Rate : 0.3880
## Detection Prevalence : 0.4415
## Balanced Accuracy : 0.8450
##
## 'Positive' Class : fail
##
plot_confusion_heatmap(pred_dt_class, stu_test$pass_fail, "Unpruned Decision Tree")
plot_actual_vs_predicted(pred_dt_class, stu_test$pass_fail, "Unpruned Decision Tree")
cv_dt <- train(pass_fail ~ .,
data = stu_train,
method = "rpart",
trControl = cv_ctrl)
cat("Unpruned DT - CV Accuracy:", round(mean(cv_dt$resample$Accuracy), 4), "\n")
## Unpruned DT - CV Accuracy: 0.8017
cat("Unpruned DT - CV SD: ", round(sd(cv_dt$resample$Accuracy), 4), "\n")
## Unpruned DT - CV SD: 0.0401
# Pruning removes branches that contribute little predictive value
# cp = 0.05 is a moderate complexity penalty
# Higher cp = simpler tree; helps prevent overfitting
stu_dt_pruned <- prune(stu_dt, cp = 0.05)
rpart.plot(stu_dt_pruned,
main = "Pruned Decision Tree - Pass/Fail",
type = 4,
extra = 101,
cex = 0.8)
The pruned tree is simpler and more interpretable. Study hours per day remains the primary split, with mental health providing additional separation for borderline cases.
pred_dt_pruned_class <- predict(stu_dt_pruned, newdata = stu_test, type = "class")
confusionMatrix(pred_dt_pruned_class, stu_test$pass_fail)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fail pass
## fail 126 29
## pass 20 124
##
## Accuracy : 0.8361
## 95% CI : (0.7892, 0.8762)
## No Information Rate : 0.5117
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6725
##
## Mcnemar's Test P-Value : 0.2531
##
## Sensitivity : 0.8630
## Specificity : 0.8105
## Pos Pred Value : 0.8129
## Neg Pred Value : 0.8611
## Prevalence : 0.4883
## Detection Rate : 0.4214
## Detection Prevalence : 0.5184
## Balanced Accuracy : 0.8367
##
## 'Positive' Class : fail
##
plot_confusion_heatmap(pred_dt_pruned_class, stu_test$pass_fail, "Pruned Decision Tree")
plot_actual_vs_predicted(pred_dt_pruned_class, stu_test$pass_fail,
"Pruned Decision Tree")
cv_dt_pruned <- train(pass_fail ~ .,
data = stu_train,
method = "rpart",
tuneGrid = data.frame(cp = 0.05),
trControl = cv_ctrl)
cat("Pruned DT - CV Accuracy:", round(mean(cv_dt_pruned$resample$Accuracy), 4), "\n")
## Pruned DT - CV Accuracy: 0.7944
cat("Pruned DT - CV SD: ", round(sd(cv_dt_pruned$resample$Accuracy), 4), "\n")
## Pruned DT - CV SD: 0.0796
set.seed(28)
stu_rf <- randomForest(pass_fail ~ .,
data = stu_train,
mtry = floor(sqrt(ncol(stu_train) - 1)),
importance = TRUE)
importance(stu_rf)
## fail pass MeanDecreaseAccuracy
## age 1.37913053 1.4025408 2.0405446
## gender 2.25163044 -0.7196872 0.8980441
## study_hours_per_day 70.31010651 73.7921730 82.9806828
## social_media_hours 7.38137287 7.7539772 10.8356963
## netflix_hours 0.83378523 2.4845748 2.4894018
## part_time_job -1.28495581 0.3576566 -0.7452039
## attendance_percentage 0.90623242 0.3057528 0.8361275
## sleep_hours 4.07127269 4.6228444 6.1489119
## diet_quality -0.06221191 -0.6594717 -0.5225557
## exercise_frequency 2.24196620 3.8793767 4.3274023
## parental_education_level -0.57849560 0.9484400 0.3440079
## internet_quality 0.48630959 0.2089477 0.4525137
## mental_health_rating 24.01641869 25.2029751 33.1073809
## extracurricular_participation -0.73216211 -0.3725739 -0.7416844
## MeanDecreaseGini
## age 13.531935
## gender 5.980069
## study_hours_per_day 137.514727
## social_media_hours 28.304257
## netflix_hours 23.451923
## part_time_job 3.850415
## attendance_percentage 25.370099
## sleep_hours 26.425985
## diet_quality 8.007710
## exercise_frequency 14.655825
## parental_education_level 10.712259
## internet_quality 7.180428
## mental_health_rating 41.269324
## extracurricular_participation 3.185696
# Variable importance plot
# MeanDecreaseAccuracy: how much accuracy drops if this variable is removed
# MeanDecreaseGini: how much this variable reduces node impurity in trees
# Higher values = more important predictor
varImpPlot(stu_rf)
pred_rf_class <- predict(stu_rf, newdata = stu_test)
confusionMatrix(pred_rf_class, stu_test$pass_fail)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fail pass
## fail 131 14
## pass 15 139
##
## Accuracy : 0.903
## 95% CI : (0.8637, 0.9341)
## No Information Rate : 0.5117
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8059
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8973
## Specificity : 0.9085
## Pos Pred Value : 0.9034
## Neg Pred Value : 0.9026
## Prevalence : 0.4883
## Detection Rate : 0.4381
## Detection Prevalence : 0.4849
## Balanced Accuracy : 0.9029
##
## 'Positive' Class : fail
##
plot_confusion_heatmap(pred_rf_class, stu_test$pass_fail, "Random Forest")
plot_actual_vs_predicted(pred_rf_class, stu_test$pass_fail, "Random Forest")
set.seed(28)
cv_rf <- train(pass_fail ~ .,
data = stu_train,
method = "rf",
trControl = cv_ctrl)
cat("Random Forest - CV Accuracy:", round(mean(cv_rf$resample$Accuracy), 4), "\n")
## Random Forest - CV Accuracy: 0.8574
cat("Random Forest - CV SD: ", round(sd(cv_rf$resample$Accuracy), 4), "\n")
## Random Forest - CV SD: 0.045
# Linear SVM finds the optimal straight-line hyperplane separating pass/fail
# probability = TRUE enables probability output (needed for evaluation)
stu_svm_linear <- svm(pass_fail ~ .,
data = stu_train,
kernel = "linear",
probability = TRUE)
pred_svm_linear_class <- predict(stu_svm_linear, newdata = stu_test)
confusionMatrix(pred_svm_linear_class, stu_test$pass_fail)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fail pass
## fail 136 10
## pass 10 143
##
## Accuracy : 0.9331
## 95% CI : (0.8986, 0.9587)
## No Information Rate : 0.5117
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8661
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9315
## Specificity : 0.9346
## Pos Pred Value : 0.9315
## Neg Pred Value : 0.9346
## Prevalence : 0.4883
## Detection Rate : 0.4548
## Detection Prevalence : 0.4883
## Balanced Accuracy : 0.9331
##
## 'Positive' Class : fail
##
plot_confusion_heatmap(pred_svm_linear_class, stu_test$pass_fail, "SVM Linear")
plot_actual_vs_predicted(pred_svm_linear_class, stu_test$pass_fail, "SVM Linear")
set.seed(28)
cv_svm_linear <- train(pass_fail ~ .,
data = stu_train,
method = "svmLinear",
trControl = cv_ctrl)
cat("SVM Linear - CV Accuracy:", round(mean(cv_svm_linear$resample$Accuracy), 4), "\n")
## SVM Linear - CV Accuracy: 0.8788
cat("SVM Linear - CV SD: ", round(sd(cv_svm_linear$resample$Accuracy), 4), "\n")
## SVM Linear - CV SD: 0.0365
# Polynomial kernel (degree 3) fits curved decision boundaries
stu_svm_poly <- svm(pass_fail ~ .,
data = stu_train,
kernel = "polynomial",
degree = 3,
probability = TRUE)
pred_svm_poly_class <- predict(stu_svm_poly, newdata = stu_test)
confusionMatrix(pred_svm_poly_class, stu_test$pass_fail)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fail pass
## fail 131 11
## pass 15 142
##
## Accuracy : 0.913
## 95% CI : (0.8752, 0.9424)
## No Information Rate : 0.5117
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8259
##
## Mcnemar's Test P-Value : 0.5563
##
## Sensitivity : 0.8973
## Specificity : 0.9281
## Pos Pred Value : 0.9225
## Neg Pred Value : 0.9045
## Prevalence : 0.4883
## Detection Rate : 0.4381
## Detection Prevalence : 0.4749
## Balanced Accuracy : 0.9127
##
## 'Positive' Class : fail
##
plot_confusion_heatmap(pred_svm_poly_class, stu_test$pass_fail, "SVM Polynomial")
plot_actual_vs_predicted(pred_svm_poly_class, stu_test$pass_fail, "SVM Polynomial")
set.seed(28)
cv_svm_poly <- train(pass_fail ~ .,
data = stu_train,
method = "svmPoly",
trControl = cv_ctrl,
tuneGrid = data.frame(degree = 3, scale = 1, C = 1))
cat("SVM Poly - CV Accuracy:", round(mean(cv_svm_poly$resample$Accuracy), 4), "\n")
## SVM Poly - CV Accuracy: 0.7917
cat("SVM Poly - CV SD: ", round(sd(cv_svm_poly$resample$Accuracy), 4), "\n")
## SVM Poly - CV SD: 0.0517
# Radial Basis Function (RBF) kernel is the most flexible —
# it can model complex, non-linear decision boundaries
stu_svm_radial <- svm(pass_fail ~ .,
data = stu_train,
kernel = "radial",
probability = TRUE)
pred_svm_radial_class <- predict(stu_svm_radial, newdata = stu_test)
confusionMatrix(pred_svm_radial_class, stu_test$pass_fail)
## Confusion Matrix and Statistics
##
## Reference
## Prediction fail pass
## fail 134 9
## pass 12 144
##
## Accuracy : 0.9298
## 95% CI : (0.8946, 0.956)
## No Information Rate : 0.5117
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8594
##
## Mcnemar's Test P-Value : 0.6625
##
## Sensitivity : 0.9178
## Specificity : 0.9412
## Pos Pred Value : 0.9371
## Neg Pred Value : 0.9231
## Prevalence : 0.4883
## Detection Rate : 0.4482
## Detection Prevalence : 0.4783
## Balanced Accuracy : 0.9295
##
## 'Positive' Class : fail
##
plot_confusion_heatmap(pred_svm_radial_class, stu_test$pass_fail, "SVM Radial")
plot_actual_vs_predicted(pred_svm_radial_class, stu_test$pass_fail, "SVM Radial")
set.seed(28)
cv_svm_radial <- train(pass_fail ~ .,
data = stu_train,
method = "svmRadial",
trControl = cv_ctrl)
cat("SVM Radial - CV Accuracy:", round(mean(cv_svm_radial$resample$Accuracy), 4), "\n")
## SVM Radial - CV Accuracy: 0.8658
cat("SVM Radial - CV SD: ", round(sd(cv_svm_radial$resample$Accuracy), 4), "\n")
## SVM Radial - CV SD: 0.0533
# Build a full comparison table across all 8 models
# Metrics: Test Accuracy, Sensitivity, Specificity, Kappa,
# Balanced Accuracy, and 10-fold CV Accuracy
results_df <- data.frame(
Model = c("GLM Full", "GLM Stepwise", "DT Unpruned", "DT Pruned",
"Random Forest", "SVM Linear", "SVM Polynomial", "SVM Radial"),
rbind(
get_metrics(pred_glm_full_class, stu_test$pass_fail),
get_metrics(pred_glm_step_class, stu_test$pass_fail),
get_metrics(pred_dt_class, stu_test$pass_fail),
get_metrics(pred_dt_pruned_class, stu_test$pass_fail),
get_metrics(pred_rf_class, stu_test$pass_fail),
get_metrics(pred_svm_linear_class, stu_test$pass_fail),
get_metrics(pred_svm_poly_class, stu_test$pass_fail),
get_metrics(pred_svm_radial_class, stu_test$pass_fail)
),
CV_Accuracy = round(c(
mean(cv_glm_full$resample$Accuracy),
mean(cv_glm_step$resample$Accuracy),
mean(cv_dt$resample$Accuracy),
mean(cv_dt_pruned$resample$Accuracy),
mean(cv_rf$resample$Accuracy),
mean(cv_svm_linear$resample$Accuracy),
mean(cv_svm_poly$resample$Accuracy),
mean(cv_svm_radial$resample$Accuracy)
), 4)
)
knitr::kable(results_df, caption = "Model Performance Comparison — All Models")
| Model | Accuracy.Accuracy | Sensitivity.Sensitivity | Specificity.Specificity | Kappa.Kappa | Balanced_Accuracy.Balanced.Accuracy | CV_Accuracy |
|---|---|---|---|---|---|---|
| GLM Full | 0.9298 | 0.9247 | 0.9346 | 0.8594 | 0.9296 | 0.8745 |
| GLM Stepwise | 0.9331 | 0.9315 | 0.9346 | 0.8661 | 0.9331 | 0.8902 |
| DT Unpruned | 0.8462 | 0.7945 | 0.8954 | 0.6915 | 0.8450 | 0.8017 |
| DT Pruned | 0.8361 | 0.8630 | 0.8105 | 0.6725 | 0.8367 | 0.7944 |
| Random Forest | 0.9030 | 0.8973 | 0.9085 | 0.8059 | 0.9029 | 0.8574 |
| SVM Linear | 0.9331 | 0.9315 | 0.9346 | 0.8661 | 0.9331 | 0.8788 |
| SVM Polynomial | 0.9130 | 0.8973 | 0.9281 | 0.8259 | 0.9127 | 0.7917 |
| SVM Radial | 0.9298 | 0.9178 | 0.9412 | 0.8594 | 0.9295 | 0.8658 |
# Bar chart comparing 10-fold CV accuracy across all models
# CV accuracy is more reliable than single test set accuracy because
# it averages performance across 10 different training/validation splits
results_df |>
ggplot(aes(x = reorder(Model, CV_Accuracy), y = CV_Accuracy, fill = Model)) +
geom_bar(stat = "identity", show.legend = FALSE) +
geom_text(aes(label = paste0(round(CV_Accuracy * 100, 1), "%")),
hjust = -0.1, size = 3.5) +
coord_flip() +
scale_y_continuous(limits = c(0, 1.1),
labels = scales::percent_format(accuracy = 1)) +
labs(title = "10-Fold CV Accuracy by Model",
x = "Model",
y = "CV Accuracy") +
theme_minimal(base_size = 12)
This project set out to predict whether a student would pass or fail their exam based on a comprehensive set of lifestyle, health, and demographic variables. Using all available predictors allowed the models to learn from a richer picture of each student’s circumstances.
Key findings from EDA and modeling:
Overall, the results confirm that academic performance can be predicted reasonably well from lifestyle data. Targeted improvements in study habits and mental health support represent the highest-impact areas for students looking to improve their grades.