rm(list = ls())
packages <- c("tidyverse", "ggplot2", "corrplot", "GGally", "car",
"lmtest", "broom", "patchwork", "scales", "Metrics")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.95 loaded
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## Loading required package: zoo
##
##
## Attaching package: 'zoo'
##
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
# Fix all conflicts
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag", "dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.
conflict_prefer("as.Date", "base")
## [conflicted] Will prefer base::as.Date over any other package.
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("recode", "dplyr")
## [conflicted] Will prefer dplyr::recode over any other package.
#LOAD DATA SET: STUDENT PERFORMANCE
student <- read.csv("Student_Performance.csv")
head(student)
## Hours.Studied Previous.Scores Extracurricular.Activities Sleep.Hours
## 1 7 99 Yes 9
## 2 4 82 No 4
## 3 8 51 Yes 7
## 4 5 52 Yes 5
## 5 7 75 No 8
## 6 3 78 No 9
## Sample.Question.Papers.Practiced Performance.Index
## 1 1 91
## 2 2 65
## 3 2 45
## 4 2 36
## 5 5 66
## 6 6 61
glimpse(student)
## Rows: 10,000
## Columns: 6
## $ Hours.Studied <int> 7, 4, 8, 5, 7, 3, 7, 8, 5, 4, 8, 8, 3…
## $ Previous.Scores <int> 99, 82, 51, 52, 75, 78, 73, 45, 77, 8…
## $ Extracurricular.Activities <chr> "Yes", "No", "Yes", "Yes", "No", "No"…
## $ Sleep.Hours <int> 9, 4, 7, 5, 8, 9, 5, 4, 8, 4, 4, 6, 9…
## $ Sample.Question.Papers.Practiced <int> 1, 2, 2, 2, 5, 6, 6, 6, 2, 0, 5, 2, 2…
## $ Performance.Index <dbl> 91, 65, 45, 36, 66, 61, 63, 42, 61, 6…
str(student)
## 'data.frame': 10000 obs. of 6 variables:
## $ Hours.Studied : int 7 4 8 5 7 3 7 8 5 4 ...
## $ Previous.Scores : int 99 82 51 52 75 78 73 45 77 89 ...
## $ Extracurricular.Activities : chr "Yes" "No" "Yes" "Yes" ...
## $ Sleep.Hours : int 9 4 7 5 8 9 5 4 8 4 ...
## $ Sample.Question.Papers.Practiced: int 1 2 2 2 5 6 6 6 2 0 ...
## $ Performance.Index : num 91 65 45 36 66 61 63 42 61 69 ...
# Summary Statistics
summary(student)
## Hours.Studied Previous.Scores Extracurricular.Activities Sleep.Hours
## Min. :1.000 Min. :40.00 Length :10000 Min. :4.000
## 1st Qu.:3.000 1st Qu.:54.00 N.unique : 2 1st Qu.:5.000
## Median :5.000 Median :69.00 N.blank : 0 Median :7.000
## Mean :4.993 Mean :69.45 Min.nchar: 2 Mean :6.531
## 3rd Qu.:7.000 3rd Qu.:85.00 Max.nchar: 3 3rd Qu.:8.000
## Max. :9.000 Max. :99.00 Max. :9.000
## Sample.Question.Papers.Practiced Performance.Index
## Min. :0.000 Min. : 10.00
## 1st Qu.:2.000 1st Qu.: 40.00
## Median :5.000 Median : 55.00
## Mean :4.583 Mean : 55.22
## 3rd Qu.:7.000 3rd Qu.: 71.00
## Max. :9.000 Max. :100.00
#column names and number of rows
colnames(student)
## [1] "Hours.Studied" "Previous.Scores"
## [3] "Extracurricular.Activities" "Sleep.Hours"
## [5] "Sample.Question.Papers.Practiced" "Performance.Index"
nrow(student)
## [1] 10000
#Missing values
print(colSums(is.na(student)))
## Hours.Studied Previous.Scores
## 0 0
## Extracurricular.Activities Sleep.Hours
## 0 0
## Sample.Question.Papers.Practiced Performance.Index
## 0 0
#Data Preparation
# Turning Extracurricular.Activities as binary: Yes=1, No=0
student <- student %>%
mutate(
Extra = ifelse(Extracurricular.Activities == "Yes", 1L, 0L)
)
# Rename columns for convenience
names(student) <- c("Hours", "PrevScore", "Extra_Label",
"Sleep", "Papers", "PerformanceIndex", "Extra")
print(names(student))
## [1] "Hours" "PrevScore" "Extra_Label" "Sleep"
## [5] "Papers" "PerformanceIndex" "Extra"
#Distribution of Student Performance Index
performance <- ggplot(student, aes(x = PerformanceIndex)) +
geom_histogram(binwidth = 3, fill = "#1565C0", colour = "white", alpha = 0.85) +
geom_vline(xintercept = mean(student$PerformanceIndex),
colour = "red", linetype = "dashed", linewidth = 1) +
annotate("text",
x = mean(student$PerformanceIndex) + 3,
y = 700,
label = paste0("Mean = ", round(mean(student$PerformanceIndex), 1)),
colour = "red", size = 3.5) +
labs(title = "Distribution of Student Performance Index",
subtitle = "Target variable – approximately normal",
x = "Performance Index", y = "Count") +
theme_minimal(base_size = 12)
print(performance)
#Performance vs Hours Studied
p1 <- ggplot(student, aes(x = Hours, y = PerformanceIndex)) +
geom_point(alpha = 0.15, colour = "#1565C0", size = 0.8) +
geom_smooth(method = "lm", colour = "red", linewidth = 1) +
labs(title = "Performance vs Hours Studied",
x = "Hours Studied", y = "Performance Index") +
theme_minimal(base_size = 10)
#Performance vs Previous Scores
p2 <- ggplot(student, aes(x = PrevScore, y = PerformanceIndex)) +
geom_point(alpha = 0.15, colour = "#2E7D32", size = 0.8) +
geom_smooth(method = "lm", colour = "red", linewidth = 1) +
labs(title = "Performance vs Previous Scores",
x = "Previous Scores", y = "Performance Index") +
theme_minimal(base_size = 10)
#Performance vs Sleep Hours
p3 <- ggplot(student, aes(x = Sleep, y = PerformanceIndex)) +
geom_point(alpha = 0.15, colour = "#6A1B9A", size = 0.8) +
geom_smooth(method = "lm", colour = "red", linewidth = 1) +
labs(title = "Performance vs Sleep Hours",
x = "Sleep Hours", y = "Performance Index") +
theme_minimal(base_size = 10)
#Performance vs Sample Papers
p4 <- ggplot(student, aes(x = Papers, y = PerformanceIndex)) +
geom_point(alpha = 0.15, colour = "#E65100", size = 0.8) +
geom_smooth(method = "lm", colour = "red", linewidth = 1) +
labs(title = "Performance vs Sample Papers",
x = "Sample Papers Practiced", y = "Performance Index") +
theme_minimal(base_size = 10)
print((p1 + p2) / (p3 + p4))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#Performance Index by Extracurricular Activities
p_extra <- ggplot(student, aes(x = Extra_Label, y = PerformanceIndex,
fill = Extra_Label)) +
geom_boxplot(alpha = 0.8, width = 0.4, show.legend = FALSE) +
stat_summary(fun = mean, geom = "point", shape = 18,
size = 4, colour = "white") +
scale_fill_manual(values = c("Yes" = "#1565C0", "No" = "#E53935")) +
labs(title = "Performance Index by Extracurricular Activities",
subtitle = "White diamond = group mean",
x = "Extracurricular Activities", y = "Performance Index") +
theme_minimal(base_size = 12)
print(p_extra)
#Correlation Matrix
num_vars <- student %>% select(Hours, PrevScore, Sleep, Papers,
Extra, PerformanceIndex)
cor_matrix <- cor(num_vars, method = "pearson")
cat("\n── Pearson Correlation with Performance Index ─────────\n")
##
## ── Pearson Correlation with Performance Index ─────────
print(round(sort(cor_matrix[, "PerformanceIndex"], decreasing = TRUE), 4))
## PerformanceIndex PrevScore Hours Sleep
## 1.0000 0.9152 0.3737 0.0481
## Papers Extra
## 0.0433 0.0245
corrplot(cor_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
number.cex = 0.75,
col = colorRampPalette(c("#E53935", "white", "#1565C0"))(200),
title = "Correlation Matrix – Student Performance Dataset",
mar = c(0, 0, 2, 0))
#Train/ Test split (80/20)
set.seed(10)
train_idx <- sample(seq_len(nrow(student)), size = floor(0.8 * nrow(student)))
train_data <- student[ train_idx, ]
test_data <- student[-train_idx, ]
cat(sprintf("\n── Train/Test Split ───────────────────────────────\n"))
##
## ── Train/Test Split ───────────────────────────────
cat(sprintf(" Training set : %d rows (80%%)\n", nrow(train_data)))
## Training set : 8000 rows (80%)
cat(sprintf(" Test set : %d rows (20%%)\n", nrow(test_data)))
## Test set : 2000 rows (20%)
cat(" MODEL 1 – SIMPLE LINEAR REGRESSION\n")
## MODEL 1 – SIMPLE LINEAR REGRESSION
cat(" ------------------------------------\n")
## ------------------------------------
slr <- lm(PerformanceIndex ~ PrevScore, data = train_data)
print(summary(slr))
##
## Call:
## lm(formula = PerformanceIndex ~ PrevScore, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2007 -6.4794 -0.1227 6.5011 19.5401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.592902 0.358627 -43.48 <2e-16 ***
## PrevScore 1.019496 0.005013 203.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.769 on 7998 degrees of freedom
## Multiple R-squared: 0.838, Adjusted R-squared: 0.838
## F-statistic: 4.137e+04 on 1 and 7998 DF, p-value: < 2.2e-16
slr_r2 <- summary(slr)$r.squared
slr_pred <- predict(slr, newdata = test_data)
slr_rmse <- sqrt(mean((test_data$PerformanceIndex - slr_pred)^2))
slr_mae <- mean(abs(test_data$PerformanceIndex - slr_pred))
cat(sprintf("\n Test Rsq : %.4f\n", slr_r2))
##
## Test Rsq : 0.8380
cat(sprintf(" Test RMSE : %.4f\n", slr_rmse))
## Test RMSE : 7.6443
cat(sprintf(" Test MAE : %.4f\n", slr_mae))
## Test MAE : 6.4911
cat(" MODEL 2 – MULTIPLE LINEAR REGRESSION\n")
## MODEL 2 – MULTIPLE LINEAR REGRESSION
cat("---------------------------------------------\n")
## ---------------------------------------------
mlr <- lm(PerformanceIndex ~ Hours + PrevScore + Extra + Sleep + Papers,
data = train_data)
print(summary(mlr))
##
## Call:
## lm(formula = PerformanceIndex ~ Hours + PrevScore + Extra + Sleep +
## Papers, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6419 -1.3885 -0.0062 1.3684 8.3629
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.188929 0.142089 -240.62 <2e-16 ***
## Hours 2.855078 0.008790 324.80 <2e-16 ***
## PrevScore 1.019862 0.001318 773.99 <2e-16 ***
## Extra 0.618327 0.045682 13.54 <2e-16 ***
## Sleep 0.483804 0.013436 36.01 <2e-16 ***
## Papers 0.187353 0.007975 23.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.042 on 7994 degrees of freedom
## Multiple R-squared: 0.9888, Adjusted R-squared: 0.9888
## F-statistic: 1.413e+05 on 5 and 7994 DF, p-value: < 2.2e-16
#Test set evaluation
cat("\tModel Evaluation on test set \n")
## Model Evaluation on test set
cat("----------------------------------------------")
## ----------------------------------------------
mlr_pred <- predict(mlr, newdata = test_data)
mlr_r2 <- 1 - sum((test_data$PerformanceIndex - mlr_pred)^2) /
sum((test_data$PerformanceIndex - mean(test_data$PerformanceIndex))^2)
mlr_rmse <- sqrt(mean((test_data$PerformanceIndex - mlr_pred)^2))
mlr_mae <- mean(abs(test_data$PerformanceIndex - mlr_pred))
cat(sprintf("\n %-35s SLR (PrevScore) MLR (all 5)\n", "Metric"))
##
## Metric SLR (PrevScore) MLR (all 5)
cat(rep("-", 65), "\n", sep = "")
## -----------------------------------------------------------------
cat(sprintf(" %-35s %-16.4f %.4f\n", "Test Rsq", slr_r2, mlr_r2))
## Test Rsq 0.8380 0.9885
cat(sprintf(" %-35s %-16.4f %.4f\n", "Test RMSE", slr_rmse, mlr_rmse))
## Test RMSE 7.6443 2.0240
cat(sprintf(" %-35s %-16.4f %.4f\n", "Test MAE", slr_mae, mlr_mae))
## Test MAE 6.4911 1.5846
cat(rep("-", 65), "\n", sep = "")
## -----------------------------------------------------------------
cat("\n Interpretation:\n")
##
## Interpretation:
cat(" → MLR explains more variance (higher Rsq) and makes smaller errors\n")
## → MLR explains more variance (higher Rsq) and makes smaller errors
cat(" (lower RMSE and MAE) than the simple model using Previous Scores alone.\n")
## (lower RMSE and MAE) than the simple model using Previous Scores alone.
cat(" → Each additional predictor adds genuine explanatory power.\n")
## → Each additional predictor adds genuine explanatory power.
#Visualization
#Actual vs Predicted
test_results <- data.frame(
Actual = test_data$PerformanceIndex,
Predicted = mlr_pred,
Residual = test_data$PerformanceIndex - mlr_pred
)
p_avp <- ggplot(test_results, aes(x = Actual, y = Predicted)) +
geom_point(aes(colour = abs(Residual)), alpha = 0.4, size = 1.2) +
geom_abline(slope = 1, intercept = 0, colour = "red",
linetype = "dashed", linewidth = 1) +
scale_colour_gradient(low = "#A5D6A7", high = "#B71C1C",
name = "|Residual|") +
labs(title = "Actual vs Predicted – Performance Index (Test Set)",
subtitle = sprintf("MLR | Rsq = %.4f | RMSE = %.2f | MAE = %.2f",
mlr_r2, mlr_rmse, mlr_mae),
x = "Actual Performance Index",
y = "Predicted Performance Index") +
theme_minimal(base_size = 12)
print(p_avp)
#Residual Distribution
p_resid_hist <- ggplot(test_results, aes(x = Residual)) +
geom_histogram(binwidth = 1, fill = "#1565C0", colour = "white", alpha = 0.8) +
geom_vline(xintercept = 0, colour = "red", linetype = "dashed", linewidth = 1) +
labs(title = "Distribution of Residuals (Test Set)",
subtitle = "Should be centred at zero and approximately normal",
x = "Residual (Actual – Predicted)", y = "Count") +
theme_minimal(base_size = 12)
print(p_resid_hist)
#Coefficient Plot
coef_plot_df <- tidy(mlr, conf.int = TRUE) %>%
filter(term != "(Intercept)") %>%
mutate(
term = recode(term,
Hours = "Hours Studied",
PrevScore = "Previous Scores",
Extra = "Extracurricular (Yes)",
Sleep = "Sleep Hours",
Papers = "Sample Papers"),
Significant = ifelse(p.value < 0.05, "p < 0.05", "p ≥ 0.05")
)
p_coef <- ggplot(coef_plot_df,
aes(x = reorder(term, estimate), y = estimate,
colour = Significant)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2, linewidth = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey50") +
coord_flip() +
scale_colour_manual(values = c("p < 0.05" = "#1565C0", "p ≥ 0.05" = "#B0BEC5")) +
labs(title = "MLR Coefficient Plot with 95% Confidence Intervals",
subtitle = "Blue = statistically significant (p < 0.05)",
x = NULL, y = "Coefficient Estimate",
colour = "Significance") +
theme_minimal(base_size = 12) +
theme(legend.position = "top")
print(p_coef)
#Key Findings
cat("--------------------Key Findings------------------------\n\n")
## --------------------Key Findings------------------------
cat("1.\tPREVIOUS SCORES is by far the strongest predictor.\n")
## 1. PREVIOUS SCORES is by far the strongest predictor.
cat("\tStudents with higher past performance consistently.\n")
## Students with higher past performance consistently.
cat("\tscore higher — prior knowledge is the biggest driver.\n\n")
## score higher — prior knowledge is the biggest driver.
cat("2.\tHOURS STUDIED has a strong positive effect.\n")
## 2. HOURS STUDIED has a strong positive effect.
cat("\tEvery extra hour of study lifts the index significantly.\n")
## Every extra hour of study lifts the index significantly.
cat("\tPractical implication: encourage consistent study habits.\n\n")
## Practical implication: encourage consistent study habits.
cat("3.\tEXTRACURRICULAR ACTIVITIES has a positive effect.\n")
## 3. EXTRACURRICULAR ACTIVITIES has a positive effect.
cat("\tParticipation correlates with higher performance,\n")
## Participation correlates with higher performance,
cat("\tpossibly because active students are better organised\n")
## possibly because active students are better organised
cat("\tand motivated overall.\n\n")
## and motivated overall.
cat("4.\tSAMPLE PAPERS PRACTICED has a positive effect.\n")
## 4. SAMPLE PAPERS PRACTICED has a positive effect.
cat("\tPractice papers help students prepare for exam format,\n")
## Practice papers help students prepare for exam format,
cat("\treducing test anxiety and improving retrieval.\n\n")
## reducing test anxiety and improving retrieval.
cat("5.\tSLEEP HOURS has a modest positive effect.\n")
## 5. SLEEP HOURS has a modest positive effect.
cat("\tAdequate sleep supports memory consolidation and\n")
## Adequate sleep supports memory consolidation and
cat("\tconcentration — important but smaller than study habits.\n")
## concentration — important but smaller than study habits.