rm(list = ls())

HOME WORK: Find a data set of which you can fit multiple linear regression and interpret your results

Installing requiered packages

packages <- c("tidyverse", "ggplot2", "corrplot", "GGally", "car",
              "lmtest", "broom", "patchwork", "scales", "Metrics")
 
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.95 loaded
## 
## Loading required package: carData
## 
## 
## Attaching package: 'car'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## 
## Loading required package: zoo
## 
## 
## Attaching package: 'zoo'
## 
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Fix all conflicts
library(conflicted)
conflict_prefer("filter",  "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag",     "dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.
conflict_prefer("as.Date", "base")
## [conflicted] Will prefer base::as.Date over any other package.
conflict_prefer("select",  "dplyr") 
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("recode", "dplyr")
## [conflicted] Will prefer dplyr::recode over any other package.

#LOAD DATA SET: STUDENT PERFORMANCE

student <- read.csv("Student_Performance.csv")

Data set review

head(student)
##   Hours.Studied Previous.Scores Extracurricular.Activities Sleep.Hours
## 1             7              99                        Yes           9
## 2             4              82                         No           4
## 3             8              51                        Yes           7
## 4             5              52                        Yes           5
## 5             7              75                         No           8
## 6             3              78                         No           9
##   Sample.Question.Papers.Practiced Performance.Index
## 1                                1                91
## 2                                2                65
## 3                                2                45
## 4                                2                36
## 5                                5                66
## 6                                6                61
glimpse(student)
## Rows: 10,000
## Columns: 6
## $ Hours.Studied                    <int> 7, 4, 8, 5, 7, 3, 7, 8, 5, 4, 8, 8, 3…
## $ Previous.Scores                  <int> 99, 82, 51, 52, 75, 78, 73, 45, 77, 8…
## $ Extracurricular.Activities       <chr> "Yes", "No", "Yes", "Yes", "No", "No"…
## $ Sleep.Hours                      <int> 9, 4, 7, 5, 8, 9, 5, 4, 8, 4, 4, 6, 9…
## $ Sample.Question.Papers.Practiced <int> 1, 2, 2, 2, 5, 6, 6, 6, 2, 0, 5, 2, 2…
## $ Performance.Index                <dbl> 91, 65, 45, 36, 66, 61, 63, 42, 61, 6…
str(student)
## 'data.frame':    10000 obs. of  6 variables:
##  $ Hours.Studied                   : int  7 4 8 5 7 3 7 8 5 4 ...
##  $ Previous.Scores                 : int  99 82 51 52 75 78 73 45 77 89 ...
##  $ Extracurricular.Activities      : chr  "Yes" "No" "Yes" "Yes" ...
##  $ Sleep.Hours                     : int  9 4 7 5 8 9 5 4 8 4 ...
##  $ Sample.Question.Papers.Practiced: int  1 2 2 2 5 6 6 6 2 0 ...
##  $ Performance.Index               : num  91 65 45 36 66 61 63 42 61 69 ...
# Summary Statistics
summary(student)
##  Hours.Studied   Previous.Scores Extracurricular.Activities  Sleep.Hours   
##  Min.   :1.000   Min.   :40.00   Length   :10000            Min.   :4.000  
##  1st Qu.:3.000   1st Qu.:54.00   N.unique :    2            1st Qu.:5.000  
##  Median :5.000   Median :69.00   N.blank  :    0            Median :7.000  
##  Mean   :4.993   Mean   :69.45   Min.nchar:    2            Mean   :6.531  
##  3rd Qu.:7.000   3rd Qu.:85.00   Max.nchar:    3            3rd Qu.:8.000  
##  Max.   :9.000   Max.   :99.00                              Max.   :9.000  
##  Sample.Question.Papers.Practiced Performance.Index
##  Min.   :0.000                    Min.   : 10.00   
##  1st Qu.:2.000                    1st Qu.: 40.00   
##  Median :5.000                    Median : 55.00   
##  Mean   :4.583                    Mean   : 55.22   
##  3rd Qu.:7.000                    3rd Qu.: 71.00   
##  Max.   :9.000                    Max.   :100.00
#column names and number of rows

colnames(student)
## [1] "Hours.Studied"                    "Previous.Scores"                 
## [3] "Extracurricular.Activities"       "Sleep.Hours"                     
## [5] "Sample.Question.Papers.Practiced" "Performance.Index"
nrow(student)
## [1] 10000
#Missing values
print(colSums(is.na(student)))
##                    Hours.Studied                  Previous.Scores 
##                                0                                0 
##       Extracurricular.Activities                      Sleep.Hours 
##                                0                                0 
## Sample.Question.Papers.Practiced                Performance.Index 
##                                0                                0

#Data Preparation

# Turning Extracurricular.Activities as binary: Yes=1, No=0
student <- student %>%
  mutate(
    Extra = ifelse(Extracurricular.Activities == "Yes", 1L, 0L)
  )
 
# Rename columns for convenience
names(student) <- c("Hours", "PrevScore", "Extra_Label",
                    "Sleep", "Papers", "PerformanceIndex", "Extra")
 
print(names(student))
## [1] "Hours"            "PrevScore"        "Extra_Label"      "Sleep"           
## [5] "Papers"           "PerformanceIndex" "Extra"

Exploratory analysis

#Distribution of Student Performance Index
performance <- ggplot(student, aes(x = PerformanceIndex)) +
  geom_histogram(binwidth = 3, fill = "#1565C0", colour = "white", alpha = 0.85) +
  geom_vline(xintercept = mean(student$PerformanceIndex),
             colour = "red", linetype = "dashed", linewidth = 1) +
  annotate("text",
           x     = mean(student$PerformanceIndex) + 3,
           y     = 700,
           label = paste0("Mean = ", round(mean(student$PerformanceIndex), 1)),
           colour = "red", size = 3.5) +
  labs(title    = "Distribution of Student Performance Index",
       subtitle = "Target variable – approximately normal",
       x = "Performance Index", y = "Count") +
  theme_minimal(base_size = 12)
print(performance)

#Performance vs Hours Studied
p1 <- ggplot(student, aes(x = Hours,     y = PerformanceIndex)) +
  geom_point(alpha = 0.15, colour = "#1565C0", size = 0.8) +
  geom_smooth(method = "lm", colour = "red", linewidth = 1) +
  labs(title = "Performance vs Hours Studied",
       x = "Hours Studied", y = "Performance Index") +
  theme_minimal(base_size = 10)

#Performance vs Previous Scores 
p2 <- ggplot(student, aes(x = PrevScore, y = PerformanceIndex)) +
  geom_point(alpha = 0.15, colour = "#2E7D32", size = 0.8) +
  geom_smooth(method = "lm", colour = "red", linewidth = 1) +
  labs(title = "Performance vs Previous Scores",
       x = "Previous Scores", y = "Performance Index") +
  theme_minimal(base_size = 10)

#Performance vs Sleep Hours 
p3 <- ggplot(student, aes(x = Sleep,     y = PerformanceIndex)) +
  geom_point(alpha = 0.15, colour = "#6A1B9A", size = 0.8) +
  geom_smooth(method = "lm", colour = "red", linewidth = 1) +
  labs(title = "Performance vs Sleep Hours",
       x = "Sleep Hours", y = "Performance Index") +
  theme_minimal(base_size = 10)

#Performance vs Sample Papers 
p4 <- ggplot(student, aes(x = Papers,    y = PerformanceIndex)) +
  geom_point(alpha = 0.15, colour = "#E65100", size = 0.8) +
  geom_smooth(method = "lm", colour = "red", linewidth = 1) +
  labs(title = "Performance vs Sample Papers",
       x = "Sample Papers Practiced", y = "Performance Index") +
  theme_minimal(base_size = 10)
 
print((p1 + p2) / (p3 + p4))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

#Performance Index by Extracurricular Activities
p_extra <- ggplot(student, aes(x = Extra_Label, y = PerformanceIndex,
                                fill = Extra_Label)) +
  geom_boxplot(alpha = 0.8, width = 0.4, show.legend = FALSE) +
  stat_summary(fun = mean, geom = "point", shape = 18,
               size = 4, colour = "white") +
  scale_fill_manual(values = c("Yes" = "#1565C0", "No" = "#E53935")) +
  labs(title    = "Performance Index by Extracurricular Activities",
       subtitle = "White diamond = group mean",
       x = "Extracurricular Activities", y = "Performance Index") +
  theme_minimal(base_size = 12)
print(p_extra)

#Correlation Matrix

num_vars   <- student %>% select(Hours, PrevScore, Sleep, Papers,
                                  Extra, PerformanceIndex)
cor_matrix <- cor(num_vars, method = "pearson")
 
cat("\n── Pearson Correlation with Performance Index ─────────\n")
## 
## ── Pearson Correlation with Performance Index ─────────
print(round(sort(cor_matrix[, "PerformanceIndex"], decreasing = TRUE), 4))
## PerformanceIndex        PrevScore            Hours            Sleep 
##           1.0000           0.9152           0.3737           0.0481 
##           Papers            Extra 
##           0.0433           0.0245
corrplot(cor_matrix,
         method      = "color",
         type        = "upper",
         addCoef.col = "black",
         tl.col      = "black",
         tl.srt      = 45,
         number.cex  = 0.75,
         col         = colorRampPalette(c("#E53935", "white", "#1565C0"))(200),
         title       = "Correlation Matrix – Student Performance Dataset",
         mar         = c(0, 0, 2, 0))

#Train/ Test split (80/20)

set.seed(10)
train_idx  <- sample(seq_len(nrow(student)), size = floor(0.8 * nrow(student)))
train_data <- student[ train_idx, ]
test_data  <- student[-train_idx, ]
 
cat(sprintf("\n── Train/Test Split ───────────────────────────────\n"))
## 
## ── Train/Test Split ───────────────────────────────
cat(sprintf("  Training set : %d rows (80%%)\n", nrow(train_data)))
##   Training set : 8000 rows (80%)
cat(sprintf("  Test set     : %d rows (20%%)\n", nrow(test_data)))
##   Test set     : 2000 rows (20%)

SIMPLE LINEAR REGRESSION (Baseline: PrevScore only)

cat("  MODEL 1 – SIMPLE LINEAR REGRESSION\n")
##   MODEL 1 – SIMPLE LINEAR REGRESSION
cat("  ------------------------------------\n")
##   ------------------------------------
slr <- lm(PerformanceIndex ~ PrevScore, data = train_data)
print(summary(slr))
## 
## Call:
## lm(formula = PerformanceIndex ~ PrevScore, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2007  -6.4794  -0.1227   6.5011  19.5401 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15.592902   0.358627  -43.48   <2e-16 ***
## PrevScore     1.019496   0.005013  203.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.769 on 7998 degrees of freedom
## Multiple R-squared:  0.838,  Adjusted R-squared:  0.838 
## F-statistic: 4.137e+04 on 1 and 7998 DF,  p-value: < 2.2e-16
slr_r2   <- summary(slr)$r.squared
slr_pred <- predict(slr, newdata = test_data)
slr_rmse <- sqrt(mean((test_data$PerformanceIndex - slr_pred)^2))
slr_mae  <- mean(abs(test_data$PerformanceIndex - slr_pred))
 
cat(sprintf("\n  Test Rsq   : %.4f\n", slr_r2))
## 
##   Test Rsq   : 0.8380
cat(sprintf("  Test RMSE : %.4f\n", slr_rmse))
##   Test RMSE : 7.6443
cat(sprintf("  Test MAE  : %.4f\n", slr_mae))
##   Test MAE  : 6.4911

MULTIPLE LINEAR REGRESSION (Full Model)

cat("  MODEL 2 – MULTIPLE LINEAR REGRESSION\n")
##   MODEL 2 – MULTIPLE LINEAR REGRESSION
cat("---------------------------------------------\n")
## ---------------------------------------------
mlr <- lm(PerformanceIndex ~ Hours + PrevScore + Extra + Sleep + Papers,
          data = train_data)
print(summary(mlr))
## 
## Call:
## lm(formula = PerformanceIndex ~ Hours + PrevScore + Extra + Sleep + 
##     Papers, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6419 -1.3885 -0.0062  1.3684  8.3629 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -34.188929   0.142089 -240.62   <2e-16 ***
## Hours         2.855078   0.008790  324.80   <2e-16 ***
## PrevScore     1.019862   0.001318  773.99   <2e-16 ***
## Extra         0.618327   0.045682   13.54   <2e-16 ***
## Sleep         0.483804   0.013436   36.01   <2e-16 ***
## Papers        0.187353   0.007975   23.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.042 on 7994 degrees of freedom
## Multiple R-squared:  0.9888, Adjusted R-squared:  0.9888 
## F-statistic: 1.413e+05 on 5 and 7994 DF,  p-value: < 2.2e-16

#Test set evaluation

cat("\tModel Evaluation on test set \n")
##  Model Evaluation on test set
cat("----------------------------------------------")
## ----------------------------------------------
mlr_pred <- predict(mlr, newdata = test_data)
mlr_r2   <- 1 - sum((test_data$PerformanceIndex - mlr_pred)^2) /
                 sum((test_data$PerformanceIndex - mean(test_data$PerformanceIndex))^2)
mlr_rmse <- sqrt(mean((test_data$PerformanceIndex - mlr_pred)^2))
mlr_mae  <- mean(abs(test_data$PerformanceIndex - mlr_pred))
 
cat(sprintf("\n  %-35s  SLR (PrevScore)   MLR (all 5)\n", "Metric"))
## 
##   Metric                               SLR (PrevScore)   MLR (all 5)
cat(rep("-", 65), "\n", sep = "")
## -----------------------------------------------------------------
cat(sprintf("  %-35s  %-16.4f  %.4f\n", "Test Rsq",   slr_r2,   mlr_r2))
##   Test Rsq                             0.8380            0.9885
cat(sprintf("  %-35s  %-16.4f  %.4f\n", "Test RMSE", slr_rmse, mlr_rmse))
##   Test RMSE                            7.6443            2.0240
cat(sprintf("  %-35s  %-16.4f  %.4f\n", "Test MAE",  slr_mae,  mlr_mae))
##   Test MAE                             6.4911            1.5846
cat(rep("-", 65), "\n", sep = "")
## -----------------------------------------------------------------
cat("\n  Interpretation:\n")
## 
##   Interpretation:
cat("  → MLR explains more variance (higher Rsq) and makes smaller errors\n")
##   → MLR explains more variance (higher Rsq) and makes smaller errors
cat("    (lower RMSE and MAE) than the simple model using Previous Scores alone.\n")
##     (lower RMSE and MAE) than the simple model using Previous Scores alone.
cat("  → Each additional predictor adds genuine explanatory power.\n")
##   → Each additional predictor adds genuine explanatory power.

#Visualization

#Actual vs Predicted

test_results <- data.frame(
  Actual    = test_data$PerformanceIndex,
  Predicted = mlr_pred,
  Residual  = test_data$PerformanceIndex - mlr_pred
)
 
p_avp <- ggplot(test_results, aes(x = Actual, y = Predicted)) +
  geom_point(aes(colour = abs(Residual)), alpha = 0.4, size = 1.2) +
  geom_abline(slope = 1, intercept = 0, colour = "red",
              linetype = "dashed", linewidth = 1) +
  scale_colour_gradient(low = "#A5D6A7", high = "#B71C1C",
                        name = "|Residual|") +
  labs(title    = "Actual vs Predicted – Performance Index (Test Set)",
       subtitle  = sprintf("MLR  |  Rsq = %.4f  |  RMSE = %.2f  |  MAE = %.2f",
                            mlr_r2, mlr_rmse, mlr_mae),
       x = "Actual Performance Index",
       y = "Predicted Performance Index") +
  theme_minimal(base_size = 12)
print(p_avp)

#Residual Distribution
p_resid_hist <- ggplot(test_results, aes(x = Residual)) +
  geom_histogram(binwidth = 1, fill = "#1565C0", colour = "white", alpha = 0.8) +
  geom_vline(xintercept = 0, colour = "red", linetype = "dashed", linewidth = 1) +
  labs(title    = "Distribution of Residuals (Test Set)",
       subtitle = "Should be centred at zero and approximately normal",
       x = "Residual (Actual – Predicted)", y = "Count") +
  theme_minimal(base_size = 12)
print(p_resid_hist)

#Coefficient Plot
coef_plot_df <- tidy(mlr, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term = recode(term,
                  Hours     = "Hours Studied",
                  PrevScore = "Previous Scores",
                  Extra     = "Extracurricular (Yes)",
                  Sleep     = "Sleep Hours",
                  Papers    = "Sample Papers"),
    Significant = ifelse(p.value < 0.05, "p < 0.05", "p ≥ 0.05")
  )
 
p_coef <- ggplot(coef_plot_df,
                 aes(x = reorder(term, estimate), y = estimate,
                     colour = Significant)) +
  geom_point(size = 4) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2, linewidth = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey50") +
  coord_flip() +
  scale_colour_manual(values = c("p < 0.05" = "#1565C0", "p ≥ 0.05" = "#B0BEC5")) +
  labs(title    = "MLR Coefficient Plot with 95% Confidence Intervals",
       subtitle = "Blue = statistically significant (p < 0.05)",
       x = NULL, y = "Coefficient Estimate",
       colour = "Significance") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "top")
print(p_coef)

#Key Findings

cat("--------------------Key Findings------------------------\n\n")
## --------------------Key Findings------------------------
cat("1.\tPREVIOUS SCORES is by far the strongest predictor.\n")
## 1.   PREVIOUS SCORES is by far the strongest predictor.
cat("\tStudents with higher past performance consistently.\n")
##  Students with higher past performance consistently.
cat("\tscore higher — prior knowledge is the biggest driver.\n\n")
##  score higher — prior knowledge is the biggest driver.
cat("2.\tHOURS STUDIED has a strong positive effect.\n")
## 2.   HOURS STUDIED has a strong positive effect.
cat("\tEvery extra hour of study lifts the index significantly.\n")
##  Every extra hour of study lifts the index significantly.
cat("\tPractical implication: encourage consistent study habits.\n\n")
##  Practical implication: encourage consistent study habits.
cat("3.\tEXTRACURRICULAR ACTIVITIES has a positive effect.\n")
## 3.   EXTRACURRICULAR ACTIVITIES has a positive effect.
cat("\tParticipation correlates with higher performance,\n")
##  Participation correlates with higher performance,
cat("\tpossibly because active students are better organised\n")
##  possibly because active students are better organised
cat("\tand motivated overall.\n\n")
##  and motivated overall.
cat("4.\tSAMPLE PAPERS PRACTICED has a positive effect.\n")
## 4.   SAMPLE PAPERS PRACTICED has a positive effect.
cat("\tPractice papers help students prepare for exam format,\n")
##  Practice papers help students prepare for exam format,
cat("\treducing test anxiety and improving retrieval.\n\n")
##  reducing test anxiety and improving retrieval.
cat("5.\tSLEEP HOURS has a modest positive effect.\n")
## 5.   SLEEP HOURS has a modest positive effect.
cat("\tAdequate sleep supports memory consolidation and\n")
##  Adequate sleep supports memory consolidation and
cat("\tconcentration — important but smaller than study habits.\n")
##  concentration — important but smaller than study habits.