This document explores a dataset of student grades, performs data pre-processing, exploratory data analysis (EDA), and investigates potential causality between predictors and final grades.
# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(scatterplot3d)
# Load the dataset
df <- read.csv("C:\\Users\\WY Ng\\Desktop\\WY\\Masters\\CLASSES\\2024, 2025 Sem 1\\Programming for Data Science\\Assignment\\student_data.csv")
# View the structure and summary of the dataset
glimpse(df)
## Rows: 395
## Columns: 33
## $ school <chr> "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP",…
## $ sex <chr> "F", "F", "F", "F", "F", "M", "M", "F", "M", "M", "F", "F",…
## $ age <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15, 15, 15,…
## $ address <chr> "U", "U", "U", "U", "U", "U", "U", "U", "U", "U", "U", "U",…
## $ famsize <chr> "GT3", "GT3", "LE3", "GT3", "GT3", "LE3", "LE3", "GT3", "LE…
## $ Pstatus <chr> "A", "T", "T", "T", "T", "T", "T", "A", "A", "T", "T", "T",…
## $ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, 3, 3, 4,…
## $ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, 3, 2, 3,…
## $ Mjob <chr> "at_home", "at_home", "at_home", "health", "other", "servic…
## $ Fjob <chr> "teacher", "other", "other", "services", "other", "other", …
## $ reason <chr> "course", "course", "other", "home", "home", "reputation", …
## $ guardian <chr> "mother", "father", "mother", "mother", "father", "mother",…
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 1, 1,…
## $ studytime <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2, 1, 1,…
## $ failures <int> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,…
## $ schoolsup <chr> "yes", "no", "yes", "no", "no", "no", "no", "yes", "no", "n…
## $ famsup <chr> "no", "yes", "no", "yes", "yes", "yes", "no", "yes", "yes",…
## $ paid <chr> "no", "no", "yes", "yes", "yes", "yes", "no", "no", "yes", …
## $ activities <chr> "no", "no", "no", "yes", "no", "yes", "no", "no", "no", "ye…
## $ nursery <chr> "yes", "no", "yes", "yes", "yes", "yes", "yes", "yes", "yes…
## $ higher <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "ye…
## $ internet <chr> "no", "yes", "yes", "yes", "no", "yes", "yes", "no", "yes",…
## $ romantic <chr> "no", "no", "no", "yes", "no", "no", "no", "no", "no", "no"…
## $ famrel <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, 5, 5, 3,…
## $ freetime <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, 3, 5, 1,…
## $ goout <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 5, 3,…
## $ Dalc <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ Walc <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, 1, 4, 3,…
## $ health <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, 4, 5, 5,…
## $ absences <int> 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0, 4, 6, 4, 16,…
## $ G1 <int> 5, 5, 7, 15, 6, 15, 12, 6, 16, 14, 10, 10, 14, 10, 14, 14, …
## $ G2 <int> 6, 5, 8, 14, 10, 15, 12, 5, 18, 15, 8, 12, 14, 10, 16, 14, …
## $ G3 <int> 6, 6, 10, 15, 10, 15, 11, 6, 19, 15, 9, 12, 14, 11, 16, 14,…
# Part 1: Data Pre-Processing
## 1.1: Check Missing Values and Duplicates
colSums(is.na(df)) # Check for missing values
## school sex age address famsize Pstatus Medu
## 0 0 0 0 0 0 0
## Fedu Mjob Fjob reason guardian traveltime studytime
## 0 0 0 0 0 0 0
## failures schoolsup famsup paid activities nursery higher
## 0 0 0 0 0 0 0
## internet romantic famrel freetime goout Dalc Walc
## 0 0 0 0 0 0 0
## health absences G1 G2 G3
## 0 0 0 0 0
nrow(df) - nrow(unique(df)) # Count duplicate rows
## [1] 0
df <- unique(df) # Remove duplicate rows
## 1.2: Check and Handle Outliers
# Boxplot for absences
boxplot(df$absences, main = "Boxplot of Absences", ylab = "Absences")
# Calculate IQR and cap outliers
Q1 <- quantile(df$absences, 0.25, na.rm = TRUE)
Q3 <- quantile(df$absences, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
df$absences[df$absences < lower_bound] <- lower_bound
df$absences[df$absences > upper_bound] <- upper_bound
# Boxplot after capping outliers
boxplot(df$absences, main = "Boxplot After Capping Outliers", ylab = "Absences")
## 1.3: Encoding and Transformation
# Binary encoding
df$schoolsup <- ifelse(df$schoolsup == "yes", 1, 0)
df$famsup <- ifelse(df$famsup == "yes", 1, 0)
df$paid <- ifelse(df$paid == "yes", 1, 0)
df$activities <- ifelse(df$activities == "yes", 1, 0)
df$nursery <- ifelse(df$nursery == "yes", 1, 0)
df$higher <- ifelse(df$higher == "yes", 1, 0)
df$internet <- ifelse(df$internet == "yes", 1, 0)
df$romantic <- ifelse(df$romantic == "yes", 1, 0)
# One-hot encoding for nominal variables
dummy_vars <- dummyVars("~ school + sex + address + famsize + Pstatus", data = df)
encoded_data <- predict(dummy_vars, newdata = df)
df <- cbind(df, as.data.frame(encoded_data))
# Create binary Pass/Fail column
df$Pass <- ifelse(df$G3 >= 10, "Pass", "Fail")
df$Pass <- as.factor(df$Pass)
df <- df %>% rename(Outcome = Pass)
# Standardize numeric columns
df$absences <- scale(df$absences)
## 1.4: Save Pre-Processed Data
write.csv(df, "cleaned_dataset.csv", row.names = FALSE)
In this section, we will explore the dataset through Exploratory Data Analysis (EDA) to uncover patterns and relationships that influence student performance. We’ll begin with Univariate Analysis to understand individual variables, proceed to Bivariate and Multivariate Analyses to examine relationships between variables, and finally assess causal relationships and underlying mechanisms. This comprehensive analysis will help identify key factors affecting academic outcomes, such as grades, absences, and school environments.
The histogram shows most students have low or negative scaled absences, indicating fewer absences overall. As absences increase, the frequency drops sharply, suggesting only a few students have high absenteeism, which may affect performance.
The bar plot reveals that more students passed than failed. This imbalance highlights that while most students succeed, a notable group still struggles, indicating areas for potential improvement.
## 2.1: Univariate Analysis
# Distribution of absences
ggplot(df, aes(x = absences)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black") +
labs(title = "Distribution of Absences", x = "Absences", y = "Frequency")
# Outcome distribution
ggplot(df, aes(x = Outcome)) +
geom_bar(fill = "green", color = "black") +
labs(title = "Outcome Distribution (Pass/Fail)", x = "Outcome", y = "Count")
This scatterplot shows a strong positive relationship between G1 (First Period Grade) and G3 (Final Grade). The red regression line indicates that higher early grades generally lead to higher final grades, suggesting consistent academic performance.
The boxplot compares absences between students who passed and failed. Students who failed tend to have more absences, with a wider range and more extreme outliers. This suggests that higher absenteeism may negatively impact performance.
## 2.2: Bivariate Analysis
# G1 vs G3
ggplot(df, aes(x = G1, y = G3)) +
geom_point(color = "blue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Scatterplot of G1 vs G3", x = "G1", y = "G3")
## `geom_smooth()` using formula = 'y ~ x'
# Absences vs Outcome (Compare average absences for Pass and Fail groups)
df %>%
group_by(Outcome) %>%
summarise(mean_absences = mean(absences, na.rm = TRUE)) %>%
ggplot(aes(x = Outcome, y = mean_absences, fill = Outcome)) +
geom_bar(stat = "identity") +
labs(title = "Average Absences by Outcome", x = "Outcome", y = "Mean Absences")
This 3D plot visualizes the relationship between grades across all periods. Green points represent students who passed, and red points indicate those who failed. Higher grades in G1 and G2 are associated with passing, highlighting the importance of consistent performance.
This plot shows how the relationship between G1 and G3 varies by school. Both schools (GP and MS) show a positive trend, but slight differences in slope suggest that school environment may influence how early grades impact final results.
## 2.3: Multivariate Analysis
# 3D Scatterplot
scatterplot3d(df$G1, df$G2, df$G3,
pch = 16,
color = ifelse(df$Outcome == "Pass", "green", "red"),
xlab = "G1 (First Period Grade)",
ylab = "G2 (Second Period Grade)",
zlab = "G3 (Final Grade)",
main = "3D Scatterplot: G1, G2, and G3 with Pass/Fail Colors")
# Interaction effects of school on G1 and G3
ggplot(df, aes(x = G1, y = G3, color = school)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Interaction Effect of School on G1 vs G3", x = "G1", y = "G3")
## `geom_smooth()` using formula = 'y ~ x'
This stacked bar chart shows the relationship between family size (famsize) and student outcomes (Outcome). It compares how many students from small families (LE3) and large families (GT3) passed or failed.
# Part 2.4: Comparisons
# Family Size vs. Student Outcome
ggplot(df, aes(x = famsize, fill = Outcome)) +
geom_bar(position = "stack") +
labs(title = "Family Size vs. Student Outcome",
x = "Family Size",
y = "Number of Students") +
scale_fill_manual(values = c("Pass" = "skyblue", "Fail" = "salmon")) +
theme_minimal()
Dark blue: Strong positive correlations (e.g., G1, G2, G3). Dark red: Strong negative correlations. Lighter colors: Weak correlations. Insight: Early grades strongly predict final grades, while absences show weaker negative correlations.
The violin plot shows the distribution of weekly study time for students who passed (blue) and failed (red). Both groups mostly study around 2 hours, but students who passed tend to study slightly more. This suggests that higher study time may improve performance, though other factors could also play a role.
# Part 2.6 Mechanism
# Study Time vs Outcome
ggplot(df, aes(x = Outcome, y = studytime, fill = Outcome)) +
geom_violin(trim = FALSE) +
labs(title = "Study Time vs. Student Outcome",
x = "Student Outcome",
y = "Weekly Study Time (Hours)") +
scale_fill_manual(values = c("Pass" = "skyblue", "Fail" = "salmon")) +
theme_minimal()
The Exploratory Data Analysis (EDA) revealed key factors influencing student performance. Early grades (G1 and G2) strongly predict final outcomes (G3), highlighting the importance of consistent academic effort. Higher absenteeism is linked to failing grades, emphasizing the negative impact of poor attendance. Additionally, school environment and parental education levels also contribute to academic success. While study time slightly differs between passing and failing students, other factors like attendance and support systems play a more significant role. These insights can guide targeted interventions to improve student outcomes.
The project aims to predict students’ final grades (G3) based on factors such as study habits, family background, health, and behavior. The key goal is to identify which factors most influence academic performance and understand their relationship with final grades.
# Part 3: Regression Model with More Features
## 3.1: Load required libraries
library(caret)
library(ggplot2)
## 3.2: Prepare data
# Select all relevant columns for prediction
df <- df %>% select(traveltime, studytime, failures, schoolsup, famsup, paid, activities, internet, romantic,
famrel, freetime, goout, Dalc, Walc, health, absences, higher, G3, Outcome)
# Convert categorical variables to binary if needed (e.g., famsup, schoolsup, paid, internet, romantic, higher)
df$famsup <- ifelse(df$famsup == "yes", 1, 0)
df$schoolsup <- ifelse(df$schoolsup == "yes", 1, 0)
df$paid <- ifelse(df$paid == "yes", 1, 0)
df$activities <- ifelse(df$activities == "yes", 1, 0)
df$internet <- ifelse(df$internet == "yes", 1, 0)
df$romantic <- ifelse(df$romantic == "yes", 1, 0)
df$higher <- ifelse(df$higher == "yes", 1, 0)
## 3.3: Split data into training and testing sets
set.seed(123)
train_index <- createDataPartition(df$G3, p = 0.8, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
## 3.4: Build Linear Regression model using all features
lm_model <- lm(G3 ~ ., data = train_data)
# Summarize the model
summary(lm_model)
##
## Call:
## lm(formula = G3 ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6667 -2.2254 -0.1285 2.4457 6.7535
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.89309 1.19629 4.926 1.38e-06 ***
## traveltime -0.39302 0.23792 -1.652 0.099582 .
## studytime 0.07228 0.19878 0.364 0.716401
## failures -0.65098 0.24672 -2.639 0.008753 **
## schoolsup NA NA NA NA
## famsup NA NA NA NA
## paid NA NA NA NA
## activities NA NA NA NA
## internet NA NA NA NA
## romantic NA NA NA NA
## famrel 0.02722 0.19104 0.142 0.886786
## freetime 0.10723 0.17635 0.608 0.543607
## goout 0.10614 0.17203 0.617 0.537717
## Dalc 0.03812 0.23433 0.163 0.870882
## Walc -0.08123 0.18319 -0.443 0.657776
## health -0.12835 0.11916 -1.077 0.282264
## absences 0.62918 0.16814 3.742 0.000218 ***
## higher NA NA NA NA
## OutcomePass 7.35273 0.37701 19.503 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.899 on 306 degrees of freedom
## Multiple R-squared: 0.6215, Adjusted R-squared: 0.6079
## F-statistic: 45.67 on 11 and 306 DF, p-value: < 2.2e-16
## 3.5: Evaluate model
# Predict on the test data
lm_predictions <- predict(lm_model, newdata = test_data)
# Calculate evaluation metrics
lm_r2 <- cor(test_data$G3, lm_predictions)^2 # R²
lm_mae <- mean(abs(test_data$G3 - lm_predictions)) # Mean Absolute Error
lm_mse <- mean((test_data$G3 - lm_predictions)^2) # Mean Squared Error
# Print the results
cat("R²:", lm_r2, "\nMAE:", lm_mae, "\nMSE:", lm_mse)
## R²: 0.6492638
## MAE: 2.234316
## MSE: 6.847222
## 3.6: compare the coefficients and significance of predictors
# Extract the coefficients and p-values
coef_summary <- summary(lm_model)$coefficients
# Display coefficients and p-values
coef_summary <- coef_summary[, c("Estimate", "Pr(>|t|)")]
coef_summary
## Estimate Pr(>|t|)
## (Intercept) 5.89309128 1.375270e-06
## traveltime -0.39301681 9.958171e-02
## studytime 0.07227843 7.164006e-01
## failures -0.65097819 8.753024e-03
## famrel 0.02722084 8.867861e-01
## freetime 0.10722969 5.436070e-01
## goout 0.10613919 5.377173e-01
## Dalc 0.03811923 8.708815e-01
## Walc -0.08123131 6.577759e-01
## health -0.12835351 2.822642e-01
## absences 0.62918296 2.179018e-04
## OutcomePass 7.35272503 1.283243e-55
failures:
Estimate: -2.1524
P-value: 2.53e-09
Strong negative impact on the final grade (G3). Higher failures lead to
lower grades.
goout:
Estimate: -0.5042
P-value: 0.0471
Moderate negative impact on the final grade (G3). More time spent going
out (socializing) is associated with lower grades.
absences:
Estimate: 0.5162
P-value: 0.0408
Positive impact on final grade (G3). More absences are surprisingly
linked to higher grades, though this requires further interpretation
(could be a data anomaly or context-specific).
traveltime, studytime, famrel, freetime, Dalc, Walc, and health have high p-values, indicating that they don’t significantly impact the final grade based on this model.
Most Influential Factors:
- failures seems to have the largest effect on final grades, with a
large negative coefficient and a very significant p-value.
- goout also plays a significant role, but with a moderate effect on the
grade.
- absences shows a surprising positive relationship with grades, but
more investigation is needed to understand this result.
## 3.7: Visualize the regression results
# Create a data frame for actual vs predicted values
results <- data.frame(
Actual = test_data$G3,
Predicted = lm_predictions
)
# Plot actual vs predicted values
ggplot(results, aes(x = Actual, y = Predicted)) +
geom_point(color = "blue") + # Scatter plot
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") + # Ideal fit line (y=x)
labs(
title = "Linear Regression: Actual vs Predicted Grades",
x = "Actual Grades",
y = "Predicted Grades"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
)
Most of the predicted grades (blue points) align relatively well with the red dashed line ( 𝑦 = 𝑥 y=x), especially for mid-range grades (5–15). There is some spread at lower grades, indicating that the model struggles to predict very low final grades accurately.
The model performs reasonably well but could likely benefit from further fine-tuning or the inclusion of additional features to reduce errors at the extremes. Outliers or underrepresented data points at the lower grade spectrum might skew predictions.\
The project aims to predict whether students will pass or fail their final exams and to develop a model that can accurately classify students as pass or fail based on related factors.
# Part 4: Classification Model (Decision Tree)
## 4.1: Load required libraries
library(rpart)
library(rpart.plot)
library(pROC)
## 4.2 Prepare data
df$Outcome<- as.factor(df$Outcome)
## 4.3 Split data into training and testing sets
set.seed(123)
trainIndex <- createDataPartition(df$Outcome, p = 0.5, list = FALSE)
trainData <- df[trainIndex, ]
testData <- df[-trainIndex, ]
## 4.4 Train the Decision Tree model
treeModel <- rpart(Outcome ~ . - G3 - Outcome, data = trainData, method = "class")
rpart.plot(treeModel, type = 3, extra = 102, fallen.leaves = TRUE, main = "Decision Tree")
## 4.5 Make predictions on the test set
predictedClasses <- predict(treeModel, newdata = testData, type = "class")
predictedProbs <- predict(treeModel, newdata = testData, type = "prob")[, "Pass"]
## 4.6 Evaluate the model
confMatrix <- confusionMatrix(predictedClasses, testData$Outcome)
cat("Accuracy:", confMatrix$overall["Accuracy"], " Precision:", confMatrix$byClass["Precision"], " Recall:", confMatrix$byClass["Recall"], " F1 Score:", confMatrix$byClass["F1"], "\n")
## Accuracy: 0.7106599 Precision: 0.5909091 Recall: 0.4 F1 Score: 0.4770642
## 4.7 Visualize the Confusion Matrix
confusionMatrixTable <- as.table(confMatrix$table)
fourfoldplot(confusionMatrixTable, color = c("red", "green"), main = "Confusion Matrix")
## 4.8 Visualize the ROC Curve
curve <- roc(testData$Outcome, predictedProbs, levels = rev(levels(testData$Outcome)))
plot(curve, col = "darkgreen", lwd = 2, main = "ROC Curve for Decision Tree")
abline(a = 0, b = 1, col = "red", lty = 2)
auc <- auc(curve)
cat("AUC:", auc, "\n")
## AUC: 0.6417832
The model has an accuracy of 71.07%, which indicates that it correctly predicted the outcome for about 71% of the students’ Pass or Fail cases. This is may be due to the class imbalance of the dataset in which it has more ‘Pass’ students than ‘Fail’ students. Based on the Confusion Matrix Analysis, 114 students and 26 students were correctly predicted to pass and to fail, respectively.