Step 1: Download and Prepare Your Data

1.1 Download the Actual Data

Since I can’t access your Moodle directly, you need to:

1. Log into your university Moodle.
2. Download the student.txt file.
3. Save it in your project folder (the same location as your R Markdown file).

OR use the UCI dataset directly with this code:

# OPTION A: If using the actual student.txt file from Moodle
# student_data <- read.table("student.txt", header = TRUE, sep = ";")

# OPTION B: Download from UCI (recommended for practice)
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip"
# The archive is binary: request a binary transfer explicitly so the zip is
# never altered by text-mode line-ending translation (relevant on Windows).
download.file(url, "student.zip", mode = "wb")
unzip("student.zip")

# Load both extracted CSV files (semicolon-separated, with a header row)
math_data <- read.table("student-mat.csv", sep = ";", header = TRUE)
por_data <- read.table("student-por.csv", sep = ";", header = TRUE)

# Stack the two tables (math rows first) and keep the first row for each
# distinct combination of the 30 non-grade columns listed below;
# .keep_all = TRUE retains the remaining columns (G1, G2, G3) of that row.
# NOTE(review): this was previously described as following the dataset
# instructions. The merge script shipped with the UCI archive reportedly
# matches students on a much shorter key, so rows for the same student that
# differ in any listed column (e.g. paid, failures, absences) are both kept
# here -- confirm that this key is what is intended.
student_data <- bind_rows(math_data, por_data) %>%
  distinct(school, sex, age, address, famsize, Pstatus, Medu, Fedu,
           Mjob, Fjob, reason, guardian, traveltime, studytime,
           failures, schoolsup, famsup, paid, activities, nursery,
           higher, internet, romantic, famrel, freetime, goout,
           Dalc, Walc, health, absences, .keep_all = TRUE)

# Confirm the load and report the dimensions of the combined data set.
writeLines("Dataset loaded successfully!")
## Dataset loaded successfully!
cat(sprintf("Total students: %d \n", nrow(student_data)))
## Total students: 1005
cat(sprintf("Variables: %d \n", ncol(student_data)))
## Variables: 33

1.2 Create Target Variable

# Binary target derived from the final grade: G3 of 10 or more is "Pass",
# anything lower is "Fail". "Fail" is the first factor level.
student_data$performance <- factor(
  student_data$G3 >= 10,
  levels = c(FALSE, TRUE),
  labels = c("Fail", "Pass")
)

# Class balance: absolute counts, then percentages
cat("Performance Distribution:\n")
## Performance Distribution:
print(table(student_data$performance))
## 
## Fail Pass 
##  229  776
cat("\nPercentage:\n")
## 
## Percentage:
print(100 * prop.table(table(student_data$performance)))
## 
##     Fail     Pass 
## 22.78607 77.21393
# Bar chart of the two classes, one bar (and fill colour) per class
ggplot(student_data, aes(x = performance, fill = performance)) +
  geom_bar() +
  ggtitle("Distribution of Pass vs Fail") +
  xlab("Final Result") +
  ylab("Number of Students") +
  theme_minimal()

Step 2: Split Data into Training & Testing

# Fix the random number generator so the partition below is reproducible
set.seed(123)

# Draw 70% of the rows for training, partitioning on the target column;
# the remaining rows form the test set.
train_index <- createDataPartition(
  y = student_data$performance,
  p = 0.7,
  list = FALSE
)

train_data <- student_data[train_index, ]
test_data <- student_data[-train_index, ]

cat(sprintf("Training set size: %d \n", nrow(train_data)))
## Training set size: 705
cat(sprintf("Testing set size: %d \n", nrow(test_data)))
## Testing set size: 300
# Keep on-disk copies of both partitions (no row-name column)
write.csv(train_data, file = "train_data.csv", row.names = FALSE)
write.csv(test_data, file = "test_data.csv", row.names = FALSE)

Step 3: Build and Visualize Decision Tree

# Build decision tree model.
# G3 is excluded from the predictors: `performance` is computed directly from
# G3 (Pass when G3 >= 10), so leaving it in lets the tree reproduce the label
# with a single split on G3 and report a meaningless 100% accuracy (target
# leakage). G1 and G2 (earlier period grades) remain available as predictors.
# NOTE: the outputs recorded further down in this document were produced with
# the earlier `performance ~ .` formula; re-run the document to refresh them.
tree_model <- rpart(performance ~ . - G3,
                    data = train_data,
                    method = "class",
                    control = rpart.control(
                      minsplit = 20,    # Min. observations in a node to try a split
                      minbucket = 7,    # Min. observations in any leaf
                      cp = 0.01,        # Complexity parameter
                      maxdepth = 4      # Max. tree depth
                    ))

# Print the complexity-parameter table of the fitted tree
writeLines("Decision Tree Summary:")
## Decision Tree Summary:
printcp(tree_model)
## 
## Classification tree:
## rpart(formula = performance ~ ., data = train_data, method = "class", 
##     control = rpart.control(minsplit = 20, minbucket = 7, cp = 0.01, 
##         maxdepth = 4))
## 
## Variables actually used in tree construction:
## [1] G3
## 
## Root node error: 161/705 = 0.22837
## 
## n= 705 
## 
##     CP nsplit rel error xerror    xstd
## 1 1.00      0         1      1 0.06923
## 2 0.01      1         0      0 0.00000
# Draw the fitted tree
rpart.plot(
  tree_model,
  main = "Decision Tree: Predicting Student Performance",
  type = 4,
  extra = 104,
  fallen.leaves = TRUE,
  box.palette = "GnBu",
  tweak = 1.1
)

Step 4: Interpret the Tree

# Examine one leaf node
cat("\n=== Analyzing Leaf Nodes ===\n")
## 
## === Analyzing Leaf Nodes ===
print(tree_model)
## n= 705 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 705 161 Pass (0.2283688 0.7716312)  
##   2) G3< 9.5 161   0 Fail (1.0000000 0.0000000) *
##   3) G3>=9.5 544   0 Pass (0.0000000 1.0000000) *
# Make a specific prediction for explanation: a one-row data frame holding
# the values we want to talk about.
example_student <- data.frame(
  G2 = 12,           # Second period grade
  failures = 0,      # No past failures
  G1 = 11,           # First period grade
  Medu = 3,          # Mother's education (secondary)
  higher = "yes",    # Wants higher education
  studytime = 2,     # Studies 2-5 hours weekly
  absences = 5,      # Moderate absences
  stringsAsFactors = TRUE
)

# Fill every other predictor with a typical training value so predict() finds
# all the columns it needs.
# NOTE(review): G3 is among the filled columns, so a model that splits on G3
# classifies this student by the training median of G3, not by the G1/G2
# values set above.
for (col in setdiff(names(train_data),
                    c(names(example_student), "performance"))) {
  column <- train_data[[col]]
  if (is.factor(column) || is.character(column)) {
    # Categorical column: use the most frequent value. Character columns are
    # handled here too, because read.table() keeps strings as character by
    # default in R >= 4.0 and median() is not meaningful for them.
    example_student[[col]] <- names(sort(table(column), decreasing = TRUE))[1]
  } else {
    # Numeric column: use the median
    example_student[[col]] <- median(column, na.rm = TRUE)
  }
}

# Make prediction: predicted class and per-class probabilities
pred <- predict(tree_model, example_student, type = "class")
prob <- predict(tree_model, example_student, type = "prob")

cat("\n=== Example Prediction ===\n")
## 
## === Example Prediction ===
cat("Student with G2 = 12, failures = 0\n")
## Student with G2 = 12, failures = 0
cat("Predicted:", as.character(pred), "\n")
## Predicted: Pass
# Index the probability matrix by class name rather than by position so the
# labels cannot get out of step with the factor level order.
cat("Probability Pass:", round(prob[1, "Pass"] * 100, 1), "%\n")
## Probability Pass: 100 %
cat("Probability Fail:", round(prob[1, "Fail"] * 100, 1), "%\n")
## Probability Fail: 0 %

Step 5: Evaluate Model Performance

# Predict a class for every row of the held-out test set
test_predictions <- predict(tree_model, newdata = test_data, type = "class")

# Cross-tabulate predictions against the true labels, with "Pass" as the
# positive class
conf_matrix <- confusionMatrix(
  data = test_predictions,
  reference = test_data$performance,
  positive = "Pass"
)

# Show the raw 2x2 table
writeLines("=== Confusion Matrix ===")
## === Confusion Matrix ===
print(conf_matrix$table)
##           Reference
## Prediction Fail Pass
##       Fail   68    0
##       Pass    0  232
# Pull the headline metrics out of the caret result. Single-bracket indexing
# keeps each value's name.
accuracy <- conf_matrix$overall["Accuracy"]
precision <- conf_matrix$byClass["Precision"]
recall <- conf_matrix$byClass["Recall"]
specificity <- conf_matrix$byClass["Specificity"]

# Assemble a small summary table: metric, plain-language meaning, value.
# NOTE(review): the Value vector carries the metric names, which data.frame()
# appears to use as row names -- that would explain why the rendered table
# repeats each metric name.
results <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "Specificity"),
  Definition = c(
    "Overall correct predictions",
    "Correct pass predictions / All pass predictions",
    "Actual passes correctly identified",
    "Actual fails correctly identified"
  ),
  Value = round(c(accuracy, precision, recall, specificity), digits = 3)
)

writeLines("\n=== Performance Metrics ===")
## 
## === Performance Metrics ===
pander(results)
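| &nbsp;          | Metric      | Definition                                      | Value |
|-----------------|-------------|-------------------------------------------------|-------|
| **Accuracy**    | Accuracy    | Overall correct predictions                     | 1     |
| **Precision**   | Precision   | Correct pass predictions / All pass predictions | 1     |
| **Recall**      | Recall      | Actual passes correctly identified              | 1     |
| **Specificity** | Specificity | Actual fails correctly identified               | 1     |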
# Bar chart of the four scores, each bar labelled with its value and the
# y axis fixed to the 0-1 range
ggplot(results, aes(x = Metric, y = Value, fill = Metric)) +
  geom_col() +
  geom_text(aes(label = Value), vjust = -0.5) +
  scale_y_continuous(limits = c(0, 1)) +
  ggtitle("Model Performance Metrics") +
  ylab("Score") +
  theme_minimal()

Step 6: Save Your Work

# Save your model for future use
saveRDS(tree_model, "student_performance_tree.rds")

# Save predictions: the test rows with the predicted class appended as
# a `Predicted` column
test_results <- cbind(test_data, Predicted = test_predictions)
write.csv(test_results, "test_predictions.csv", row.names = FALSE)

# List the files written by this analysis. The check-mark emoji is written as
# a \u escape so it does not depend on the encoding the source file is saved
# in (the literal had been mangled into mojibake).
cat("\n\u2705 Analysis complete! Files saved:\n")
## 
## ✅ Analysis complete! Files saved:
cat("1. student_performance_tree.rds - Your trained model\n")
## 1. student_performance_tree.rds - Your trained model
cat("2. test_predictions.csv - All test predictions\n")
## 2. test_predictions.csv - All test predictions
cat("3. train_data.csv - Training data used\n")
## 3. train_data.csv - Training data used
cat("4. test_data.csv - Testing data used\n")
## 4. test_data.csv - Testing data used