Since I can't access your Moodle directly, you need to:
1. Log into your university Moodle
2. Download the student.txt file
3. Save it in your project folder (same location as your R Markdown file)
OR use the UCI dataset directly with this code:
# Packages used throughout this analysis
library(dplyr)      # bind_rows(), distinct(), %>%
library(ggplot2)    # bar charts
library(caret)      # createDataPartition(), confusionMatrix()
library(rpart)      # decision tree model
library(rpart.plot) # tree plotting
library(pander)     # formatted tables
# OPTION A: If using the actual student.txt file from Moodle
# student_data <- read.table("student.txt", header = TRUE, sep = ";")
# OPTION B: Download from UCI (recommended for practice)
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip"
download.file(url, "student.zip")
unzip("student.zip")
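If you knit this more than once, you may want to skip the download when the extracted files are already on disk; a small optional guard, reusing the url object defined above:
# Optional: avoid re-downloading on repeated runs
if (!file.exists("student-mat.csv") || !file.exists("student-por.csv")) {
  download.file(url, "student.zip")
  unzip("student.zip")
}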
# Load both datasets (stringsAsFactors = TRUE so the categorical columns
# become factors, which R >= 4.0 no longer does by default)
math_data <- read.table("student-mat.csv", sep = ";", header = TRUE,
                        stringsAsFactors = TRUE)
por_data <- read.table("student-por.csv", sep = ";", header = TRUE,
                       stringsAsFactors = TRUE)
# Merge and remove duplicates (as per dataset instructions)
student_data <- bind_rows(math_data, por_data) %>%
distinct(school, sex, age, address, famsize, Pstatus, Medu, Fedu,
Mjob, Fjob, reason, guardian, traveltime, studytime,
failures, schoolsup, famsup, paid, activities, nursery,
higher, internet, romantic, famrel, freetime, goout,
Dalc, Walc, health, absences, .keep_all = TRUE)
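Students who took both courses appear in each file, which is why distinct() is applied over the background attributes. If you want to see how many overlapping records were collapsed, a quick check using the objects already defined:
# How many rows did deduplication remove?
rows_before <- nrow(math_data) + nrow(por_data)
cat("Overlapping records removed:", rows_before - nrow(student_data), "\n")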
cat("Dataset loaded successfully!\n")
## Dataset loaded successfully!
cat("Total students:", nrow(student_data), "\n")
## Total students: 1005
cat("Variables:", ncol(student_data), "\n")
## Variables: 33
# Create binary target: Pass (G3 >= 10) vs Fail (G3 < 10)
student_data$performance <- factor(
ifelse(student_data$G3 >= 10, "Pass", "Fail"),
levels = c("Fail", "Pass")
)
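As a quick guard against mis-coding the target, you can assert that the new factor agrees with the raw grade (this sketch assumes G3 has no missing values in the file):
# Sanity check: the factor must match the G3 >= 10 rule exactly
stopifnot(all((student_data$G3 >= 10) == (student_data$performance == "Pass")))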
# Check distribution
cat("Performance Distribution:\n")
## Performance Distribution:
table(student_data$performance)
##
## Fail Pass
## 229 776
cat("\nPercentage:\n")
##
## Percentage:
prop.table(table(student_data$performance)) * 100
##
## Fail Pass
## 22.78607 77.21393
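With roughly 77% of students passing, the classes are imbalanced, so raw accuracy can flatter a model. A useful reference point is the majority-class baseline, i.e. always predicting "Pass":
# Baseline: accuracy of always predicting the majority class
baseline_acc <- max(prop.table(table(student_data$performance)))
cat("Majority-class baseline accuracy:", round(baseline_acc * 100, 1), "%\n")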
# Visualize
ggplot(student_data, aes(x = performance, fill = performance)) +
geom_bar() +
labs(title = "Distribution of Pass vs Fail",
x = "Final Result",
y = "Number of Students") +
theme_minimal()
# Set seed for reproducibility
set.seed(123)
# Create 70/30 split
train_index <- createDataPartition(student_data$performance,
p = 0.7,
list = FALSE)
train_data <- student_data[train_index, ]
test_data <- student_data[-train_index, ]
cat("Training set size:", nrow(train_data), "\n")
## Training set size: 705
cat("Testing set size:", nrow(test_data), "\n")
## Testing set size: 300
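Because createDataPartition() samples within each outcome class, the Fail/Pass mix should be nearly identical in the two splits; you can verify that directly:
# Stratification check: class proportions should match across splits
round(prop.table(table(train_data$performance)), 3)
round(prop.table(table(test_data$performance)), 3)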
# Save a copy for your records
write.csv(train_data, "train_data.csv", row.names = FALSE)
write.csv(test_data, "test_data.csv", row.names = FALSE)
# Build decision tree model
tree_model <- rpart(performance ~ .,
data = train_data,
method = "class",
control = rpart.control(
                      minsplit = 20,  # min. observations in a node to try a split
                      minbucket = 7,  # min. observations in any leaf node
                      cp = 0.01,      # complexity parameter: min. gain per split
                      maxdepth = 4    # maximum depth of the tree
))
# Display tree summary
cat("Decision Tree Summary:\n")
## Decision Tree Summary:
printcp(tree_model)
##
## Classification tree:
## rpart(formula = performance ~ ., data = train_data, method = "class",
## control = rpart.control(minsplit = 20, minbucket = 7, cp = 0.01,
## maxdepth = 4))
##
## Variables actually used in tree construction:
## [1] G3
##
## Root node error: 161/705 = 0.22837
##
## n= 705
##
## CP nsplit rel error xerror xstd
## 1 1.00 0 1 1 0.06923
## 2 0.01 1 0 0 0.00000
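Note that the only variable the tree uses is G3, the very grade the Pass/Fail target was derived from, so the model is just rediscovering the 10-point cutoff rather than learning anything predictive. If the goal is a model that could flag at-risk students before final grades exist, refit without G3 (and, for a true early-warning model, without the interim grades G1 and G2 as well). A sketch, with tree_no_leak as an illustrative name:
# Refit without the grade that defines the target; the tree will likely
# fall back on G1/G2, which are themselves strongly correlated with G3
tree_no_leak <- rpart(performance ~ . - G3,
                      data = train_data,
                      method = "class",
                      control = rpart.control(minsplit = 20, minbucket = 7,
                                              cp = 0.01, maxdepth = 4))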
# Visualize the tree
rpart.plot(tree_model,
           type = 4,             # label all nodes, not just the leaves
           extra = 104,          # class probabilities plus % of observations
           box.palette = "GnBu",
           fallen.leaves = TRUE, # draw all leaves at the bottom of the plot
           main = "Decision Tree: Predicting Student Performance",
           tweak = 1.1)          # enlarge the labels slightly
# Examine one leaf node
cat("\n=== Analyzing Leaf Nodes ===\n")
##
## === Analyzing Leaf Nodes ===
print(tree_model)
## n= 705
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 705 161 Pass (0.2283688 0.7716312)
## 2) G3< 9.5 161 0 Fail (1.0000000 0.0000000) *
## 3) G3>=9.5 544 0 Pass (0.0000000 1.0000000) *
# Make a specific prediction for explanation
example_student <- data.frame(
G2 = 12, # Second period grade
failures = 0, # No past failures
G1 = 11, # First period grade
Medu = 3, # Mother's education (secondary)
higher = "yes", # Wants higher education
studytime = 2, # Studies 2-5 hours weekly
absences = 5, # Moderate absences
stringsAsFactors = TRUE
)
# Fill the remaining columns with typical training-set values:
# the most common level for factors, the median for numeric columns
for (col in names(train_data)) {
  if (!col %in% names(example_student) && col != "performance") {
    if (is.factor(train_data[[col]])) {
      example_student[[col]] <- names(sort(table(train_data[[col]]),
                                           decreasing = TRUE))[1]
    } else {
      example_student[[col]] <- median(train_data[[col]], na.rm = TRUE)
    }
  }
}
# Make prediction
pred <- predict(tree_model, example_student, type = "class")
prob <- predict(tree_model, example_student, type = "prob")
cat("\n=== Example Prediction ===\n")
##
## === Example Prediction ===
cat("Student with G2 = 12, failures = 0\n")
## Student with G2 = 12, failures = 0
cat("Predicted:", as.character(pred), "\n")
## Predicted: Pass
cat("Probability Pass:", round(prob[, "Pass"] * 100, 1), "%\n")
## Probability Pass: 100 %
cat("Probability Fail:", round(prob[, "Fail"] * 100, 1), "%\n")
## Probability Fail: 0 %
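Keep in mind that this tree splits only on G3, so the prediction above is driven entirely by the median G3 that the fill-in loop supplied, not by the G2 or failures values highlighted in the printout; you can confirm the deciding value:
# The tree's only split is on G3, so this is the value that decided it
cat("Filled-in G3 value:", example_student$G3, "\n")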
# Make predictions on test set
test_predictions <- predict(tree_model, test_data, type = "class")
# Create confusion matrix
conf_matrix <- confusionMatrix(test_predictions,
test_data$performance,
positive = "Pass")
# Display confusion matrix
cat("=== Confusion Matrix ===\n")
## === Confusion Matrix ===
print(conf_matrix$table)
## Reference
## Prediction Fail Pass
## Fail 68 0
## Pass 0 232
# Calculate metrics
accuracy <- conf_matrix$overall["Accuracy"]
precision <- conf_matrix$byClass["Precision"]
recall <- conf_matrix$byClass["Recall"]
specificity <- conf_matrix$byClass["Specificity"]
# Create results table
results <- data.frame(
Metric = c("Accuracy", "Precision", "Recall", "Specificity"),
Definition = c("Overall correct predictions",
"Correct pass predictions / All pass predictions",
"Actual passes correctly identified",
"Actual fails correctly identified"),
Value = round(c(accuracy, precision, recall, specificity), 3)
)
cat("\n=== Performance Metrics ===\n")
##
## === Performance Metrics ===
pander(results)
| Metric | Definition | Value |
|---|---|---|
| Accuracy | Overall correct predictions | 1 |
| Precision | Correct pass predictions / All pass predictions | 1 |
| Recall | Actual passes correctly identified | 1 |
| Specificity | Actual fails correctly identified | 1 |
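Since precision and recall are both reported, their harmonic mean (F1) may also be worth quoting; caret already computes it in the byClass element, and here it is trivially 1 because of the G3 leakage noted earlier:
# F1 = harmonic mean of precision and recall
cat("F1 score:", round(conf_matrix$byClass["F1"], 3), "\n")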
# Visualization of results
ggplot(results, aes(x = Metric, y = Value, fill = Metric)) +
geom_col() +
geom_text(aes(label = Value), vjust = -0.5) +
ylim(0, 1) +
labs(title = "Model Performance Metrics",
y = "Score") +
theme_minimal()
# Save your model for future use
saveRDS(tree_model, "student_performance_tree.rds")
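To reuse the saved model in a later session, load it back with readRDS(); new_students below is a hypothetical data frame with the same predictor columns as the training data:
# Reload the model in a fresh session and score new data
# tree_model <- readRDS("student_performance_tree.rds")
# predict(tree_model, new_students, type = "class")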
# Save predictions
test_results <- cbind(test_data, Predicted = test_predictions)
write.csv(test_results, "test_predictions.csv", row.names = FALSE)
cat("\nAnalysis complete! Files saved:\n")
##
## Analysis complete! Files saved:
cat("1. student_performance_tree.rds - Your trained model\n")
## 1. student_performance_tree.rds - Your trained model
cat("2. test_predictions.csv - All test predictions\n")
## 2. test_predictions.csv - All test predictions
cat("3. train_data.csv - Training data used\n")
## 3. train_data.csv - Training data used
cat("4. test_data.csv - Testing data used\n")
## 4. test_data.csv - Testing data used