# Seed the random number generator once, up front, so that every simulated
# column below is reproducible from a single starting state.
set.seed(10923)
# Number of students to simulate
# TODO: set num_students to 500
# Enter code below:
num_students <- 500
# Utilizing Supervised Learning in Learning Analytics
Case Study 4
Business Scenario: Predicting Student Performance
In this case study, you are an analyst at an online education platform. The management is interested in predicting student performance based on various factors to provide personalized support and improve the learning experience. Your task is to develop a supervised learning model to predict students’ final grades using simulated data.
Objective:
Your goal is to build a predictive model using supervised learning techniques in R. You will utilize simulated student data with features such as study hours, quiz scores, forum participation, and previous grades to predict the final grades.
Data Generation:
Simulate study hours (ranging from 1 to 20 hours)
num_students <- 500 # Set the number of students
# One study-hours value per student, drawn with replacement from 1..20.
# The draw is performed once; repeating the statement would only overwrite the
# vector while consuming additional random numbers.
study_hours <- sample(1:20, num_students, replace = TRUE)
# Simulate quiz scores (ranging from 0 to 100)
# One quiz score per student, drawn with replacement from 0..100.
# The generator is seeded once at the top of the script. Calling
# set.seed(10923) again here would restart it at the same state used for the
# other simulated columns, so the columns would no longer be independent draws.
quiz_scores <- sample(0:100, num_students, replace = TRUE)
# Simulate forum participation (ranging from 0 to 50 posts)
# One forum-post count per student, drawn with replacement from 0..50.
# This continues the random stream seeded at the top of the script; the
# set.seed(10923) call that used to precede it restarted the generator, so
# every column was derived from the same underlying random draws.
forum_posts <- sample(0:50, num_students, replace = TRUE)
# Simulate previous grades (ranging from 0 to 100)
# One previous grade per student, drawn with replacement from 0..100.
# No re-seeding here: quiz_scores is produced by the very same
# sample(0:100, num_students, replace = TRUE) call, so resetting the seed to
# 10923 before both draws made the two columns identical (perfectly collinear
# predictors in the regression below).
previous_grades <- sample(0:100, num_students, replace = TRUE)
# Simulate final grades (ranging from 0 to 100)
# Final grade = weighted sum of the four features + intercept of 25 + Gaussian
# noise (mean 0, sd 5), with one noise value per student. The result is not
# clamped, so individual values are not forced into the 0-100 range.
# Computed once: repeating the statement would redraw the noise and overwrite
# the first result.
final_grades <- 0.3 * study_hours +
  0.4 * quiz_scores +
  0.2 * forum_posts +
  0.1 * previous_grades +
  rnorm(num_students, mean = 0, sd = 5) +
  25
# Create a data frame
# Assemble the simulated vectors into one data frame: four feature columns
# plus the FinalGrades target, one row per student.
student_data <- data.frame(
  StudyHours = study_hours,
  QuizScores = quiz_scores,
  ForumPosts = forum_posts,
  PreviousGrades = previous_grades,
  FinalGrades = final_grades
)
# View the first few rows of the generated data
# Preview the first rows of the simulated data set
head(student_data)
# Console output recorded in the original document:
#>   StudyHours QuizScores ForumPosts PreviousGrades FinalGrades
#> 1         20         83         19             83    84.40419
#> 2         12         84         20             84    71.08812
#> 3         13         75         11             75    60.53433
#> 4          4         76         12             76    68.42699
#> 5          5         94         30             94    75.98611
#> 6         18          3          3              3    24.19763
# NOTE(review): QuizScores and PreviousGrades are identical in every recorded
# row, which is what happens when both are sampled from 0:100 immediately after
# the same set.seed(10923) call.
Explore the data
# Todo:
Modeling
Use 80% of the data for training and 20% for testing to predict final grades. Compute the Mean Squared Error and the model accuracy based on the prediction interval.
# Todo:
# Split the data into training and testing sets (80% training, 20% testing).
# The seed is fixed so the same rows land in each set on every run.
set.seed(10923)
n_obs <- nrow(student_data)
# seq_len() avoids the 1:0 pitfall (1:0 yields c(1, 0)) and floor() makes the
# training-set size an explicit whole number.
sample_index <- sample(seq_len(n_obs), floor(0.8 * n_obs))
train_data <- student_data[sample_index, ]
test_data <- student_data[-sample_index, ]
# Build a linear regression model on the training data and assign it to an
# object called model.
# Todo: Target variable is FinalGrades and the features are StudyHours,
# QuizScores, ForumPosts, and PreviousGrades.
# Enter code below:
model <- lm(
  FinalGrades ~ StudyHours + QuizScores + ForumPosts + PreviousGrades,
  data = train_data
)
# Making predictions on the test set. Use the model object to make predictions.
Enter code below:
# Predict FinalGrades for the held-out test rows with the fitted model
predictions <- predict(model, newdata = test_data)
# Evaluation metrics
Compute the mean squared error and R-squared
Enter code below
# Compute the mean squared error (MSE) of the test-set predictions
test_residuals <- test_data$FinalGrades - predictions
mse <- mean(test_residuals^2)
# Compute the R-squared (R^2) value as 1 - (residual SS / total SS).
# sst: total sum of squares of the observed test grades around their mean.
# ssr: sum of squared residuals (observed minus predicted). The previous
# version summed (predicted - mean)^2 here, which is the explained sum of
# squares, so 1 - ssr / sst was not the coefficient of determination.
sst <- sum((test_data$FinalGrades - mean(test_data$FinalGrades))^2)
ssr <- sum(test_residuals^2)
rsquared <- 1 - (ssr / sst)
# Print evaluation metrics
# Enter code below
# Print the mean squared error (MSE)
cat("Mean Squared Error (MSE):", mse, "\n")
# Print the R-squared (R^2) value
cat("R-squared (R^2) Value:", rsquared, "\n")
# Console output recorded in the original document (the R-squared figure was
# produced by the earlier, incorrect formula):
#> Mean Squared Error (MSE): 26.1576
#> R-squared (R^2) Value: 0.1083417
Model Accuracy based on Prediction Interval
Get the predictions and prediction intervals
# With interval = "prediction", predict() returns a matrix that includes the
# "lwr" and "upr" columns read below (predict.lm's default level is 0.95).
pred_int <- predict(model, newdata = test_data, interval = "prediction")
# Extract lower and upper bounds of the prediction interval
lower_bound <- pred_int[, "lwr"]
upper_bound <- pred_int[, "upr"]
# Actual values from the test data
actual_values <- test_data$FinalGrades
# Check if the actual values fall within the prediction interval
correct_predictions <- actual_values >= lower_bound &
  actual_values <= upper_bound
# Compute accuracy: the share of test rows whose actual grade lies inside its
# interval (the mean of a logical vector is the proportion of TRUE values)
accuracy <- mean(correct_predictions)
# Print accuracy
cat("Prediction Interval Accuracy:", accuracy, "\n")
The accuracy is calculated as the proportion of correct predictions.
Have fun!