Utilizing Supervised Learning in Learning Analytics

Case Study 4

Author

Nikhila

Business Scenario: Predicting Student Performance

In this case study, you are an analyst at an online education platform. The management is interested in predicting student performance based on various factors to provide personalized support and improve the learning experience. Your task is to develop a supervised learning model to predict students’ final grades using simulated data.

Objective:

Your goal is to build a predictive model using supervised learning techniques in R. You will utilize simulated student data with features such as study hours, quiz scores, forum participation, and previous grades to predict the final grades.

Data Generation:

# Seed the random number generator so every run simulates the same students
set.seed(101923)

# Cohort size: how many students to simulate
num_students <- 500


# Study hours: whole numbers from 1 to 20, drawn with replacement
study_hours <- sample(1:20, size = num_students, replace = TRUE)

# Quiz scores: whole numbers from 0 to 100, drawn with replacement
quiz_scores <- sample(0:100, size = num_students, replace = TRUE)

# Forum participation: number of posts, from 0 to 50, drawn with replacement
forum_posts <- sample(0:50, size = num_students, replace = TRUE)

# Previous grades: whole numbers from 0 to 100, drawn with replacement
previous_grades <- sample(0:100, size = num_students, replace = TRUE)

# Final grades: weighted sum of the four features plus Gaussian noise
# (mean 0, sd 5) and a constant offset of 25. The result is not clipped,
# so it is only roughly on a 0-100 scale.
final_grades <- 0.3 * study_hours +
  0.4 * quiz_scores +
  0.2 * forum_posts +
  0.1 * previous_grades +
  rnorm(num_students, mean = 0, sd = 5) +
  25

# Collect the simulated columns into one data frame, one row per student
student_data <- data.frame(
  StudyHours = study_hours,
  QuizScores = quiz_scores,
  ForumPosts = forum_posts,
  PreviousGrades = previous_grades,
  FinalGrades = final_grades
)

# Show the first six simulated students
head(student_data, n = 6L)
  StudyHours QuizScores ForumPosts PreviousGrades FinalGrades
1          7         90          1             58    62.21874
2          7         46          1             96    59.34761
3          2         72         36             17    53.65904
4          9         66         25             51    67.53756
5         10         19         50             56    49.58935
6         12         15          1             92    35.62870

Explore the data

# Todo:
# Summary statistics: minimum, quartiles, median, mean, and maximum of every
# column in the simulated data
summary(student_data)
   StudyHours      QuizScores       ForumPosts    PreviousGrades  
 Min.   : 1.00   Min.   :  0.00   Min.   : 0.00   Min.   :  0.00  
 1st Qu.: 5.75   1st Qu.: 23.00   1st Qu.:12.00   1st Qu.: 22.75  
 Median :10.00   Median : 46.00   Median :25.00   Median : 55.00  
 Mean   :10.14   Mean   : 47.89   Mean   :24.32   Mean   : 51.24  
 3rd Qu.:15.00   3rd Qu.: 73.00   3rd Qu.:37.00   3rd Qu.: 76.25  
 Max.   :20.00   Max.   :100.00   Max.   :50.00   Max.   :100.00  
  FinalGrades   
 Min.   :26.05  
 1st Qu.:46.41  
 Median :56.34  
 Mean   :56.89  
 3rd Qu.:67.25  
 Max.   :88.89  
# Correlation analysis: pairwise correlation matrix of all five columns
# (cor() uses Pearson correlation unless another method is requested)
cor(student_data)
                StudyHours  QuizScores  ForumPosts PreviousGrades FinalGrades
StudyHours      1.00000000  0.03291741  0.07783247    -0.01783637   0.2024983
QuizScores      0.03291741  1.00000000 -0.06680563    -0.07581315   0.8476383
ForumPosts      0.07783247 -0.06680563  1.00000000    -0.02648057   0.1825078
PreviousGrades -0.01783637 -0.07581315 -0.02648057     1.00000000   0.1629126
FinalGrades     0.20249834  0.84763833  0.18250776     0.16291257   1.0000000

Modeling

Use 80% of the data for training and 20% for testing to predict final grades. Compute the Mean Squared Error and the model accuracy based on the prediction interval.

# Todo:
# Split the data into training and testing sets (80% training, 20% testing).
# NOTE(review): this reuses the seed from the data-generation step, so the
# split is reproducible but is drawn from the same random stream as the
# simulated features; consider a different seed if independence matters.
set.seed(101923) # Set seed for reproducibility
# seq_len() avoids the 1:0 trap of 1:nrow() on an empty data frame, and floor()
# makes the training-set size an explicit whole number instead of handing a
# possibly fractional size to sample().
sample_index <- sample(
  seq_len(nrow(student_data)),
  size = floor(0.8 * nrow(student_data))
)
# Rows picked by sample_index train the model; all remaining rows are held out
train_data <- student_data[sample_index, ]
test_data <- student_data[-sample_index, ]

# Fit a linear regression on the training rows and store it in `model`.
# Target: FinalGrades. Features: StudyHours, QuizScores, ForumPosts, and
# PreviousGrades.
model <- lm(
  FinalGrades ~ StudyHours + QuizScores + ForumPosts + PreviousGrades,
  data = train_data
)

# Predict final grades for the held-out test rows with the fitted model
predictions <- predict(model, newdata = test_data)

# Evaluation metrics on the test set
# Prediction errors: actual minus predicted final grade
test_errors <- test_data$FinalGrades - predictions
# Residual and total sums of squares used for R-squared
ss_residual <- sum(test_errors^2)
ss_total <- sum((test_data$FinalGrades - mean(test_data$FinalGrades))^2)

# Mean squared error and R-squared (1 - residual SS / total SS)
mse <- mean(test_errors^2)
r_squared <- 1 - ss_residual / ss_total

# Print evaluation metrics
# Report the test-set mean squared error and R-squared, one per line

cat("Mean Squared Error:", mse, "\n")
Mean Squared Error: 28.49991 
cat("R-squared Value:", r_squared, "\n")
R-squared Value: 0.8332237 
# Creating different models
# Every candidate is fitted on train_data and evaluated on test_data.
# Model 1: Baseline model with all four features
model1 <- lm(FinalGrades ~ StudyHours + QuizScores + ForumPosts + PreviousGrades, data = train_data)

# Model 2: Simplified model with StudyHours and QuizScores
model2 <- lm(FinalGrades ~ StudyHours + QuizScores, data = train_data)

# Model 3: Additional model with different feature combination
model3 <- lm(FinalGrades ~ StudyHours + PreviousGrades, data = train_data)

# Model 4: Another additional model with different feature combination
model4 <- lm(FinalGrades ~ QuizScores + ForumPosts, data = train_data)

# Comparing model fits using MSE
# Compute the test-set MSE for each model (lower is better)
mse_model1 <- mean((test_data$FinalGrades - predict(model1, newdata = test_data))^2)
mse_model2 <- mean((test_data$FinalGrades - predict(model2, newdata = test_data))^2)
mse_model3 <- mean((test_data$FinalGrades - predict(model3, newdata = test_data))^2)
mse_model4 <- mean((test_data$FinalGrades - predict(model4, newdata = test_data))^2)

# Model accuracy using prediction interval
# Helper: proportion of `actual` values that lie inside the [lwr, upr] bounds
# of `pred_int`, the matrix returned by predict(..., interval = "prediction").
interval_accuracy <- function(pred_int, actual) {
  inside <- actual >= pred_int[, "lwr"] & actual <= pred_int[, "upr"]
  sum(inside) / length(inside)
}

# Model 1 (bounds and hit vector kept as separate objects for inspection)
pred_int_model1 <- predict(model1, newdata = test_data, interval = "prediction")
lower_bound_model1 <- pred_int_model1[, "lwr"]
upper_bound_model1 <- pred_int_model1[, "upr"]
correct_predictions_model1 <- test_data$FinalGrades >= lower_bound_model1 & test_data$FinalGrades <= upper_bound_model1
accuracy_model1 <- sum(correct_predictions_model1) / length(correct_predictions_model1)

# Model 2
pred_int_model2 <- predict(model2, newdata = test_data, interval = "prediction")
accuracy_model2 <- interval_accuracy(pred_int_model2, test_data$FinalGrades)

# Model 3
pred_int_model3 <- predict(model3, newdata = test_data, interval = "prediction")
accuracy_model3 <- interval_accuracy(pred_int_model3, test_data$FinalGrades)

# Model 4
pred_int_model4 <- predict(model4, newdata = test_data, interval = "prediction")
accuracy_model4 <- interval_accuracy(pred_int_model4, test_data$FinalGrades)

# Summary of findings
# Test-set MSE per model (lower is better); in the output shown here Model 1,
# which uses all four features, has the lowest MSE.
cat("MSE for Model 1:", mse_model1, "\n")
MSE for Model 1: 28.49991 
cat("MSE for Model 2:", mse_model2, "\n")
MSE for Model 2: 51.67902 
cat("MSE for Model 3:", mse_model3, "\n")
MSE for Model 3: 172.8869 
cat("MSE for Model 4:", mse_model4, "\n")
MSE for Model 4: 45.53353 
cat("Model 1 Accuracy using Prediction Interval:", accuracy_model1, "\n")
Model 1 Accuracy using Prediction Interval: 0.94 
# The interval-based accuracy of models 2, 3, and 4 can be reported the same
# way once it has been computed.
# Coefficient table and fit statistics for the four-feature model `model`
summary(model)

Call:
lm(formula = FinalGrades ~ StudyHours + QuizScores + ForumPosts + 
    PreviousGrades, data = train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-13.1289  -3.2309  -0.0955   3.1938  15.2959 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    24.269069   0.870613  27.876  < 2e-16 ***
StudyHours      0.370291   0.044543   8.313 1.52e-15 ***
QuizScores      0.395925   0.008522  46.461  < 2e-16 ***
ForumPosts      0.200061   0.016989  11.776  < 2e-16 ***
PreviousGrades  0.100604   0.008206  12.260  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.977 on 395 degrees of freedom
Multiple R-squared:  0.8619,    Adjusted R-squared:  0.8605 
F-statistic: 616.2 on 4 and 395 DF,  p-value: < 2.2e-16
# Number of held-out test rows; the interval-based accuracy is a proportion
# of these rows
nrow(test_data)
[1] 100

Model Accuracy based on Prediction Interval

# Predict the test rows and request prediction intervals; the result is a
# matrix whose "lwr" and "upr" columns hold the interval limits
pred_int <- predict(model, newdata = test_data, interval = "prediction")

# Pull the interval limits out of the matrix by column name
lower_bound <- pred_int[, "lwr"]
upper_bound <- pred_int[, "upr"]

# Observed final grades of the test students
actual_values <- test_data$FinalGrades

# TRUE where the observed grade lies inside its prediction interval
correct_predictions <- lower_bound <= actual_values &
  actual_values <= upper_bound

# Accuracy = number of hits divided by the number of test observations
accuracy <- sum(correct_predictions) / length(correct_predictions)

# Report the resulting proportion
cat("Model Accuracy using Prediction Interval:", accuracy, "\n")
Model Accuracy using Prediction Interval: 0.94 

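The accuracy is calculated as the proportion of correct predictions, where a prediction counts as correct when the student's actual final grade falls inside the model's 95% prediction interval.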

Have fun!