This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Business Scenario: Predicting Student Performance
# Data Generation:
# Seed the RNG so every run simulates exactly the same cohort.
set.seed(10923)

# Cohort size: 500 simulated students.
num_students <- 500

# Predictors: each is drawn uniformly, with replacement, from an integer range.
# The draw order below determines the random stream, so keep it as is.
# Study hours: 1 to 20.
study_hours <- sample(x = 1:20, size = num_students, replace = TRUE)
# Quiz scores: 0 to 100.
quiz_scores <- sample(x = 0:100, size = num_students, replace = TRUE)
# Forum participation: 0 to 50 posts.
forum_posts <- sample(x = 0:50, size = num_students, replace = TRUE)
# Previous grades: 0 to 100.
previous_grades <- sample(x = 0:100, size = num_students, replace = TRUE)

# Response: weighted sum of the predictors, plus Gaussian noise (sd = 5) and a
# constant offset of 25. The result is not clamped, so it is not guaranteed to
# stay inside 0-100.
final_grades <- 0.3 * study_hours +
  0.4 * quiz_scores +
  0.2 * forum_posts +
  0.1 * previous_grades +
  rnorm(n = num_students, mean = 0, sd = 5) +
  25

# Collect predictors and response into one data frame, one row per student.
student_data <- data.frame(
  StudyHours = study_hours,
  QuizScores = quiz_scores,
  ForumPosts = forum_posts,
  PreviousGrades = previous_grades,
  FinalGrades = final_grades
)

# Preview the first rows of the simulated data.
head(student_data)
## StudyHours QuizScores ForumPosts PreviousGrades FinalGrades
## 1 20 91 22 78 80.80895
## 2 12 26 27 1 46.45853
## 3 13 5 8 60 40.22946
## 4 4 96 13 78 70.64216
## 5 5 74 45 31 62.35254
## 6 18 1 47 50 48.42835
#Explore the data
# Todo:
#Modeling
Use 80% of the data for training and 20% for testing to predict final grades. Compute the Mean Squared Error (MSE) and R-squared on the test set, and the model accuracy based on the prediction interval.
# Split the data into training (80%) and testing (20%) sets.
set.seed(10923) # Set seed so the split is reproducible
n_rows <- nrow(student_data)
# Make the truncation of a fractional training size explicit.
n_train <- floor(0.8 * n_rows)
# sample.int() draws row numbers from 1..n_rows directly; unlike 1:n_rows it
# does not turn into c(1, 0) when the data frame has no rows.
sample_index <- sample.int(n_rows, n_train)
train_data <- student_data[sample_index, ]
# Select the test rows positively. `student_data[-sample_index, ]` would return
# zero rows (not every row) whenever sample_index is empty.
test_data <- student_data[setdiff(seq_len(n_rows), sample_index), ]
# Fit a linear regression on the training data and store it in `model`.
# Target: FinalGrades.
# Features: StudyHours, QuizScores, ForumPosts, and PreviousGrades.
model <- lm(
  formula = FinalGrades ~ StudyHours + QuizScores + ForumPosts + PreviousGrades,
  data = train_data
)

# Use the fitted model to predict final grades for the test set.
predictions <- predict(object = model, newdata = test_data)
# Evaluation metrics on the test set.
# MSE is the mean of the squared prediction errors.
mse <- mean((test_data$FinalGrades - predictions)^2)
# R-squared is 1 - SSE / SST, with SST measured around the mean of the observed
# test grades. Dividing mse by var() instead would mix an n denominator (mean)
# with an n - 1 denominator (var) and slightly overstate R-squared.
sse <- sum((test_data$FinalGrades - predictions)^2)
sst <- sum((test_data$FinalGrades - mean(test_data$FinalGrades))^2)
rsquared <- 1 - sse / sst
# Print evaluation metrics
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 22.34656
cat("R-squared (R^2):", rsquared, "\n")
# NOTE(review): the R-squared output recorded below was produced by the earlier
# `1 - mse / var(...)` formula. The corrected value is slightly lower (roughly
# 0.8854 if the test set has 100 rows -- confirm by re-knitting the document).
## R-squared (R^2): 0.8865354
# Model accuracy based on the prediction interval
# "Accuracy" here is the share of test observations whose actual final grade
# lies inside the model's prediction interval (predict()'s default level).

# Point predictions together with the interval bounds (columns fit, lwr, upr).
pred_int <- predict(object = model, newdata = test_data, interval = "prediction")

# Lower and upper limits of the interval for each test row.
lower_bound <- pred_int[, "lwr"]
upper_bound <- pred_int[, "upr"]

# Observed final grades for the same rows.
actual_values <- test_data[["FinalGrades"]]

# TRUE where the observed grade is covered by its interval (bounds inclusive).
correct_predictions <- lower_bound <= actual_values & actual_values <= upper_bound

# The mean of a logical vector is the proportion of TRUE values.
accuracy <- mean(correct_predictions)

# Report the coverage.
cat("Model Accuracy using Prediction Interval:", accuracy, "\n")
## Model Accuracy using Prediction Interval: 0.96