# Visualization of average scores
ggplot(students, aes(x = average_score)) +
  geom_histogram(binwidth = 5, fill = "lightblue", color = "black") +
  labs(title = "Distribution of Average Scores", x = "Average Score", y = "Count")
# Academic performance by gender
ggplot(students, aes(x = gender, y = average_score, fill = gender)) +
  geom_boxplot() +
  labs(title = "Academic Performance by Gender", x = "Gender", y = "Average Score") +
  theme(legend.position = "none")
# Academic performance by parental level of education
ggplot(students, aes(x = parental.level.of.education, y = average_score, fill = parental.level.of.education)) +
  geom_boxplot() +
  labs(title = "Academic Performance by Parental Level of Education", x = "Parental Education", y = "Average Score") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
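The boxplots can be complemented with a short numeric summary per group. The lines below are a minimal sketch, not part of the original analysis; they assume the students data frame and the average_score, gender, and parental.level.of.education columns used above.

# Numeric summary to accompany the boxplots:
# mean and standard deviation of average_score by group
aggregate(average_score ~ gender, data = students,
          FUN = function(x) c(mean = mean(x), sd = sd(x)))
aggregate(average_score ~ parental.level.of.education, data = students,
          FUN = function(x) c(mean = mean(x), sd = sd(x)))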
# Data preprocessing
# Split the data into training and testing sets (800 training, 200 testing)
set.seed(123)  # For reproducibility
trainIndex <- sample(seq_len(nrow(students)), size = 800)
training_data <- students[trainIndex, ]
testing_data <- students[-trainIndex, ]
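The simple random split above does not try to keep the distribution of the outcome similar in the two sets. If the caret package is available, a stratified split is one alternative; the following is a hedged sketch assuming caret is installed, not the method used in the original analysis.

# Optional alternative: stratified split with caret, which keeps the
# distribution of average_score similar in training and testing sets
library(caret)
set.seed(123)
trainIndex <- createDataPartition(students$average_score, p = 0.8, list = FALSE)
training_data <- students[trainIndex, ]
testing_data <- students[-trainIndex, ]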
# Define and fit the models: linear regression, decision tree, random forest
library(rpart)         # for rpart()
library(randomForest)  # for randomForest()

model1 <- lm(average_score ~ gender + race.ethnicity + parental.level.of.education + lunch,
             data = training_data)
model2 <- rpart(average_score ~ gender + race.ethnicity + parental.level.of.education + lunch,
                data = training_data)
model3 <- randomForest(average_score ~ gender + race.ethnicity + parental.level.of.education + lunch,
                       data = training_data)

# Make predictions on the test data
pred1 <- predict(model1, newdata = testing_data)
pred2 <- predict(model2, newdata = testing_data)
pred3 <- predict(model3, newdata = testing_data)

# Calculate MSE for each model
mse1 <- mean((pred1 - testing_data$average_score)^2)
mse2 <- mean((pred2 - testing_data$average_score)^2)
mse3 <- mean((pred3 - testing_data$average_score)^2)
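Since MSE is expressed in squared score units, the root mean squared error is often easier to interpret. The lines below are an optional addition, not in the original code, that put each model's error back on the same scale as the scores.

# RMSE: error on the original average_score scale
rmse1 <- sqrt(mse1)
rmse2 <- sqrt(mse2)
rmse3 <- sqrt(mse3)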
# Load the "MLmetrics" package for additional accuracy metricslibrary(MLmetrics)# Calculate various accuracy scoresmae1 <-MAE(testing_data$average_score, pred1)mae2 <-MAE(testing_data$average_score, pred2)mae3 <-MAE(testing_data$average_score, pred3)r2_score1 <-R2_Score(testing_data$average_score, pred1)r2_score2 <-R2_Score(testing_data$average_score, pred2)r2_score3 <-R2_Score(testing_data$average_score, pred3)# Store the results in a data frame or tablemodel_comparison <-data.frame(Model =c("Model 1 - Linear Regression", "Model 2 - Decision Tree", "Model 3 - Random Forest"),MSE =c(mse1, mse2, mse3),MAE =c(mae1, mae2, mae3),R2_Score =c(r2_score1, r2_score2, r2_score3))# Print the model comparison tableprint(model_comparison)
Model MSE MAE R2_Score
1 Model 1 - Linear Regression 171.6690 10.37107 -4.472292
2 Model 2 - Decision Tree 183.9219 11.12077 -5.297347
3 Model 3 - Random Forest 180.1656 10.70143 -14.581736
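As a visual check on the comparison table, predicted values can be plotted against the observed test scores. The sketch below is an illustrative addition that assumes the objects created above (pred1, testing_data) and uses the linear regression model as an example; the same pattern applies to pred2 and pred3.

# Predicted vs. observed average scores on the test set (linear regression)
plot_df <- data.frame(actual = testing_data$average_score, predicted = pred1)
ggplot(plot_df, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  labs(title = "Predicted vs. Observed Average Scores (Linear Regression)",
       x = "Observed Average Score", y = "Predicted Average Score")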