**STUDENT ACADEMIC PERFORMANCE ANALYSIS**

**Draksharam Naga Venkata Priya Harshitha Naidu and Dr. Itauma Itauma**

**Northwood University, Midland, Michigan, USA.**

.

Objective: The main objective of this study is to apply supervised learning to predict student academic performance from a given dataset.

# Load the necessary packages (they must already be installed)
library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.3.1
library(caret)
Warning: package 'caret' was built under R version 4.3.1
Loading required package: lattice
library(MLmetrics)
Warning: package 'MLmetrics' was built under R version 4.3.1

Attaching package: 'MLmetrics'
The following objects are masked from 'package:caret':

    MAE, RMSE
The following object is masked from 'package:base':

    Recall
library(rpart)
Warning: package 'rpart' was built under R version 4.3.1
library(randomForest)
Warning: package 'randomForest' was built under R version 4.3.1
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'
The following object is masked from 'package:ggplot2':

    margin
# Read the student performance records from the CSV file into a data frame
students <- read.csv(file = "StudentsPerformance.csv")

# Summary statistics (min, quartiles, median, mean, max) for the three
# subject-score columns
summary(
  students[c("math.score", "reading.score", "writing.score")]
)
   math.score     reading.score    writing.score   
 Min.   :  0.00   Min.   : 17.00   Min.   : 10.00  
 1st Qu.: 57.00   1st Qu.: 59.00   1st Qu.: 57.75  
 Median : 66.00   Median : 70.00   Median : 69.00  
 Mean   : 66.09   Mean   : 69.17   Mean   : 68.05  
 3rd Qu.: 77.00   3rd Qu.: 79.00   3rd Qu.: 79.00  
 Max.   :100.00   Max.   :100.00   Max.   :100.00  

Calculate each student's average score across the math, reading, and writing tests.

# Add a column holding each student's mean over the three subject scores
students$average_score <- rowMeans(
  students[c("math.score", "reading.score", "writing.score")]
)

# Histogram of the average scores, drawn with 5-point-wide bins
ggplot(data = students, mapping = aes(x = average_score)) +
  geom_histogram(
    binwidth = 5,
    color = "black",
    fill = "lightblue"
  ) +
  labs(
    title = "Distribution of Average Scores",
    x = "Average Score",
    y = "Count"
  )

# Boxplots of the average score, one box per gender value.
# The fill colour repeats the x-axis grouping, so the legend is hidden.
ggplot(
  data = students,
  mapping = aes(x = gender, y = average_score, fill = gender)
) +
  geom_boxplot() +
  labs(
    title = "Academic Performance by Gender",
    x = "Gender",
    y = "Average Score"
  ) +
  theme(legend.position = "none")

# Boxplots of the average score, one box per parental education level.
# The x-axis labels are rotated 45 degrees and right-aligned to the ticks.
ggplot(
  data = students,
  mapping = aes(
    x = parental.level.of.education,
    y = average_score,
    fill = parental.level.of.education
  )
) +
  geom_boxplot() +
  labs(
    title = "Academic Performance by Parental Level of Education",
    x = "Parental Education",
    y = "Average Score"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Data preprocessing
# Draw 800 row indices at random for the training set; every row that was
# not drawn goes to the testing set.
set.seed(123)  # fixed seed so the same split is drawn on every run
trainIndex <- sample.int(nrow(students), size = 800)
training_data <- students[trainIndex, ]
testing_data <- students[-trainIndex, ]
# Fit three models of average_score on the same four predictors, all trained
# on training_data: linear regression, a decision tree, and a random forest.
# The fitting order is kept as-is so the random number stream is consumed in
# the same sequence.
model1 <- lm(
  average_score ~ gender + race.ethnicity + parental.level.of.education + lunch,
  data = training_data
)
model2 <- rpart(
  average_score ~ gender + race.ethnicity + parental.level.of.education + lunch,
  data = training_data
)
model3 <- randomForest(
  average_score ~ gender + race.ethnicity + parental.level.of.education + lunch,
  data = training_data
)

# Score the held-out testing rows with each fitted model
pred1 <- predict(object = model1, newdata = testing_data)
pred2 <- predict(object = model2, newdata = testing_data)
pred3 <- predict(object = model3, newdata = testing_data)

# Mean squared error per model: the average of the squared differences
# between the observed average scores and the predictions on the test set
mse1 <- mean((testing_data$average_score - pred1)^2)
mse2 <- mean((testing_data$average_score - pred2)^2)
mse3 <- mean((testing_data$average_score - pred3)^2)
# Load the "MLmetrics" package for additional accuracy metrics
library(MLmetrics)

# MLmetrics metric functions take the predictions first and the observed
# values second: f(y_pred, y_true). The arguments are named explicitly, and
# the namespace is spelled out because caret also exports a function named
# MAE, so the intended implementation is always the one called.

# Mean absolute error of each model on the test set
mae1 <- MLmetrics::MAE(y_pred = pred1, y_true = testing_data$average_score)
mae2 <- MLmetrics::MAE(y_pred = pred2, y_true = testing_data$average_score)
mae3 <- MLmetrics::MAE(y_pred = pred3, y_true = testing_data$average_score)

# Coefficient of determination of each model on the test set.
# R2_Score() is not symmetric in its arguments: its denominator is the sum of
# squares of y_true around its mean. Passing the observed scores as y_pred and
# the predictions as y_true would divide by the (much smaller) spread of the
# predictions and yield misleading, strongly negative values.
r2_score1 <- MLmetrics::R2_Score(
  y_pred = pred1,
  y_true = testing_data$average_score
)
r2_score2 <- MLmetrics::R2_Score(
  y_pred = pred2,
  y_true = testing_data$average_score
)
r2_score3 <- MLmetrics::R2_Score(
  y_pred = pred3,
  y_true = testing_data$average_score
)

# Gather the three metrics into one table with one row per model
model_comparison <- data.frame(
  Model = c(
    "Model 1 - Linear Regression",
    "Model 2 - Decision Tree",
    "Model 3 - Random Forest"
  ),
  MSE = c(mse1, mse2, mse3),
  MAE = c(mae1, mae2, mae3),
  R2_Score = c(r2_score1, r2_score2, r2_score3)
)

# Show the comparison table on the console
print(model_comparison)
                        Model      MSE      MAE   R2_Score
1 Model 1 - Linear Regression 171.6690 10.37107  -4.472292
2     Model 2 - Decision Tree 183.9219 11.12077  -5.297347
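3     Model 3 - Random Forest 180.1656 10.70143 -14.581736

Note: the R2_Score values in the table above were computed with the arguments passed as (y_true, y_pred), whereas MLmetrics::R2_Score() expects (y_pred, y_true). They therefore do not measure the models' actual R-squared and should be regenerated with the arguments in the documented order.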