# Question 1: Import data & Explore

# load file & libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

scsu_data <- read.csv("C:/Users/Owner/Downloads/SCSU_Student_Data_with_Missing_and_Outliers(1).csv")

# view summary stats 

summary(scsu_data)
##    Student_ID        Major                GPA        Final_Exam_Score
##  Min.   :  1.00   Length:100         Min.   :2.010   Min.   :  0.00  
##  1st Qu.: 25.75   Class :character   1st Qu.:2.480   1st Qu.: 65.00  
##  Median : 50.50   Mode  :character   Median :2.990   Median : 74.00  
##  Mean   : 50.50                      Mean   :3.001   Mean   : 74.05  
##  3rd Qu.: 75.25                      3rd Qu.:3.520   3rd Qu.: 85.00  
##  Max.   :100.00                      Max.   :5.000   Max.   :150.00  
##                                      NA's   :5       NA's   :4
head(scsu_data, 5)
##   Student_ID      Major  GPA Final_Exam_Score
## 1          1 Psychology 3.60               55
## 2          2   Business 2.15               65
## 3          3    Biology   NA               78
## 4          4   Business 3.54              150
## 5          5   Business 2.40               69

There are 5 missing values in GPA, and 4 missing values in the final exam score. Missing values/Outliers method: remove, replace (numerical data: use mean, but if data skewed, use median), outliers: recap with outbound or lowerbound

# calculating amount of missing values
sapply(scsu_data,function(x) sum(is.na(x)))
##       Student_ID            Major              GPA Final_Exam_Score 
##                0                0                5                4
# Question 2: Data Cleaning 

# replace missing values in GPA and Final exam score
scsu_data$GPA[is.na(scsu_data$GPA)] <- median(scsu_data$GPA, na.rm = TRUE)


# find mode of majors and replace 

mode_major <- names(sort(table(scsu_data$Major), decreasing = TRUE))[1]
scsu_data$Major[is.na(scsu_data$Major)] <- mode_major
# outliers using iqr method 

Q1 <- quantile(scsu_data$Final_Exam_Score, 0.25, na.rm = TRUE)
Q3 <- quantile(scsu_data$Final_Exam_Score, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1


# calculate lb & ub

lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# locate outliers 

scsu_data$Final_Exam_Score <- pmin(pmax(scsu_data$Final_Exam_Score, lower_bound), upper_bound)
# Question 3: Data Analysis & Visualization 

# Histograms for GPA and Final Exam Score 


par(mfrow = c(1, 2))
hist(scsu_data$GPA, main = "GPA", xlab = "GPA", col = "pink")
hist(scsu_data$Final_Exam_Score, main = "Final Exam Scores", xlab = "Final Exam Scores", col = "lightblue")

# bar chart for the majors
ggplot(scsu_data, aes(x = Major)) +
  geom_bar(fill = "lightyellow") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Majors", x = "Major", y = "Count")

# scatter plot

ggplot(scsu_data, aes(x = GPA, y = Final_Exam_Score)) +
  geom_point(color = "lightblue") +
  geom_smooth(method = "lm", se = FALSE, color = "lightgreen") +
  labs(title = "Relationship Between GPA and Final Exam Score", x = "GPA", y = "Final Exam Score")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Question 4: filtering the data ~


# top scholars (3.5 or higher)
top_scholars <- scsu_data %>% filter(GPA >= 3.5)
print(nrow(top_scholars))
## [1] 26
# computer science or math majors

cs_mat_students <- scsu_data %>% filter(Major %in% c("Computer Science", "Mathematics"))
print(nrow(cs_mat_students))
## [1] 30
# students that aced finals (above 80)
aced_finals <- scsu_data %>% filter(Final_Exam_Score > 80)
print(nrow(aced_finals))
## [1] 38
# overachievers 

avg_score_major <- scsu_data %>%
  group_by(Major) %>%
  summarise(avg_score = mean(Final_Exam_Score, na.rm = TRUE))

scsu_data <- scsu_data %>%
  left_join(avg_score_major, by = "Major") %>%
  mutate(Above_Average_Exam = Final_Exam_Score > avg_score + 10)

overachievers <- scsu_data %>% filter(Above_Average_Exam == TRUE)
print(nrow(overachievers))
## [1] 30
# Major with the Highest Average GPA
avg_gpa_major <- scsu_data %>%
  group_by(Major) %>%
  summarise(avg_gpa = mean(GPA, na.rm = TRUE))

highest_avg_gpa_major <- avg_gpa_major %>%
  filter(avg_gpa == max(avg_gpa)) %>%
  pull(Major)

print(highest_avg_gpa_major)
## [1] "Computer Science"
write.csv(scsu_data, "C:/Users/Owner/Desktop/SCSU_Student_Data_Cleaned.csv", row.names = FALSE)

Overall:

26 top scholars with GPAs of 3.5 or higher, 30 students majoring in Comp Sci or Math, 38 students who aced finals with score above 80, 30 overachievers who scored more than 10 pts above their major’s average (performing beyond expectations), and comp sci had the highest GPA of all the majors.

38 students aced the exam, however, only 26 students are top scholars, which might suggest that some students perform better on specific tests despite not having the highest GPA; there is correlation between being a top scholar & having high GPA, but it’s not a perfect correlation (as seen from the scatter plot graph above).