# Question 1: Import data & Explore
# load file & libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
scsu_data <- read.csv("C:/Users/Owner/Downloads/SCSU_Student_Data_with_Missing_and_Outliers(1).csv")
# view summary stats
summary(scsu_data)
## Student_ID Major GPA Final_Exam_Score
## Min. : 1.00 Length:100 Min. :2.010 Min. : 0.00
## 1st Qu.: 25.75 Class :character 1st Qu.:2.480 1st Qu.: 65.00
## Median : 50.50 Mode :character Median :2.990 Median : 74.00
## Mean : 50.50 Mean :3.001 Mean : 74.05
## 3rd Qu.: 75.25 3rd Qu.:3.520 3rd Qu.: 85.00
## Max. :100.00 Max. :5.000 Max. :150.00
## NA's :5 NA's :4
head(scsu_data, 5)
## Student_ID Major GPA Final_Exam_Score
## 1 1 Psychology 3.60 55
## 2 2 Business 2.15 65
## 3 3 Biology NA 78
## 4 4 Business 3.54 150
## 5 5 Business 2.40 69
There are 5 missing values in GPA, and 4 missing values in the final exam score. Missing values/Outliers method: remove, replace (numerical data: use mean, but if data skewed, use median), outliers: recap with outbound or lowerbound
# calculating amount of missing values
sapply(scsu_data,function(x) sum(is.na(x)))
## Student_ID Major GPA Final_Exam_Score
## 0 0 5 4
# Question 2: Data Cleaning
# replace missing values in GPA and Final exam score
scsu_data$GPA[is.na(scsu_data$GPA)] <- median(scsu_data$GPA, na.rm = TRUE)
# find mode of majors and replace
mode_major <- names(sort(table(scsu_data$Major), decreasing = TRUE))[1]
scsu_data$Major[is.na(scsu_data$Major)] <- mode_major
# outliers using iqr method
Q1 <- quantile(scsu_data$Final_Exam_Score, 0.25, na.rm = TRUE)
Q3 <- quantile(scsu_data$Final_Exam_Score, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# calculate lb & ub
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# locate outliers
scsu_data$Final_Exam_Score <- pmin(pmax(scsu_data$Final_Exam_Score, lower_bound), upper_bound)
# Question 3: Data Analysis & Visualization
# Histograms for GPA and Final Exam Score
par(mfrow = c(1, 2))
hist(scsu_data$GPA, main = "GPA", xlab = "GPA", col = "pink")
hist(scsu_data$Final_Exam_Score, main = "Final Exam Scores", xlab = "Final Exam Scores", col = "lightblue")
# bar chart for the majors
ggplot(scsu_data, aes(x = Major)) +
geom_bar(fill = "lightyellow") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Majors", x = "Major", y = "Count")
# scatter plot
ggplot(scsu_data, aes(x = GPA, y = Final_Exam_Score)) +
geom_point(color = "lightblue") +
geom_smooth(method = "lm", se = FALSE, color = "lightgreen") +
labs(title = "Relationship Between GPA and Final Exam Score", x = "GPA", y = "Final Exam Score")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Question 4: filtering the data ~
# top scholars (3.5 or higher)
top_scholars <- scsu_data %>% filter(GPA >= 3.5)
print(nrow(top_scholars))
## [1] 26
# computer science or math majors
cs_mat_students <- scsu_data %>% filter(Major %in% c("Computer Science", "Mathematics"))
print(nrow(cs_mat_students))
## [1] 30
# students that aced finals (above 80)
aced_finals <- scsu_data %>% filter(Final_Exam_Score > 80)
print(nrow(aced_finals))
## [1] 38
# overachievers
avg_score_major <- scsu_data %>%
group_by(Major) %>%
summarise(avg_score = mean(Final_Exam_Score, na.rm = TRUE))
scsu_data <- scsu_data %>%
left_join(avg_score_major, by = "Major") %>%
mutate(Above_Average_Exam = Final_Exam_Score > avg_score + 10)
overachievers <- scsu_data %>% filter(Above_Average_Exam == TRUE)
print(nrow(overachievers))
## [1] 30
# Major with the Highest Average GPA
avg_gpa_major <- scsu_data %>%
group_by(Major) %>%
summarise(avg_gpa = mean(GPA, na.rm = TRUE))
highest_avg_gpa_major <- avg_gpa_major %>%
filter(avg_gpa == max(avg_gpa)) %>%
pull(Major)
print(highest_avg_gpa_major)
## [1] "Computer Science"
write.csv(scsu_data, "C:/Users/Owner/Desktop/SCSU_Student_Data_Cleaned.csv", row.names = FALSE)
Overall:
26 top scholars with GPAs of 3.5 or higher, 30 students majoring in Comp Sci or Math, 38 students who aced finals with score above 80, 30 overachievers who scored more than 10 pts above their major’s average (performing beyond expectations), and comp sci had the highest GPA of all the majors.
38 students aced the exam, however, only 26 students are top scholars, which might suggest that some students perform better on specific tests despite not having the highest GPA; there is correlation between being a top scholar & having high GPA, but it’s not a perfect correlation (as seen from the scatter plot graph above).