1. Data Import and Exploration
# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the dataset
student_data <- read.csv("~/Downloads/SCSU_Student_Data_with_Missing_and_Outliers.csv")
# View the first few rows and summary statistics
head(student_data)
## Student_ID Major GPA Final_Exam_Score
## 1 1 Psychology 3.60 55
## 2 2 Business 2.15 65
## 3 3 Biology NA 78
## 4 4 Business 3.54 150
## 5 5 Business 2.40 69
## 6 6 Mathematics 2.01 85
summary(student_data)
## Student_ID Major GPA Final_Exam_Score
## Min. : 1.00 Length:100 Min. :2.010 Min. : 0.00
## 1st Qu.: 25.75 Class :character 1st Qu.:2.480 1st Qu.: 65.00
## Median : 50.50 Mode :character Median :2.990 Median : 74.00
## Mean : 50.50 Mean :3.001 Mean : 74.05
## 3rd Qu.: 75.25 3rd Qu.:3.520 3rd Qu.: 85.00
## Max. :100.00 Max. :5.000 Max. :150.00
## NA's :5 NA's :4
# Check for missing values
sapply(student_data, function(x) sum(is.na(x)))
## Student_ID Major GPA Final_Exam_Score
## 0 0 5 4
2. Data Cleaning
Imputing Missing Values
# Impute missing GPA values with the median
student_data$GPA[is.na(student_data$GPA)] <- median(student_data$GPA, na.rm = TRUE)
# Impute missing Major values with the mode
mode_major <- names(sort(table(student_data$Major), decreasing = TRUE))[1]
student_data$Major[is.na(student_data$Major)] <- mode_major
student_data$Major
## [1] "Psychology" "Business" "Biology"
## [4] "Business" "Business" "Mathematics"
## [7] "Biology" "Biology" "Biology"
## [10] "Business" "Psychology" "Biology"
## [13] "Education" "Business" "Mathematics"
## [16] "Psychology" "Education" ""
## [19] "Mathematics" "Psychology" "Business"
## [22] "Computer Science" "Psychology" "Mathematics"
## [25] "Education" "Business" "Psychology"
## [28] "Computer Science" "Computer Science" "Biology"
## [31] "Biology" "Mathematics" "Psychology"
## [34] "Psychology" "Education" "Education"
## [37] "Education" "Biology" "Psychology"
## [40] "Psychology" "Computer Science" "Biology"
## [43] "Business" "" "Business"
## [46] "Computer Science" "Mathematics" "Psychology"
## [49] "Computer Science" "Psychology" "Education"
## [52] "Mathematics" "Mathematics" "Computer Science"
## [55] "Mathematics" "Business" "Mathematics"
## [58] "Psychology" "Psychology" "Psychology"
## [61] "Psychology" "Business" "Biology"
## [64] "Education" "Computer Science" "Psychology"
## [67] "Mathematics" "Psychology" "Mathematics"
## [70] "Education" "Education" "Education"
## [73] "Mathematics" "Psychology" ""
## [76] "Business" "Mathematics" "Mathematics"
## [79] "Psychology" "Mathematics" "Mathematics"
## [82] "Education" "Psychology" "Education"
## [85] "Education" "Psychology" "Computer Science"
## [88] "Education" "Business" "Business"
## [91] "Mathematics" "Business" "Mathematics"
## [94] "Computer Science" "Psychology" "Psychology"
## [97] "Psychology" "Business" "Computer Science"
## [100] "Business"
Handling Outliers in Final Exam Score
# Calculate IQR for Final Exam Score
Q1 <- quantile(student_data$Final_Exam_Score, 0.25, na.rm = TRUE)
Q3 <- quantile(student_data$Final_Exam_Score, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Define bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Cap the outliers
# Method1
student_data$Final_Exam_Score <- pmin(pmax(student_data$Final_Exam_Score, lower_bound), upper_bound)
# Method2
student_data$Final_Exam_Score[student_data$Final_Exam_Score < lower_bound] <- lower_bound
3. Data Analysis and Visualization
Histograms for GPA and Final Exam Score
par(mfrow = c(1, 2))
hist(student_data$GPA, main = "Distribution of GPA", xlab = "GPA", col = "skyblue", border = "black")
hist(student_data$Final_Exam_Score, main = "Distribution of Final Exam Scores", xlab = "Final Exam Score", col = "lightgreen", border = "black")

Bar Chart for Majors Distribution
ggplot(student_data, aes(x = Major)) +
geom_bar() +
labs(title = "Distribution of Majors", x = "Major", y = "Count")

# Convert empty strings and spaces to NA
student_data$Major[student_data$Major == ""] <- NA
student_data$Major[trimws(student_data$Major) == ""] <- NA
mode_major <- names(sort(table(student_data$Major), decreasing = TRUE))[1]
student_data$Major[is.na(student_data$Major)] <- mode_major
student_data$Major
## [1] "Psychology" "Business" "Biology"
## [4] "Business" "Business" "Mathematics"
## [7] "Biology" "Biology" "Biology"
## [10] "Business" "Psychology" "Biology"
## [13] "Education" "Business" "Mathematics"
## [16] "Psychology" "Education" "Psychology"
## [19] "Mathematics" "Psychology" "Business"
## [22] "Computer Science" "Psychology" "Mathematics"
## [25] "Education" "Business" "Psychology"
## [28] "Computer Science" "Computer Science" "Biology"
## [31] "Biology" "Mathematics" "Psychology"
## [34] "Psychology" "Education" "Education"
## [37] "Education" "Biology" "Psychology"
## [40] "Psychology" "Computer Science" "Biology"
## [43] "Business" "Psychology" "Business"
## [46] "Computer Science" "Mathematics" "Psychology"
## [49] "Computer Science" "Psychology" "Education"
## [52] "Mathematics" "Mathematics" "Computer Science"
## [55] "Mathematics" "Business" "Mathematics"
## [58] "Psychology" "Psychology" "Psychology"
## [61] "Psychology" "Business" "Biology"
## [64] "Education" "Computer Science" "Psychology"
## [67] "Mathematics" "Psychology" "Mathematics"
## [70] "Education" "Education" "Education"
## [73] "Mathematics" "Psychology" "Psychology"
## [76] "Business" "Mathematics" "Mathematics"
## [79] "Psychology" "Mathematics" "Mathematics"
## [82] "Education" "Psychology" "Education"
## [85] "Education" "Psychology" "Computer Science"
## [88] "Education" "Business" "Business"
## [91] "Mathematics" "Business" "Mathematics"
## [94] "Computer Science" "Psychology" "Psychology"
## [97] "Psychology" "Business" "Computer Science"
## [100] "Business"
#When you read a CSV, you can instruct R to treat specific values as NA using na.strings:
#student_data <- read.csv("your_file.csv", na.strings = c("", " ", "NA"), stringsAsFactors = FALSE)
Scatter Plot with Regression Line: GPA vs. Final Exam Score
ggplot(student_data, aes(x = GPA, y = Final_Exam_Score)) +
geom_point(color = "blue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Relationship Between GPA and Final Exam Score", x = "GPA", y = "Final Exam Score")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).

4. Advanced Data Filtering
Top Scholars (GPA >= 3.5)
top_scholars <- student_data %>% filter(GPA >= 3.5)
nrow(top_scholars)
## [1] 26
Computer Science and Mathematics Majors
cs_math_students <- student_data %>% filter(Major %in% c("Computer Science", "Mathematics"))
cs_math_students
## Student_ID Major GPA Final_Exam_Score
## 1 6 Mathematics 2.01 85
## 2 15 Mathematics 5.00 82
## 3 19 Mathematics 2.65 89
## 4 22 Computer Science 5.00 NA
## 5 24 Mathematics 2.24 99
## 6 28 Computer Science 3.54 91
## 7 29 Computer Science 2.99 NA
## 8 32 Mathematics 2.05 75
## 9 41 Computer Science 3.51 81
## 10 46 Computer Science 3.86 97
## 11 47 Mathematics 3.62 65
## 12 49 Computer Science 3.74 69
## 13 52 Mathematics 2.99 73
## 14 53 Mathematics 3.08 60
## 15 54 Computer Science 3.61 98
## 16 55 Mathematics 3.79 57
## 17 57 Mathematics 2.22 87
## 18 65 Computer Science 2.44 78
## 19 67 Mathematics 2.68 95
## 20 69 Mathematics 2.65 35
## 21 73 Mathematics 3.94 85
## 22 77 Mathematics 2.60 63
## 23 78 Mathematics 2.57 70
## 24 80 Mathematics 3.22 69
## 25 81 Mathematics 3.01 57
## 26 87 Computer Science 2.98 61
## 27 91 Mathematics 3.52 79
## 28 93 Mathematics 3.46 87
## 29 94 Computer Science 2.74 94
## 30 99 Computer Science 3.67 70
nrow(cs_math_students)
## [1] 30
Students Who Aced Finals (Score > 80)
aced_finals <- student_data %>% filter(Final_Exam_Score > 80)
nrow(aced_finals)
## [1] 38
Overachievers: Final Exam Scores Far Above Major Average
ave_by_major <- aggregate(Final_Exam_Score ~ Major, data = student_data, function(x) mean(x, na.rm = TRUE))
ave_by_major
## Major Final_Exam_Score
## 1 Biology 67.80000
## 2 Business 77.47059
## 3 Computer Science 82.11111
## 4 Education 70.50000
## 5 Mathematics 74.31579
## 6 Psychology 74.11111
# calculate overachievers method1
overachievers <- student_data %>%
group_by(Major) %>%
filter(Final_Exam_Score > mean(Final_Exam_Score, na.rm = TRUE) + 10)
overachievers
## # A tibble: 29 × 4
## # Groups: Major [6]
## Student_ID Major GPA Final_Exam_Score
## <int> <chr> <dbl> <dbl>
## 1 3 Biology 2.99 78
## 2 4 Business 3.54 115
## 3 6 Mathematics 2.01 85
## 4 12 Biology 2.72 81
## 5 14 Business 3.73 90
## 6 17 Education 2.13 88
## 7 19 Mathematics 2.65 89
## 8 24 Mathematics 2.24 99
## 9 33 Psychology 2.22 97
## 10 34 Psychology 2.99 98
## # ℹ 19 more rows
Major with the Highest Average GPA
avg_gpa_by_major <- aggregate(GPA ~ Major, data = student_data, function(x) mean(x, na.rm = TRUE))
highest_avg_gpa_major <- avg_gpa_by_major$Major[which.max(avg_gpa_by_major$GPA)]
# Output the Major
highest_avg_gpa_major
## [1] "Computer Science"
5. Saving the Cleaned Dataset
write.csv(student_data, "SCSU_Student_Data_Cleaned.csv", row.names = FALSE)
Conclusion
- Missing values were imputed.
- Outliers in Final Exam Scores were capped.
- Visualizations of GPA, Final Exam Scores, and Major distributions
were created.
- Filtered insights on top scholars, specific majors, and
overachievers were obtained.
- Cleaned dataset is saved for future use.