1. Data Import and Exploration

# Load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Load the dataset
student_data <- read.csv("~/Downloads/SCSU_Student_Data_with_Missing_and_Outliers.csv")

# View the first few rows and summary statistics
head(student_data)
##   Student_ID       Major  GPA Final_Exam_Score
## 1          1  Psychology 3.60               55
## 2          2    Business 2.15               65
## 3          3     Biology   NA               78
## 4          4    Business 3.54              150
## 5          5    Business 2.40               69
## 6          6 Mathematics 2.01               85
summary(student_data)
##    Student_ID        Major                GPA        Final_Exam_Score
##  Min.   :  1.00   Length:100         Min.   :2.010   Min.   :  0.00  
##  1st Qu.: 25.75   Class :character   1st Qu.:2.480   1st Qu.: 65.00  
##  Median : 50.50   Mode  :character   Median :2.990   Median : 74.00  
##  Mean   : 50.50                      Mean   :3.001   Mean   : 74.05  
##  3rd Qu.: 75.25                      3rd Qu.:3.520   3rd Qu.: 85.00  
##  Max.   :100.00                      Max.   :5.000   Max.   :150.00  
##                                      NA's   :5       NA's   :4
# Check for missing values
sapply(student_data, function(x) sum(is.na(x)))
##       Student_ID            Major              GPA Final_Exam_Score 
##                0                0                5                4

2. Data Cleaning

Imputing Missing Values

# Impute missing GPA values with the median
student_data$GPA[is.na(student_data$GPA)] <- median(student_data$GPA, na.rm = TRUE)

# Impute missing Major values with the mode
mode_major <- names(sort(table(student_data$Major), decreasing = TRUE))[1]
student_data$Major[is.na(student_data$Major)] <- mode_major
student_data$Major
##   [1] "Psychology"       "Business"         "Biology"         
##   [4] "Business"         "Business"         "Mathematics"     
##   [7] "Biology"          "Biology"          "Biology"         
##  [10] "Business"         "Psychology"       "Biology"         
##  [13] "Education"        "Business"         "Mathematics"     
##  [16] "Psychology"       "Education"        ""                
##  [19] "Mathematics"      "Psychology"       "Business"        
##  [22] "Computer Science" "Psychology"       "Mathematics"     
##  [25] "Education"        "Business"         "Psychology"      
##  [28] "Computer Science" "Computer Science" "Biology"         
##  [31] "Biology"          "Mathematics"      "Psychology"      
##  [34] "Psychology"       "Education"        "Education"       
##  [37] "Education"        "Biology"          "Psychology"      
##  [40] "Psychology"       "Computer Science" "Biology"         
##  [43] "Business"         ""                 "Business"        
##  [46] "Computer Science" "Mathematics"      "Psychology"      
##  [49] "Computer Science" "Psychology"       "Education"       
##  [52] "Mathematics"      "Mathematics"      "Computer Science"
##  [55] "Mathematics"      "Business"         "Mathematics"     
##  [58] "Psychology"       "Psychology"       "Psychology"      
##  [61] "Psychology"       "Business"         "Biology"         
##  [64] "Education"        "Computer Science" "Psychology"      
##  [67] "Mathematics"      "Psychology"       "Mathematics"     
##  [70] "Education"        "Education"        "Education"       
##  [73] "Mathematics"      "Psychology"       ""                
##  [76] "Business"         "Mathematics"      "Mathematics"     
##  [79] "Psychology"       "Mathematics"      "Mathematics"     
##  [82] "Education"        "Psychology"       "Education"       
##  [85] "Education"        "Psychology"       "Computer Science"
##  [88] "Education"        "Business"         "Business"        
##  [91] "Mathematics"      "Business"         "Mathematics"     
##  [94] "Computer Science" "Psychology"       "Psychology"      
##  [97] "Psychology"       "Business"         "Computer Science"
## [100] "Business"

Handling Outliers in Final Exam Score

# Calculate IQR for Final Exam Score
Q1 <- quantile(student_data$Final_Exam_Score, 0.25, na.rm = TRUE)
Q3 <- quantile(student_data$Final_Exam_Score, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1

# Define bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Cap the outliers
# Method1
student_data$Final_Exam_Score <- pmin(pmax(student_data$Final_Exam_Score, lower_bound), upper_bound)

# Method2
student_data$Final_Exam_Score[student_data$Final_Exam_Score < lower_bound] <- lower_bound

3. Data Analysis and Visualization

Histograms for GPA and Final Exam Score

par(mfrow = c(1, 2))
hist(student_data$GPA, main = "Distribution of GPA", xlab = "GPA", col = "skyblue", border = "black")
hist(student_data$Final_Exam_Score, main = "Distribution of Final Exam Scores", xlab = "Final Exam Score", col = "lightgreen", border = "black")

Bar Chart for Majors Distribution

ggplot(student_data, aes(x = Major)) +
  geom_bar() +
  labs(title = "Distribution of Majors", x = "Major", y = "Count")

# Convert empty strings and spaces to NA
student_data$Major[student_data$Major == ""] <- NA
student_data$Major[trimws(student_data$Major) == ""] <- NA
mode_major <- names(sort(table(student_data$Major), decreasing = TRUE))[1]
student_data$Major[is.na(student_data$Major)] <- mode_major
student_data$Major
##   [1] "Psychology"       "Business"         "Biology"         
##   [4] "Business"         "Business"         "Mathematics"     
##   [7] "Biology"          "Biology"          "Biology"         
##  [10] "Business"         "Psychology"       "Biology"         
##  [13] "Education"        "Business"         "Mathematics"     
##  [16] "Psychology"       "Education"        "Psychology"      
##  [19] "Mathematics"      "Psychology"       "Business"        
##  [22] "Computer Science" "Psychology"       "Mathematics"     
##  [25] "Education"        "Business"         "Psychology"      
##  [28] "Computer Science" "Computer Science" "Biology"         
##  [31] "Biology"          "Mathematics"      "Psychology"      
##  [34] "Psychology"       "Education"        "Education"       
##  [37] "Education"        "Biology"          "Psychology"      
##  [40] "Psychology"       "Computer Science" "Biology"         
##  [43] "Business"         "Psychology"       "Business"        
##  [46] "Computer Science" "Mathematics"      "Psychology"      
##  [49] "Computer Science" "Psychology"       "Education"       
##  [52] "Mathematics"      "Mathematics"      "Computer Science"
##  [55] "Mathematics"      "Business"         "Mathematics"     
##  [58] "Psychology"       "Psychology"       "Psychology"      
##  [61] "Psychology"       "Business"         "Biology"         
##  [64] "Education"        "Computer Science" "Psychology"      
##  [67] "Mathematics"      "Psychology"       "Mathematics"     
##  [70] "Education"        "Education"        "Education"       
##  [73] "Mathematics"      "Psychology"       "Psychology"      
##  [76] "Business"         "Mathematics"      "Mathematics"     
##  [79] "Psychology"       "Mathematics"      "Mathematics"     
##  [82] "Education"        "Psychology"       "Education"       
##  [85] "Education"        "Psychology"       "Computer Science"
##  [88] "Education"        "Business"         "Business"        
##  [91] "Mathematics"      "Business"         "Mathematics"     
##  [94] "Computer Science" "Psychology"       "Psychology"      
##  [97] "Psychology"       "Business"         "Computer Science"
## [100] "Business"
#When you read a CSV, you can instruct R to treat specific values as NA using na.strings:
#student_data <- read.csv("your_file.csv", na.strings = c("", " ", "NA"), stringsAsFactors = FALSE)

Scatter Plot with Regression Line: GPA vs. Final Exam Score

ggplot(student_data, aes(x = GPA, y = Final_Exam_Score)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Relationship Between GPA and Final Exam Score", x = "GPA", y = "Final Exam Score")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).

4. Advanced Data Filtering

Top Scholars (GPA >= 3.5)

top_scholars <- student_data %>% filter(GPA >= 3.5)
nrow(top_scholars)
## [1] 26

Computer Science and Mathematics Majors

cs_math_students <- student_data %>% filter(Major %in% c("Computer Science", "Mathematics"))
cs_math_students
##    Student_ID            Major  GPA Final_Exam_Score
## 1           6      Mathematics 2.01               85
## 2          15      Mathematics 5.00               82
## 3          19      Mathematics 2.65               89
## 4          22 Computer Science 5.00               NA
## 5          24      Mathematics 2.24               99
## 6          28 Computer Science 3.54               91
## 7          29 Computer Science 2.99               NA
## 8          32      Mathematics 2.05               75
## 9          41 Computer Science 3.51               81
## 10         46 Computer Science 3.86               97
## 11         47      Mathematics 3.62               65
## 12         49 Computer Science 3.74               69
## 13         52      Mathematics 2.99               73
## 14         53      Mathematics 3.08               60
## 15         54 Computer Science 3.61               98
## 16         55      Mathematics 3.79               57
## 17         57      Mathematics 2.22               87
## 18         65 Computer Science 2.44               78
## 19         67      Mathematics 2.68               95
## 20         69      Mathematics 2.65               35
## 21         73      Mathematics 3.94               85
## 22         77      Mathematics 2.60               63
## 23         78      Mathematics 2.57               70
## 24         80      Mathematics 3.22               69
## 25         81      Mathematics 3.01               57
## 26         87 Computer Science 2.98               61
## 27         91      Mathematics 3.52               79
## 28         93      Mathematics 3.46               87
## 29         94 Computer Science 2.74               94
## 30         99 Computer Science 3.67               70
nrow(cs_math_students)
## [1] 30

Students Who Aced Finals (Score > 80)

aced_finals <- student_data %>% filter(Final_Exam_Score > 80)
nrow(aced_finals)
## [1] 38

Overachievers: Final Exam Scores Far Above Major Average

ave_by_major <- aggregate(Final_Exam_Score ~ Major, data = student_data, function(x) mean(x, na.rm = TRUE))
ave_by_major
##              Major Final_Exam_Score
## 1          Biology         67.80000
## 2         Business         77.47059
## 3 Computer Science         82.11111
## 4        Education         70.50000
## 5      Mathematics         74.31579
## 6       Psychology         74.11111
# calculate overachievers method1
overachievers <- student_data %>%
  group_by(Major) %>%
  filter(Final_Exam_Score > mean(Final_Exam_Score, na.rm = TRUE) + 10)
overachievers
## # A tibble: 29 × 4
## # Groups:   Major [6]
##    Student_ID Major         GPA Final_Exam_Score
##         <int> <chr>       <dbl>            <dbl>
##  1          3 Biology      2.99               78
##  2          4 Business     3.54              115
##  3          6 Mathematics  2.01               85
##  4         12 Biology      2.72               81
##  5         14 Business     3.73               90
##  6         17 Education    2.13               88
##  7         19 Mathematics  2.65               89
##  8         24 Mathematics  2.24               99
##  9         33 Psychology   2.22               97
## 10         34 Psychology   2.99               98
## # ℹ 19 more rows

Major with the Highest Average GPA

avg_gpa_by_major <- aggregate(GPA ~ Major, data = student_data, function(x) mean(x, na.rm = TRUE))

highest_avg_gpa_major <- avg_gpa_by_major$Major[which.max(avg_gpa_by_major$GPA)]

# Output the Major
highest_avg_gpa_major
## [1] "Computer Science"

5. Saving the Cleaned Dataset

write.csv(student_data, "SCSU_Student_Data_Cleaned.csv", row.names = FALSE)

Conclusion