Lab 3: Data Cleaning and Analysis of SCSU Student Data

1. Data Import and Exploration

# Load necessary libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

# Load the dataset
student_data <- read.csv("~/Downloads/SCSU_Student_Data_with_Missing_and_Outliers.csv")

# View the first few rows and summary statistics
head(student_data)

##   Student_ID       Major  GPA Final_Exam_Score
## 1          1  Psychology 3.60               55
## 2          2    Business 2.15               65
## 3          3     Biology   NA               78
## 4          4    Business 3.54              150
## 5          5    Business 2.40               69
## 6          6 Mathematics 2.01               85

summary(student_data)

##    Student_ID        Major                GPA        Final_Exam_Score
##  Min.   :  1.00   Length:100         Min.   :2.010   Min.   :  0.00  
##  1st Qu.: 25.75   Class :character   1st Qu.:2.480   1st Qu.: 65.00  
##  Median : 50.50   Mode  :character   Median :2.990   Median : 74.00  
##  Mean   : 50.50                      Mean   :3.001   Mean   : 74.05  
##  3rd Qu.: 75.25                      3rd Qu.:3.520   3rd Qu.: 85.00  
##  Max.   :100.00                      Max.   :5.000   Max.   :150.00  
##                                      NA's   :5       NA's   :4

# Check for missing values
sapply(student_data, function(x) sum(is.na(x)))

##       Student_ID            Major              GPA Final_Exam_Score 
##                0                0                5                4

2. Data Cleaning

Imputing Missing Values

# Impute missing GPA values with the median
student_data$GPA[is.na(student_data$GPA)] <- median(student_data$GPA, na.rm = TRUE)

# Impute missing Major values with the mode
mode_major <- names(sort(table(student_data$Major), decreasing = TRUE))[1]
student_data$Major[is.na(student_data$Major)] <- mode_major
student_data$Major

##   [1] "Psychology"       "Business"         "Biology"         
##   [4] "Business"         "Business"         "Mathematics"     
##   [7] "Biology"          "Biology"          "Biology"         
##  [10] "Business"         "Psychology"       "Biology"         
##  [13] "Education"        "Business"         "Mathematics"     
##  [16] "Psychology"       "Education"        ""                
##  [19] "Mathematics"      "Psychology"       "Business"        
##  [22] "Computer Science" "Psychology"       "Mathematics"     
##  [25] "Education"        "Business"         "Psychology"      
##  [28] "Computer Science" "Computer Science" "Biology"         
##  [31] "Biology"          "Mathematics"      "Psychology"      
##  [34] "Psychology"       "Education"        "Education"       
##  [37] "Education"        "Biology"          "Psychology"      
##  [40] "Psychology"       "Computer Science" "Biology"         
##  [43] "Business"         ""                 "Business"        
##  [46] "Computer Science" "Mathematics"      "Psychology"      
##  [49] "Computer Science" "Psychology"       "Education"       
##  [52] "Mathematics"      "Mathematics"      "Computer Science"
##  [55] "Mathematics"      "Business"         "Mathematics"     
##  [58] "Psychology"       "Psychology"       "Psychology"      
##  [61] "Psychology"       "Business"         "Biology"         
##  [64] "Education"        "Computer Science" "Psychology"      
##  [67] "Mathematics"      "Psychology"       "Mathematics"     
##  [70] "Education"        "Education"        "Education"       
##  [73] "Mathematics"      "Psychology"       ""                
##  [76] "Business"         "Mathematics"      "Mathematics"     
##  [79] "Psychology"       "Mathematics"      "Mathematics"     
##  [82] "Education"        "Psychology"       "Education"       
##  [85] "Education"        "Psychology"       "Computer Science"
##  [88] "Education"        "Business"         "Business"        
##  [91] "Mathematics"      "Business"         "Mathematics"     
##  [94] "Computer Science" "Psychology"       "Psychology"      
##  [97] "Psychology"       "Business"         "Computer Science"
## [100] "Business"

Handling Outliers in Final Exam Score

# Calculate IQR for Final Exam Score
Q1 <- quantile(student_data$Final_Exam_Score, 0.25, na.rm = TRUE)
Q3 <- quantile(student_data$Final_Exam_Score, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1

# Define bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Cap the outliers
# Method1
student_data$Final_Exam_Score <- pmin(pmax(student_data$Final_Exam_Score, lower_bound), upper_bound)

# Method2
student_data$Final_Exam_Score[student_data$Final_Exam_Score < lower_bound] <- lower_bound

3. Data Analysis and Visualization

Histograms for GPA and Final Exam Score

par(mfrow = c(1, 2))
hist(student_data$GPA, main = "Distribution of GPA", xlab = "GPA", col = "skyblue", border = "black")
hist(student_data$Final_Exam_Score, main = "Distribution of Final Exam Scores", xlab = "Final Exam Score", col = "lightgreen", border = "black")

Bar Chart for Majors Distribution

ggplot(student_data, aes(x = Major)) +
  geom_bar() +
  labs(title = "Distribution of Majors", x = "Major", y = "Count")

# Convert empty strings and spaces to NA
student_data$Major[student_data$Major == ""] <- NA
student_data$Major[trimws(student_data$Major) == ""] <- NA
mode_major <- names(sort(table(student_data$Major), decreasing = TRUE))[1]
student_data$Major[is.na(student_data$Major)] <- mode_major
student_data$Major

##   [1] "Psychology"       "Business"         "Biology"         
##   [4] "Business"         "Business"         "Mathematics"     
##   [7] "Biology"          "Biology"          "Biology"         
##  [10] "Business"         "Psychology"       "Biology"         
##  [13] "Education"        "Business"         "Mathematics"     
##  [16] "Psychology"       "Education"        "Psychology"      
##  [19] "Mathematics"      "Psychology"       "Business"        
##  [22] "Computer Science" "Psychology"       "Mathematics"     
##  [25] "Education"        "Business"         "Psychology"      
##  [28] "Computer Science" "Computer Science" "Biology"         
##  [31] "Biology"          "Mathematics"      "Psychology"      
##  [34] "Psychology"       "Education"        "Education"       
##  [37] "Education"        "Biology"          "Psychology"      
##  [40] "Psychology"       "Computer Science" "Biology"         
##  [43] "Business"         "Psychology"       "Business"        
##  [46] "Computer Science" "Mathematics"      "Psychology"      
##  [49] "Computer Science" "Psychology"       "Education"       
##  [52] "Mathematics"      "Mathematics"      "Computer Science"
##  [55] "Mathematics"      "Business"         "Mathematics"     
##  [58] "Psychology"       "Psychology"       "Psychology"      
##  [61] "Psychology"       "Business"         "Biology"         
##  [64] "Education"        "Computer Science" "Psychology"      
##  [67] "Mathematics"      "Psychology"       "Mathematics"     
##  [70] "Education"        "Education"        "Education"       
##  [73] "Mathematics"      "Psychology"       "Psychology"      
##  [76] "Business"         "Mathematics"      "Mathematics"     
##  [79] "Psychology"       "Mathematics"      "Mathematics"     
##  [82] "Education"        "Psychology"       "Education"       
##  [85] "Education"        "Psychology"       "Computer Science"
##  [88] "Education"        "Business"         "Business"        
##  [91] "Mathematics"      "Business"         "Mathematics"     
##  [94] "Computer Science" "Psychology"       "Psychology"      
##  [97] "Psychology"       "Business"         "Computer Science"
## [100] "Business"

#When you read a CSV, you can instruct R to treat specific values as NA using na.strings:
#student_data <- read.csv("your_file.csv", na.strings = c("", " ", "NA"), stringsAsFactors = FALSE)

Scatter Plot with Regression Line: GPA vs. Final Exam Score

ggplot(student_data, aes(x = GPA, y = Final_Exam_Score)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Relationship Between GPA and Final Exam Score", x = "GPA", y = "Final Exam Score")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 4 rows containing missing values (`geom_point()`).

4. Advanced Data Filtering

Top Scholars (GPA >= 3.5)

top_scholars <- student_data %>% filter(GPA >= 3.5)
nrow(top_scholars)

## [1] 26

Computer Science and Mathematics Majors

cs_math_students <- student_data %>% filter(Major %in% c("Computer Science", "Mathematics"))
cs_math_students

##    Student_ID            Major  GPA Final_Exam_Score
## 1           6      Mathematics 2.01               85
## 2          15      Mathematics 5.00               82
## 3          19      Mathematics 2.65               89
## 4          22 Computer Science 5.00               NA
## 5          24      Mathematics 2.24               99
## 6          28 Computer Science 3.54               91
## 7          29 Computer Science 2.99               NA
## 8          32      Mathematics 2.05               75
## 9          41 Computer Science 3.51               81
## 10         46 Computer Science 3.86               97
## 11         47      Mathematics 3.62               65
## 12         49 Computer Science 3.74               69
## 13         52      Mathematics 2.99               73
## 14         53      Mathematics 3.08               60
## 15         54 Computer Science 3.61               98
## 16         55      Mathematics 3.79               57
## 17         57      Mathematics 2.22               87
## 18         65 Computer Science 2.44               78
## 19         67      Mathematics 2.68               95
## 20         69      Mathematics 2.65               35
## 21         73      Mathematics 3.94               85
## 22         77      Mathematics 2.60               63
## 23         78      Mathematics 2.57               70
## 24         80      Mathematics 3.22               69
## 25         81      Mathematics 3.01               57
## 26         87 Computer Science 2.98               61
## 27         91      Mathematics 3.52               79
## 28         93      Mathematics 3.46               87
## 29         94 Computer Science 2.74               94
## 30         99 Computer Science 3.67               70

nrow(cs_math_students)

## [1] 30

Students Who Aced Finals (Score > 80)

aced_finals <- student_data %>% filter(Final_Exam_Score > 80)
nrow(aced_finals)

## [1] 38

Overachievers: Final Exam Scores Far Above Major Average

ave_by_major <- aggregate(Final_Exam_Score ~ Major, data = student_data, function(x) mean(x, na.rm = TRUE))
ave_by_major

##              Major Final_Exam_Score
## 1          Biology         67.80000
## 2         Business         77.47059
## 3 Computer Science         82.11111
## 4        Education         70.50000
## 5      Mathematics         74.31579
## 6       Psychology         74.11111

# calculate overachievers method1
overachievers <- student_data %>%
  group_by(Major) %>%
  filter(Final_Exam_Score > mean(Final_Exam_Score, na.rm = TRUE) + 10)
overachievers

## # A tibble: 29 × 4
## # Groups:   Major [6]
##    Student_ID Major         GPA Final_Exam_Score
##         <int> <chr>       <dbl>            <dbl>
##  1          3 Biology      2.99               78
##  2          4 Business     3.54              115
##  3          6 Mathematics  2.01               85
##  4         12 Biology      2.72               81
##  5         14 Business     3.73               90
##  6         17 Education    2.13               88
##  7         19 Mathematics  2.65               89
##  8         24 Mathematics  2.24               99
##  9         33 Psychology   2.22               97
## 10         34 Psychology   2.99               98
## # ℹ 19 more rows

Major with the Highest Average GPA

avg_gpa_by_major <- aggregate(GPA ~ Major, data = student_data, function(x) mean(x, na.rm = TRUE))

highest_avg_gpa_major <- avg_gpa_by_major$Major[which.max(avg_gpa_by_major$GPA)]

# Output the Major
highest_avg_gpa_major

## [1] "Computer Science"

5. Saving the Cleaned Dataset

write.csv(student_data, "SCSU_Student_Data_Cleaned.csv", row.names = FALSE)

Conclusion

Missing values were imputed.
Outliers in Final Exam Scores were capped.
Visualizations of GPA, Final Exam Scores, and Major distributions were created.
Filtered insights on top scholars, specific majors, and overachievers were obtained.
Cleaned dataset is saved for future use.

Lab 3: Data Cleaning and Analysis of SCSU Student Data

Yulei Pang

March 11, 2025

1. Data Import and Exploration

2. Data Cleaning

Imputing Missing Values

Handling Outliers in Final Exam Score

3. Data Analysis and Visualization

Histograms for GPA and Final Exam Score

Bar Chart for Majors Distribution

Scatter Plot with Regression Line: GPA vs. Final Exam Score

4. Advanced Data Filtering

Top Scholars (GPA >= 3.5)

Computer Science and Mathematics Majors

Students Who Aced Finals (Score > 80)

Overachievers: Final Exam Scores Far Above Major Average

Major with the Highest Average GPA

5. Saving the Cleaned Dataset

Conclusion