Dataset 1 - School data

Load data

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(tidyr)
library(forcats)


# Raw CSV ("NYC Gifted and Talented Grades 2018-19" sheet) hosted on GitHub.
url <- "https://raw.githubusercontent.com/j-song-npc/607-Project-2-/refs/heads/main/NYC%20Gifted%20and%20Talented%20Grades%202018-19%20-%20Sheet5(2).csv"
nycschools_df <- read.csv(url)

# View() opens the interactive data viewer. Only call it in an interactive
# session so the script can also run under Rscript or while knitting.
if (interactive()) {
  View(nycschools_df)
}

Split columns

# Delimiters that separate individual schools inside one School.Preferences
# answer: comma, slash, or period.
pref_sep <- ",|/|\\."

# Largest number of schools listed in any single answer. The count must use
# the same delimiter pattern that separate() splits on: counting commas alone
# under-sizes `into`, and separate() then discards the surplus pieces of
# answers that were delimited with "/" or ".".
max_pref <- nycschools_df %>%
  mutate(School.Preferences = replace_na(School.Preferences, "")) %>%
  summarize(max_pref = max(str_count(School.Preferences, pref_sep)) + 1) %>%
  pull(max_pref)

# Spread each answer over School_preferences1..School_preferencesN, one school
# per column; answers with fewer schools are padded with NA on the right.
nycschools <- nycschools_df %>%
  separate(
    School.Preferences,
    into = paste0("School_preferences", seq_len(max_pref)),
    sep = pref_sep,
    fill = "right"
  )

Clean data

# Basic normalisation of every column:
#   1. drop the `X` column (presumably the unnamed index column read.csv
#      labelled "X" -- confirm against the CSV),
#   2. strip leading AND trailing whitespace,
#   3. upper-case all values and expand the Y/N shorthand to Yes/No.
# Trimming happens before the recode so padded answers such as " y" or "N "
# are still recognised, and trailing blanks do not create near-duplicate
# categories later on.
# Note: the stringr calls return character vectors, so every column is
# character after this step.
nycschools <- nycschools %>%
  select(-X) %>%
  mutate(across(everything(), ~ str_trim(.))) %>%
  mutate(across(
    everything(),
    ~ recode(str_to_upper(.),
             "Y" = "Yes",
             "N" = "No")
  ))

# Collapse spelling variants of the two most common school names onto one
# canonical label each ("NEST+M", "TAG"). Matching ignores case and
# surrounding whitespace, so it works whether or not the values were
# upper-cased or trimmed earlier; all other values pass through unchanged.
nycschools <- nycschools %>%
  mutate(across(
    starts_with("School_preferences"),
    function(pref) {
      key <- str_to_upper(str_trim(pref))
      case_when(
        key %in% c("NEST", "NEST+M") ~ "NEST+M",
        key == "TAG" ~ "TAG",
        TRUE ~ pref
      )
    }
  ))


# Free-text answers that do not name a school ("none", "n/a", "staying at
# current school", "keep ...", anything containing ":") are set to NA.
# - Only the School_preferences columns are touched; applying this to every
#   column would also wipe out the "No" values of the yes/no columns.
# - The short tokens (no, not, none, n/a, in, at) must match as whole words,
#   otherwise any school name that merely contains those letters
#   (e.g. "... OF INQUIRY") would be discarded.
# - stay / next / keep are matched as substrings on purpose so that
#   "staying", "keeping", ... are caught too.
non_school_pattern <- "\\b(none|no|not|n/a|in|at)\\b|stay|next|keep|:"

nycschools <- nycschools %>%
  mutate(across(
    starts_with("School_preferences"),
    ~ ifelse(str_detect(tolower(.), non_school_pattern),
             NA_character_, .)
  ))

# Treat empty strings as missing values in every column.
nycschools <- nycschools %>%
  mutate(across(
    everything(),
    function(col) replace(col, col %in% "", NA)
  ))
# Where a preference mentions a numbered public school ("ps" followed by
# digits, any case, optional whitespace in between), keep only that code with
# the whitespace removed (e.g. "PS 33" -> "PS33"). Values without such a code
# are left as they are.
nycschools <- nycschools %>%
  mutate(across(
    starts_with("School_preferences"),
    function(pref) {
      # str_extract() yields NA when there is no PS code in the value.
      ps_code <- str_replace_all(
        str_extract(pref, "(?i)ps\\s*\\d+"),
        "\\s+", ""
      )
      case_when(
        !is.na(ps_code) ~ ps_code,
        TRUE ~ pref
      )
    }
  ))

Analysis

Since there was no suggested analysis in the discussion, I evaluated the most popular school preferences.

# Long format: one row per (respondent, preference slot). Empty slots are
# removed, then the 10 most frequent schools are kept as their own factor
# levels and the rest are pooled into "Others".
nycschools_long <- nycschools %>%
  pivot_longer(
    cols = starts_with("School_preferences"),
    names_to = "Preference_Type",
    values_to = "School"
  ) %>%
  filter(!is.na(School)) %>%
  mutate(School = fct_lump_n(School, n = 10, other_level = "Others"))

# Horizontal bar chart of how often each school was named, with the bars
# ordered by frequency.
plot <- ggplot(
  data = nycschools_long,
  mapping = aes(x = fct_infreq(School))
) +
  geom_bar() +
  coord_flip() +
  theme_minimal() +
  labs(title = "School Preferences", x = "School", y = "Count")

print(plot)

Conclusion

We can see that NEST+M was the most popular choice among this group of students, followed by Anderson.