# Loading libraries
suppressWarnings({
  library(tidyverse)
  library(sf)
  library(leaflet)
  library(mapview)
  library(tigris)
  library(tidycensus)
  library(dplyr)
  library(readxl)
  library(ggpubr)
  library(ggthemes)
  library(ggplot2)
  library(broom)
  library(patchwork)
  library(MASS)
})
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
## 
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
## 
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
## 
## 
## Attaching package: 'MASS'
## 
## 
## The following object is masked from 'package:patchwork':
## 
##     area
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     select

Part I: Loading Data

suppressWarnings({
  
#Loading Data
data <- "/Users/natania.w/Downloads/sdcta.org-education-1.2.6-test_scores.csv"
ed <- read.csv(data)
ed$year <- as.factor(ed$year)
ed$dist_code <- as.factor(ed$dist_code)
head(ed, 5)

})
##   year dist_code score_all score_white score_black score_high_ses score_low_ses
## 1 2015    110017   2389.60          NA      2362.3             NA        2393.9
## 2 2016    110017   2427.10          NA          NA             NA        2418.3
## 3 2017    110017   2412.80          NA          NA             NA        2407.9
## 4 2018    110017   2412.70          NA          NA             NA        2412.3
## 5 2015    131617   2410.15          NA          NA         2435.7            NA

Part II: Summary Statistics

Exploring sum of NA across columns (variables)

Based on the summary statistics, it appears that scores for both Black individuals and those with a high socioeconomic status exhibit the highest amount of missing data.

# Exploring the Sum of NAs across columns
sum(is.na(ed$score_white))
## [1] 952
sum(is.na(ed$score_black))
## [1] 2649
sum(is.na(ed$score_high_ses))
## [1] 769
sum(is.na(ed$score_low_ses))
## [1] 527

Part III: Data Manipulation and EDA

Data Manipulation: Grouping data by year and sum of NAs across columns

## Exploring number of NA by year
first_year_na <- ed %>%
  group_by(year) %>%
  summarize_all(.funs = list(~ sum(is.na(.)))) 

total_year_na <- first_year_na %>%
  group_by(year)%>%
  summarize(total_na = sum(score_all, score_black, score_white, score_high_ses, score_low_ses))

year_na <- merge(first_year_na, total_year_na, by="year")
year_na <- year_na[, -c(2, 3)]

Fig. 1: Total Count of NA Values by Year

ggplot(year_na, aes(x = year, y=total_na)) +
  geom_bar(stat = "identity", color = "black", fill = "lightblue") +
  labs(x = "Year", y = "Count of NA Values") +
  ggtitle("Fig. 1: Total Count of NA Values by Year") +
  theme_minimal() +
  theme(legend.title = element_blank())

Fig. 2: Count of NA Values Across Columns by Year

ggplot(year_na, aes(x = year)) +
  geom_bar(aes(y = score_black, fill = "Black"), 
           position = "dodge", stat = "identity", 
           width = 0.8, color = "black", alpha = 0.7) +
  geom_bar(aes(y = score_white, fill = "White"), 
           position = "dodge", stat = "identity", 
           width = 0.8, color = "black") +
  geom_bar(aes(y = score_high_ses, fill = "High SES"), 
           position = "dodge", stat = "identity", 
           width = 0.8, color = "black") +
  geom_bar(aes(y = score_low_ses, fill = "Low SES"), 
           position = "dodge", stat = "identity", 
           width = 0.8, color = "black", alpha = 0.65) +
  scale_fill_manual(values = c("White" = "lightblue", "Black" = "lightpink", 
                               "High SES" = "grey", "Low SES" = "salmon")) +
  labs(x = "Year", y = "Count of NA Values") +
  ggtitle("Fig. 2: Count of NA Values Across Columns by Year") +
  theme_minimal() +
  theme(legend.title = element_blank())

Data Manipulation by Race

# score_black vs score_white
na_black <- ed[is.na(ed$score_black),]
na_black <- na_black %>%
  group_by(year) %>%
  summarize(sum = sum(is.na(score_black)))

na_white <- ed[is.na(ed$score_white),]
na_white <- na_white %>%
  group_by(year) %>%
  summarize(sum = sum(is.na(score_white)))

scores <-  merge(na_white, na_black, by="year") %>%
  rename(
    na_white = sum.x, 
    na_black = sum.y     
  )

Fig. 3: Count of NA Values in score_black and score_white by Year

# Create a side-by-side bar plot
ggplot(scores, aes(x = year)) +
  geom_bar(aes(y = na_black, fill = "Black"), 
           position = "dodge", stat = "identity", 
           width = 0.8, color = "black", alpha = 0.7) +
  geom_bar(aes(y = na_white, fill = "White"), 
           position = "dodge", stat = "identity", 
           width = 0.8, color = "black") +
  labs(x = "Year", y = "Count of NA Values") +
  scale_fill_manual(values = c("White" = "lightblue", "Black" = "lightpink")) +
  ggtitle("Fig. 3: Count of NA Values in score_black and score_white by Year") +
  theme_minimal() +
  theme(legend.title = element_blank())

Data Manipulation by Socioeconomic Status (SES)

# score_low_ses vs score_high_ses
na_low <- ed[is.na(ed$score_low_ses),]
na_low <- na_low %>%
  group_by(year) %>%
  summarize(sum = sum(is.na(score_low_ses)))

na_high <- ed[is.na(ed$score_high_ses),]
na_high <- na_high %>%
  group_by(year) %>%
  summarize(sum = sum(is.na(score_high_ses)))

ses_scores <-  merge(na_high, na_low, by="year") %>%
  rename(
    na_high_ses = sum.x,    
    na_low_ses = sum.y    
  )

Fig. 4: Count of NA Values in Low vs High SES by Year

# Create a side-by-side bar plot
ggplot(ses_scores, aes(x = year)) +
    geom_bar(aes(y = na_high_ses, fill = "High SES"), 
             position = "dodge", stat = "identity", 
             width = 0.8, color = "black") +
  geom_bar(aes(y = na_low_ses, fill = "Low SES"), 
           position = "dodge", stat = "identity", 
           width = 0.8, color = "black", alpha = 0.7) +
  labs(x = "Year", y = "Count of NA Values") +
  scale_fill_manual(values = c("High SES" = "grey", "Low SES" = "salmon")) +
  ggtitle("Fig. 4: Count of NA Values in Low vs High SES by Year") +
  theme_minimal() +
  theme(legend.title = element_blank())

Data Manipulation: Grouping NA data by district

In the code below, data is manipulated and grouped by district. It is printed to present the top 24 district with most total number of missing data.

## Exploring number of NA by district
first_dist_na <- ed %>%
  group_by(dist_code) %>%
  summarize_all(.funs = list(~ sum(is.na(.)))) 

total_dist_na <- first_dist_na %>%
  group_by(dist_code)%>%
  summarize(total_na = sum(score_all, score_black, score_white, score_high_ses, score_low_ses))

dist_na <- merge(first_dist_na, total_dist_na, by="dist_code")
dist_na <- dist_na[, -c(2, 3)]

dist_na <- dist_na %>%
  arrange(desc(total_na)) 

head(dist_na, 24)
##    dist_code score_white score_black score_high_ses score_low_ses total_na
## 1    1062109           5           5              5             5       20
## 2    2465649           5           5              5             5       20
## 3    1663933           5           5              4             5       19
## 4    5472264           5           5              5             4       19
## 5    5572363           5           5              5             4       19
## 6    2065177           4           5              5             4       18
## 7    2573593           5           5              5             3       18
## 8    3166795           3           5              5             5       18
## 9    5572405           3           5              5             5       18
## 10    961846           4           5              5             4       18
## 11   1062331           5           5              5             2       17
## 12   1363222           5           5              5             2       17
## 13   1363230           5           5              5             1       16
## 14   1563610           4           4              4             4       16
## 15   1663958           5           5              5             1       16
## 16   2365599           4           4              4             4       16
## 17   2373916           4           4              4             4       16
## 18   2673668           4           4              4             4       16
## 19   3110314           4           5              4             3       16
## 20   3767983           5           5              5             1       16
## 21   4269179           4           4              4             4       16
## 22   4970722           4           4              4             4       16
## 23   5472132           4           4              4             4       16
## 24   5572421           4           4              4             4       16

Conclusion

It is advisable to delve deeper into the data, such as narrowing down to school district in specific neighborhood, or combining and visualizing the district data alongside shapefiles. This will help gain a better understanding of potential patterns regarding the locations of districts with the highest amount of missing data.