# Loading libraries
suppressWarnings({
library(tidyverse)
library(sf)
library(leaflet)
library(mapview)
library(tigris)
library(tidycensus)
library(dplyr)
library(readxl)
library(ggpubr)
library(ggthemes)
library(ggplot2)
library(broom)
library(patchwork)
library(MASS)
})
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
##
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
##
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
##
##
## Attaching package: 'MASS'
##
##
## The following object is masked from 'package:patchwork':
##
## area
##
##
## The following object is masked from 'package:dplyr':
##
## select
suppressWarnings({
#Loading Data
data <- "/Users/natania.w/Downloads/sdcta.org-education-1.2.6-test_scores.csv"
ed <- read.csv(data)
ed$year <- as.factor(ed$year)
ed$dist_code <- as.factor(ed$dist_code)
head(ed, 5)
})
## year dist_code score_all score_white score_black score_high_ses score_low_ses
## 1 2015 110017 2389.60 NA 2362.3 NA 2393.9
## 2 2016 110017 2427.10 NA NA NA 2418.3
## 3 2017 110017 2412.80 NA NA NA 2407.9
## 4 2018 110017 2412.70 NA NA NA 2412.3
## 5 2015 131617 2410.15 NA NA 2435.7 NA
Based on the summary statistics, it appears that scores for both Black individuals and those with a high socioeconomic status exhibit the highest amount of missing data.
# Exploring the Sum of NAs across columns
sum(is.na(ed$score_white))
## [1] 952
sum(is.na(ed$score_black))
## [1] 2649
sum(is.na(ed$score_high_ses))
## [1] 769
sum(is.na(ed$score_low_ses))
## [1] 527
## Exploring number of NA by year
first_year_na <- ed %>%
group_by(year) %>%
summarize_all(.funs = list(~ sum(is.na(.))))
total_year_na <- first_year_na %>%
group_by(year)%>%
summarize(total_na = sum(score_all, score_black, score_white, score_high_ses, score_low_ses))
year_na <- merge(first_year_na, total_year_na, by="year")
year_na <- year_na[, -c(2, 3)]
ggplot(year_na, aes(x = year, y=total_na)) +
geom_bar(stat = "identity", color = "black", fill = "lightblue") +
labs(x = "Year", y = "Count of NA Values") +
ggtitle("Fig. 1: Total Count of NA Values by Year") +
theme_minimal() +
theme(legend.title = element_blank())
ggplot(year_na, aes(x = year)) +
geom_bar(aes(y = score_black, fill = "Black"),
position = "dodge", stat = "identity",
width = 0.8, color = "black", alpha = 0.7) +
geom_bar(aes(y = score_white, fill = "White"),
position = "dodge", stat = "identity",
width = 0.8, color = "black") +
geom_bar(aes(y = score_high_ses, fill = "High SES"),
position = "dodge", stat = "identity",
width = 0.8, color = "black") +
geom_bar(aes(y = score_low_ses, fill = "Low SES"),
position = "dodge", stat = "identity",
width = 0.8, color = "black", alpha = 0.65) +
scale_fill_manual(values = c("White" = "lightblue", "Black" = "lightpink",
"High SES" = "grey", "Low SES" = "salmon")) +
labs(x = "Year", y = "Count of NA Values") +
ggtitle("Fig. 2: Count of NA Values Across Columns by Year") +
theme_minimal() +
theme(legend.title = element_blank())
# score_black vs score_white
na_black <- ed[is.na(ed$score_black),]
na_black <- na_black %>%
group_by(year) %>%
summarize(sum = sum(is.na(score_black)))
na_white <- ed[is.na(ed$score_white),]
na_white <- na_white %>%
group_by(year) %>%
summarize(sum = sum(is.na(score_white)))
scores <- merge(na_white, na_black, by="year") %>%
rename(
na_white = sum.x,
na_black = sum.y
)
# Create a side-by-side bar plot
ggplot(scores, aes(x = year)) +
geom_bar(aes(y = na_black, fill = "Black"),
position = "dodge", stat = "identity",
width = 0.8, color = "black", alpha = 0.7) +
geom_bar(aes(y = na_white, fill = "White"),
position = "dodge", stat = "identity",
width = 0.8, color = "black") +
labs(x = "Year", y = "Count of NA Values") +
scale_fill_manual(values = c("White" = "lightblue", "Black" = "lightpink")) +
ggtitle("Fig. 3: Count of NA Values in score_black and score_white by Year") +
theme_minimal() +
theme(legend.title = element_blank())
# score_low_ses vs score_high_ses
na_low <- ed[is.na(ed$score_low_ses),]
na_low <- na_low %>%
group_by(year) %>%
summarize(sum = sum(is.na(score_low_ses)))
na_high <- ed[is.na(ed$score_high_ses),]
na_high <- na_high %>%
group_by(year) %>%
summarize(sum = sum(is.na(score_high_ses)))
ses_scores <- merge(na_high, na_low, by="year") %>%
rename(
na_high_ses = sum.x,
na_low_ses = sum.y
)
# Create a side-by-side bar plot
ggplot(ses_scores, aes(x = year)) +
geom_bar(aes(y = na_high_ses, fill = "High SES"),
position = "dodge", stat = "identity",
width = 0.8, color = "black") +
geom_bar(aes(y = na_low_ses, fill = "Low SES"),
position = "dodge", stat = "identity",
width = 0.8, color = "black", alpha = 0.7) +
labs(x = "Year", y = "Count of NA Values") +
scale_fill_manual(values = c("High SES" = "grey", "Low SES" = "salmon")) +
ggtitle("Fig. 4: Count of NA Values in Low vs High SES by Year") +
theme_minimal() +
theme(legend.title = element_blank())
In the code below, data is manipulated and grouped by district. It is printed to present the top 24 district with most total number of missing data.
## Exploring number of NA by district
first_dist_na <- ed %>%
group_by(dist_code) %>%
summarize_all(.funs = list(~ sum(is.na(.))))
total_dist_na <- first_dist_na %>%
group_by(dist_code)%>%
summarize(total_na = sum(score_all, score_black, score_white, score_high_ses, score_low_ses))
dist_na <- merge(first_dist_na, total_dist_na, by="dist_code")
dist_na <- dist_na[, -c(2, 3)]
dist_na <- dist_na %>%
arrange(desc(total_na))
head(dist_na, 24)
## dist_code score_white score_black score_high_ses score_low_ses total_na
## 1 1062109 5 5 5 5 20
## 2 2465649 5 5 5 5 20
## 3 1663933 5 5 4 5 19
## 4 5472264 5 5 5 4 19
## 5 5572363 5 5 5 4 19
## 6 2065177 4 5 5 4 18
## 7 2573593 5 5 5 3 18
## 8 3166795 3 5 5 5 18
## 9 5572405 3 5 5 5 18
## 10 961846 4 5 5 4 18
## 11 1062331 5 5 5 2 17
## 12 1363222 5 5 5 2 17
## 13 1363230 5 5 5 1 16
## 14 1563610 4 4 4 4 16
## 15 1663958 5 5 5 1 16
## 16 2365599 4 4 4 4 16
## 17 2373916 4 4 4 4 16
## 18 2673668 4 4 4 4 16
## 19 3110314 4 5 4 3 16
## 20 3767983 5 5 5 1 16
## 21 4269179 4 4 4 4 16
## 22 4970722 4 4 4 4 16
## 23 5472132 4 4 4 4 16
## 24 5572421 4 4 4 4 16
It is advisable to delve deeper into the data, such as narrowing down to school district in specific neighborhood, or combining and visualizing the district data alongside shapefiles. This will help gain a better understanding of potential patterns regarding the locations of districts with the highest amount of missing data.