# Load the district spreadsheet, then keep only the district name and the two
# special-education columns used in the rest of the analysis.
district <- read_excel(path = "district.xls")
df_new <- select(district, DISTNAME, DPETSPEP, DPFPASPEP)
# Distribution summary of each special-education column.
summary(df_new[["DPETSPEP"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.90 12.10 12.27 14.20 51.70
summary(df_new[["DPFPASPEP"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 5.800 8.900 9.711 12.500 49.000 5

# Count of missing values in each column.
cat("Missing DPETSPEP:", sum(is.na(df_new[["DPETSPEP"]])), "\n")
## Missing DPETSPEP: 0
cat("Missing DPFPASPEP:", sum(is.na(df_new[["DPFPASPEP"]])), "\n")
## Missing DPFPASPEP: 5
# Keep only the rows where both special-education measures are present.
# Naming the columns in drop_na() prevents a missing DISTNAME (which this
# analysis never uses) from discarding a row that has valid values for both
# measures.
df_clean <- df_new %>%
  drop_na(DPETSPEP, DPFPASPEP)

# Correlation between the two measures on the complete rows
# (cor() defaults to Pearson).
cor(df_clean$DPFPASPEP, df_clean$DPETSPEP)
## [1] 0.3700234

# Number of rows left after dropping incomplete ones.
cat("Remaining observations:", nrow(df_clean), "\n")
## Remaining observations: 1202
# Scatter plot of DPFPASPEP (y axis) against DPETSPEP (x axis), one point per
# row of df_clean.
ggplot(data = df_clean, mapping = aes(x = DPETSPEP, y = DPFPASPEP)) +
  geom_point() +
  labs(
    x = "Percent Special Education (DPETSPEP)",
    y = "Spending on Special Education (DPFPASPEP)",
    title = "Special Education Spending vs Percent of Students"
  )
The two measures are moderately positively correlated (r ≈ 0.37), with a few outlying districts that appear to be “special circumstances.”