library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(readxl)
# Load the district-level workbook.
district_data <- read_excel("district.xls")

# Keep and rename the six analysis variables in a single select().
# (An earlier, un-renamed select() was removed: its result was overwritten
# immediately and never used.)
#
# Only character columns OTHER than district_name are converted to numeric.
# Applying parse_number() to every character column also hit district_name,
# which replaced every district name with NA and raised ~1200 parsing
# failures (one per name, e.g. "CAYUGA ISD", "ELKHART ISD").
# NOTE(review): the five measure columns are assumed to parse cleanly, so no
# parsing warning is expected any more -- re-run to confirm.
clean_data <- district_data |>
  select(
    district_name = DISTNAME,
    staar_meets = DDA00A001222R,
    exp_instruction = DPFEAINSP,
    exp_stuservices = DPFPAREGP,
    econ_disadv = DPETECOP,
    teacher_exp = DPSTEXPA
  ) |>
  mutate(across(where(is.character) & !district_name, readr::parse_number)) |>
  # Rows missing any of the five measures are dropped; district_name is not
  # part of the completeness check.
  drop_na(
    staar_meets, exp_instruction, exp_stuservices, econ_disadv, teacher_exp
  )

# Descriptive statistics ----
summary(clean_data)
# NOTE(review): the district_name column of the recorded output below was
# edited by hand to reflect the fix above (character column instead of
# all-NA numeric); the numeric columns are unchanged. Re-knit to confirm.
## district_name staar_meets exp_instruction exp_stuservices
## Length:1198 Min. : 0.00 Min. :18.50 Min. : 2.00
## Class :character 1st Qu.:37.00 1st Qu.:52.02 1st Qu.:35.12
## Mode :character Median :46.00 Median :55.10 Median :39.70
## Mean :46.37 Mean :54.77 Mean :39.81
## 3rd Qu.:55.00 3rd Qu.:57.80 3rd Qu.:43.90
## Max. :88.00 Max. :84.40 Max. :79.10
## econ_disadv teacher_exp
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 48.00 1st Qu.:10.10
## Median : 61.85 Median :12.00
## Mean : 60.82 Mean :11.78
## 3rd Qu.: 76.97 3rd Qu.:13.90
## Max. :100.00 Max. :22.90
##
# Extended descriptive statistics (pastecs) for the five numeric measures.
clean_data |>
  select(
    staar_meets, exp_instruction, exp_stuservices, econ_disadv, teacher_exp
  ) |>
  pastecs::stat.desc()
## staar_meets exp_instruction exp_stuservices econ_disadv
## nbr.val 1.198000e+03 1.198000e+03 1.198000e+03 1.198000e+03
## nbr.null 2.000000e+00 0.000000e+00 0.000000e+00 3.000000e+00
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 1.850000e+01 2.000000e+00 0.000000e+00
## max 8.800000e+01 8.440000e+01 7.910000e+01 1.000000e+02
## range 8.800000e+01 6.590000e+01 7.710000e+01 1.000000e+02
## sum 5.555300e+04 6.561810e+04 4.769220e+04 7.285790e+04
## median 4.600000e+01 5.510000e+01 3.970000e+01 6.185000e+01
## mean 4.637145e+01 5.477304e+01 3.980985e+01 6.081628e+01
## SE.mean 3.864952e-01 1.511511e-01 2.387496e-01 6.212789e-01
## CI.mean.0.95 7.582834e-01 2.965506e-01 4.684142e-01 1.218917e+00
## var 1.789555e+02 2.737029e+01 6.828762e+01 4.624129e+02
## std.dev 1.337742e+01 5.231662e+00 8.263633e+00 2.150379e+01
## coef.var 2.884840e-01 9.551529e-02 2.075776e-01 3.535861e-01
## teacher_exp
## nbr.val 1.198000e+03
## nbr.null 4.000000e+00
## nbr.na 0.000000e+00
## min 0.000000e+00
## max 2.290000e+01
## range 2.290000e+01
## sum 1.411090e+04
## median 1.200000e+01
## mean 1.177871e+01
## SE.mean 9.601165e-02
## CI.mean.0.95 1.883698e-01
## var 1.104345e+01
## std.dev 3.323168e+00
## coef.var 2.821334e-01
# Correlation ----
# Pearson correlation matrix of the five numeric measures, computed on rows
# that are complete across all five, then printed rounded to 3 decimals.
cor_mat <- clean_data |>
  select(
    staar_meets, exp_instruction, exp_stuservices, econ_disadv, teacher_exp
  ) |>
  cor(use = "complete.obs", method = "pearson")
round(cor_mat, 3)
## staar_meets exp_instruction exp_stuservices econ_disadv
## staar_meets 1.000 0.215 0.354 -0.696
## exp_instruction 0.215 1.000 0.484 -0.192
## exp_stuservices 0.354 0.484 1.000 -0.476
## econ_disadv -0.696 -0.192 -0.476 1.000
## teacher_exp 0.333 0.130 -0.025 -0.233
## teacher_exp
## staar_meets 0.333
## exp_instruction 0.130
## exp_stuservices -0.025
## econ_disadv -0.233
## teacher_exp 1.000
# Scatterplot matrix of the five measures: only the lower triangle is drawn,
# each panel with a smoothed trend line; points are semi-transparent black.
clean_data |>
  select(
    staar_meets, exp_instruction, exp_stuservices, econ_disadv, teacher_exp
  ) |>
  pairs(
    main = "Pairs: STAAR, Spending, Poverty, Experience",
    pch = 19,
    col = rgb(0, 0, 0, 0.35),
    lower.panel = panel.smooth,
    upper.panel = NULL
  )

# Correlation test: STAAR vs economic disadvantage ----
# Pearson product-moment test (two-sided by default).
cor.test(
  clean_data$staar_meets,
  clean_data$econ_disadv,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: clean_data$staar_meets and clean_data$econ_disadv
## t = -33.561, df = 1196, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7244803 -0.6660534
## sample estimates:
## cor
## -0.6964191
# Rank-based (Spearman) version of the same test; exact = FALSE asks for the
# approximate p-value instead of an exact one.
cor.test(
  clean_data$staar_meets,
  clean_data$econ_disadv,
  method = "spearman",
  exact = FALSE
)
##
## Spearman's rank correlation rho
##
## data: clean_data$staar_meets and clean_data$econ_disadv
## S = 483156301, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.6860434
# Scatterplot (ggplot2) ----
# STAAR "meets" rate against % economically disadvantaged, with a straight
# least-squares line and no confidence band.
ggplot(clean_data, aes(x = econ_disadv, y = staar_meets)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "STAAR Meets vs Economically Disadvantaged",
    x = "Economically Disadvantaged (%)",
    y = "STAAR Meets (%)"
  ) +
  theme_minimal(base_size = 12)
## `geom_smooth()` using formula = 'y ~ x'

# Add a log-transformed poverty measure; the +1 keeps districts with 0%
# economically disadvantaged finite under the log.
clean_data <- mutate(clean_data, log_econ = log(econ_disadv + 1))

# Pearson test of STAAR against the log-transformed measure.
cor.test(
  clean_data$staar_meets,
  clean_data$log_econ,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: clean_data$staar_meets and clean_data$log_econ
## t = -27.832, df = 1196, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6601605 -0.5913262
## sample estimates:
## cor
## -0.6269655
# Pearson test: STAAR vs instructional expenditure share.
# method is spelled out; "pearson" is cor.test()'s default.
cor.test(
  clean_data$staar_meets,
  clean_data$exp_instruction,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: clean_data$staar_meets and clean_data$exp_instruction
## t = 7.6143, df = 1196, p-value = 5.362e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1603386 0.2683911
## sample estimates:
## cor
## 0.2150228
# Pearson test: STAAR vs the exp_stuservices measure.
# method is spelled out; "pearson" is cor.test()'s default.
cor.test(
  clean_data$staar_meets,
  clean_data$exp_stuservices,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: clean_data$staar_meets and clean_data$exp_stuservices
## t = 13.104, df = 1196, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3037893 0.4028815
## sample estimates:
## cor
## 0.3543297
# Pearson test: STAAR vs average teacher experience.
# method is spelled out; "pearson" is cor.test()'s default.
cor.test(
  clean_data$staar_meets,
  clean_data$teacher_exp,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: clean_data$staar_meets and clean_data$teacher_exp
## t = 12.228, df = 1196, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2820490 0.3827706
## sample estimates:
## cor
## 0.3333607
# Build the STAAR vs economically-disadvantaged scatterplot as an object
# (same layers as the scatterplot drawn earlier in this script) so it can be
# written to disk.
p1 <- ggplot(clean_data, aes(x = econ_disadv, y = staar_meets)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "STAAR Meets vs Economically Disadvantaged",
    x = "Economically Disadvantaged (%)",
    y = "STAAR Meets (%)"
  ) +
  theme_minimal(base_size = 12)

# Save as a 7 x 5 PNG at 300 dpi in the working directory.
ggsave(
  filename = "staar_vs_econdisadv.png",
  plot = p1,
  width = 7,
  height = 5,
  dpi = 300
)
## `geom_smooth()` using formula = 'y ~ x'