library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(readxl)

# Load the district workbook, keep only the analysis variables (renamed from
# their source column codes), coerce text-typed measures to numeric, and drop
# districts that are missing any of the five measures.
district_data <- read_excel("district.xls")

clean_data <- district_data |>
  select(
    district_name = DISTNAME,
    staar_meets = DDA00A001222R,
    exp_instruction = DPFEAINSP,
    exp_stuservices = DPFPAREGP,
    econ_disadv = DPETECOP,
    teacher_exp = DPSTEXPA
  ) |>
  # Only the measure columns should be parsed as numbers. district_name is a
  # text label; running parse_number() on it replaces every name with NA and
  # raises one parsing failure per row.
  mutate(across(where(is.character) & !district_name, readr::parse_number)) |>
  # district_name is intentionally not part of the completeness check.
  drop_na(
    staar_meets,
    exp_instruction,
    exp_stuservices,
    econ_disadv,
    teacher_exp
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.character), readr::parse_number)`.
## Caused by warning:
## ! 1206 parsing failures.
## row col expected        actual
##   1  -- a number CAYUGA ISD   
##   2  -- a number ELKHART ISD  
##   3  -- a number FRANKSTON ISD
##   4  -- a number NECHES ISD   
##   5  -- a number PALESTINE ISD
## ... ... ........ .............
## See problems(...) for more details.
# Descriptive statistics ----
# Column-by-column summary of the cleaned data.
clean_data |>
  summary()
##  district_name   staar_meets    exp_instruction exp_stuservices
##  Min.   : NA    Min.   : 0.00   Min.   :18.50   Min.   : 2.00  
##  1st Qu.: NA    1st Qu.:37.00   1st Qu.:52.02   1st Qu.:35.12  
##  Median : NA    Median :46.00   Median :55.10   Median :39.70  
##  Mean   :NaN    Mean   :46.37   Mean   :54.77   Mean   :39.81  
##  3rd Qu.: NA    3rd Qu.:55.00   3rd Qu.:57.80   3rd Qu.:43.90  
##  Max.   : NA    Max.   :88.00   Max.   :84.40   Max.   :79.10  
##  NA's   :1198                                                  
##   econ_disadv      teacher_exp   
##  Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 48.00   1st Qu.:10.10  
##  Median : 61.85   Median :12.00  
##  Mean   : 60.82   Mean   :11.78  
##  3rd Qu.: 76.97   3rd Qu.:13.90  
##  Max.   :100.00   Max.   :22.90  
## 
# Extended descriptive statistics (pastecs) for the five numeric analysis
# variables.
clean_data |>
  select(
    staar_meets,
    exp_instruction,
    exp_stuservices,
    econ_disadv,
    teacher_exp
  ) |>
  pastecs::stat.desc()
##               staar_meets exp_instruction exp_stuservices  econ_disadv
## nbr.val      1.198000e+03    1.198000e+03    1.198000e+03 1.198000e+03
## nbr.null     2.000000e+00    0.000000e+00    0.000000e+00 3.000000e+00
## nbr.na       0.000000e+00    0.000000e+00    0.000000e+00 0.000000e+00
## min          0.000000e+00    1.850000e+01    2.000000e+00 0.000000e+00
## max          8.800000e+01    8.440000e+01    7.910000e+01 1.000000e+02
## range        8.800000e+01    6.590000e+01    7.710000e+01 1.000000e+02
## sum          5.555300e+04    6.561810e+04    4.769220e+04 7.285790e+04
## median       4.600000e+01    5.510000e+01    3.970000e+01 6.185000e+01
## mean         4.637145e+01    5.477304e+01    3.980985e+01 6.081628e+01
## SE.mean      3.864952e-01    1.511511e-01    2.387496e-01 6.212789e-01
## CI.mean.0.95 7.582834e-01    2.965506e-01    4.684142e-01 1.218917e+00
## var          1.789555e+02    2.737029e+01    6.828762e+01 4.624129e+02
## std.dev      1.337742e+01    5.231662e+00    8.263633e+00 2.150379e+01
## coef.var     2.884840e-01    9.551529e-02    2.075776e-01 3.535861e-01
##               teacher_exp
## nbr.val      1.198000e+03
## nbr.null     4.000000e+00
## nbr.na       0.000000e+00
## min          0.000000e+00
## max          2.290000e+01
## range        2.290000e+01
## sum          1.411090e+04
## median       1.200000e+01
## mean         1.177871e+01
## SE.mean      9.601165e-02
## CI.mean.0.95 1.883698e-01
## var          1.104345e+01
## std.dev      3.323168e+00
## coef.var     2.821334e-01
# Correlation matrix ----
# Pearson correlations among the five numeric analysis variables, computed on
# complete observations and printed to three decimals.
cor_mat <- clean_data |>
  select(
    staar_meets,
    exp_instruction,
    exp_stuservices,
    econ_disadv,
    teacher_exp
  ) |>
  cor(use = "complete.obs", method = "pearson")
round(cor_mat, digits = 3)
##                 staar_meets exp_instruction exp_stuservices econ_disadv
## staar_meets           1.000           0.215           0.354      -0.696
## exp_instruction       0.215           1.000           0.484      -0.192
## exp_stuservices       0.354           0.484           1.000      -0.476
## econ_disadv          -0.696          -0.192          -0.476       1.000
## teacher_exp           0.333           0.130          -0.025      -0.233
##                 teacher_exp
## staar_meets           0.333
## exp_instruction       0.130
## exp_stuservices      -0.025
## econ_disadv          -0.233
## teacher_exp           1.000
# Scatterplot matrix of the five numeric analysis variables. Only the lower
# triangle is drawn, each panel with a smoothed trend line; points are
# semi-transparent black.
clean_data |>
  select(
    staar_meets,
    exp_instruction,
    exp_stuservices,
    econ_disadv,
    teacher_exp
  ) |>
  pairs(
    main = "Pairs: STAAR, Spending, Poverty, Experience",
    pch = 19,
    col = rgb(0, 0, 0, alpha = 0.35),
    lower.panel = panel.smooth,
    upper.panel = NULL
  )

# Correlation test: STAAR vs economic disadvantage ----
# Pearson product-moment test.
cor.test(
  x = clean_data$staar_meets,
  y = clean_data$econ_disadv,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  clean_data$staar_meets and clean_data$econ_disadv
## t = -33.561, df = 1196, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7244803 -0.6660534
## sample estimates:
##        cor 
## -0.6964191
# Rank-based (Spearman) version of the same test; exact = FALSE requests the
# approximate p-value rather than the exact one.
cor.test(
  x = clean_data$staar_meets,
  y = clean_data$econ_disadv,
  method = "spearman",
  exact = FALSE
)
## 
##  Spearman's rank correlation rho
## 
## data:  clean_data$staar_meets and clean_data$econ_disadv
## S = 483156301, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6860434
# Scatterplot (ggplot2) ----
# STAAR Meets against percent economically disadvantaged, with a fitted
# linear trend line and no confidence band.
ggplot(data = clean_data, mapping = aes(x = econ_disadv, y = staar_meets)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "STAAR Meets vs Economically Disadvantaged",
    x = "Economically Disadvantaged (%)",
    y = "STAAR Meets (%)"
  ) +
  theme_minimal(base_size = 12)
## `geom_smooth()` using formula = 'y ~ x'

# Add a log-transformed copy of econ_disadv (the + 1 keeps log() defined when
# the value is 0) and test its Pearson correlation with staar_meets.
clean_data <- mutate(clean_data, log_econ = log(econ_disadv + 1))
cor.test(
  x = clean_data$staar_meets,
  y = clean_data$log_econ,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  clean_data$staar_meets and clean_data$log_econ
## t = -27.832, df = 1196, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6601605 -0.5913262
## sample estimates:
##        cor 
## -0.6269655
# Pearson correlation test: staar_meets vs exp_instruction.
cor.test(
  x = clean_data$staar_meets,
  y = clean_data$exp_instruction,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  clean_data$staar_meets and clean_data$exp_instruction
## t = 7.6143, df = 1196, p-value = 5.362e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1603386 0.2683911
## sample estimates:
##       cor 
## 0.2150228
# Pearson correlation test: staar_meets vs exp_stuservices.
cor.test(
  x = clean_data$staar_meets,
  y = clean_data$exp_stuservices,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  clean_data$staar_meets and clean_data$exp_stuservices
## t = 13.104, df = 1196, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3037893 0.4028815
## sample estimates:
##       cor 
## 0.3543297
# Pearson correlation test: staar_meets vs teacher_exp.
cor.test(
  x = clean_data$staar_meets,
  y = clean_data$teacher_exp,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  clean_data$staar_meets and clean_data$teacher_exp
## t = 12.228, df = 1196, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2820490 0.3827706
## sample estimates:
##       cor 
## 0.3333607
# Build the STAAR-vs-economic-disadvantage scatterplot as an object and save
# it as a PNG (width 7, height 5, 300 dpi).
p1 <- ggplot(data = clean_data, mapping = aes(x = econ_disadv, y = staar_meets)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "STAAR Meets vs Economically Disadvantaged",
    x = "Economically Disadvantaged (%)",
    y = "STAAR Meets (%)"
  ) +
  theme_minimal(base_size = 12)
ggsave(
  filename = "staar_vs_econdisadv.png",
  plot = p1,
  width = 7,
  height = 5,
  dpi = 300
)
## `geom_smooth()` using formula = 'y ~ x'