library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(readxl)
# Load the district-level workbook.
district_data <- read_excel("district.xls")

# Build the analysis table:
#   1. keep only the columns used below, under descriptive names;
#   2. coerce any measure that was read as text to numeric;
#   3. drop rows that are missing any of the five measures.
# `district_name` is deliberately excluded from step 2. It is a text
# identifier, and running `parse_number()` over it turns the names into NA
# and emits a parsing-failure warning for each row without digits.
clean_data <- district_data |>
  select(
    district_name = DISTNAME,
    staar_meets = DDA00A001222R,
    exp_instruction = DPFEAINSP,
    exp_stuservices = DPFPAREGP,
    econ_disadv = DPETECOP,
    teacher_exp = DPSTEXPA
  ) |>
  mutate(across(where(is.character) & !district_name, readr::parse_number)) |>
  drop_na(
    staar_meets,
    exp_instruction,
    exp_stuservices,
    econ_disadv,
    teacher_exp
  )
# Descriptive statistics for the `econ_disadv` column.
clean_data |>
  pull(econ_disadv) |>
  pastecs::stat.desc()
## nbr.val nbr.null nbr.na min max range
## 1.198000e+03 3.000000e+00 0.000000e+00 0.000000e+00 1.000000e+02 1.000000e+02
## sum median mean SE.mean CI.mean.0.95 var
## 7.285790e+04 6.185000e+01 6.081628e+01 6.212789e-01 1.218917e+00 4.624129e+02
## std.dev coef.var
## 2.150379e+01 3.535861e-01
# Histogram of the untransformed `econ_disadv` values.
hist(
  clean_data[["econ_disadv"]],
  main = "Distribution of Economically Disadvantaged Students",
  xlab = "Economically Disadvantaged (%)",
  col = "blue",
  border = "white"
)

# Add a log-transformed copy of `econ_disadv`; the + 1 offset keeps zero
# values finite (log(0) is -Inf).
clean_data[["log_econ_disadv"]] <- log(clean_data[["econ_disadv"]] + 1)

# Histogram of the transformed values.
hist(
  clean_data[["log_econ_disadv"]],
  main = "Log-Transformed Distribution of Economic Disadvantage",
  xlab = "Log(Economically Disadvantaged + 1)",
  col = "Red",
  border = "white"
)
