library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(readxl)

# Load the district-level workbook. The path is relative to the current
# working directory.
district_data <- read_excel("district.xls")

# Build the analysis table: keep the six columns used below and give the
# source column codes descriptive names.
clean_data <- district_data |>
  select(
    district_name = DISTNAME,
    staar_meets = DDA00A001222R,
    exp_instruction = DPFEAINSP,
    exp_stuservices = DPFPAREGP,
    econ_disadv = DPETECOP,
    teacher_exp = DPSTEXPA
  ) |>
  # Coerce any measure that was read as text to numeric, but leave
  # district_name alone: parse_number() cannot parse the names, so applying
  # it to that column replaces every name with NA and raises one parsing
  # failure per row.
  mutate(
    across(where(is.character) & !district_name, readr::parse_number)
  ) |>
  # Keep only rows where every analysis measure is present.
  drop_na(
    staar_meets, exp_instruction, exp_stuservices, econ_disadv, teacher_exp
  )
# Print per-column summary statistics for the cleaned table.
# NOTE: the transcript below was captured from a run in which district_name
# had been passed through parse_number(), which is why that column shows
# only NA values.
clean_data |>
  summary()
##  district_name   staar_meets    exp_instruction exp_stuservices
##  Min.   : NA    Min.   : 0.00   Min.   :18.50   Min.   : 2.00  
##  1st Qu.: NA    1st Qu.:37.00   1st Qu.:52.02   1st Qu.:35.12  
##  Median : NA    Median :46.00   Median :55.10   Median :39.70  
##  Mean   :NaN    Mean   :46.37   Mean   :54.77   Mean   :39.81  
##  3rd Qu.: NA    3rd Qu.:55.00   3rd Qu.:57.80   3rd Qu.:43.90  
##  Max.   : NA    Max.   :88.00   Max.   :84.40   Max.   :79.10  
##  NA's   :1198                                                  
##   econ_disadv      teacher_exp   
##  Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 48.00   1st Qu.:10.10  
##  Median : 61.85   Median :12.00  
##  Mean   : 60.82   Mean   :11.78  
##  3rd Qu.: 76.97   3rd Qu.:13.90  
##  Max.   :100.00   Max.   :22.90  
## 
# Scatter plot of exp_instruction (x) against staar_meets (y), drawn with
# solid red points.
with(
  clean_data,
  plot(
    exp_instruction,
    staar_meets,
    main = "Instructional Spending vs STAAR Meets",
    xlab = "Instructional Expenditure (%)",
    ylab = "STAAR Meets (%)",
    pch = 19,
    col = "red"
  )
)

# Scatter plot of econ_disadv (x) against staar_meets (y), drawn with
# solid blue points.
with(
  clean_data,
  plot(
    econ_disadv,
    staar_meets,
    main = "Economic Disadvantage vs STAAR Meets",
    xlab = "Economically Disadvantaged (%)",
    ylab = "STAAR Meets (%)",
    pch = 19,
    col = "blue"
  )
)

# Correlation matrix of the five numeric measures, computed on rows that
# are complete across all five (use = "complete.obs").
clean_data |>
  select(
    staar_meets,
    exp_instruction,
    exp_stuservices,
    econ_disadv,
    teacher_exp
  ) |>
  cor(use = "complete.obs")
##                 staar_meets exp_instruction exp_stuservices econ_disadv
## staar_meets       1.0000000       0.2150228      0.35432970  -0.6964191
## exp_instruction   0.2150228       1.0000000      0.48358599  -0.1924036
## exp_stuservices   0.3543297       0.4835860      1.00000000  -0.4761955
## econ_disadv      -0.6964191      -0.1924036     -0.47619545   1.0000000
## teacher_exp       0.3333607       0.1297148     -0.02474583  -0.2327761
##                 teacher_exp
## staar_meets      0.33336067
## exp_instruction  0.12971478
## exp_stuservices -0.02474583
## econ_disadv     -0.23277614
## teacher_exp      1.00000000
# Histograms of each analysis measure, one plot per column of clean_data.
# Each call sets its own title, x-axis label and fill colour; bar borders
# are white throughout.

# econ_disadv
clean_data |>
  pull(econ_disadv) |>
  hist(
    main = "Distribution of Economic Disadvantage (%)",
    xlab = "Economically Disadvantaged Students (%)",
    col = "Red",
    border = "white"
  )

# exp_instruction
clean_data |>
  pull(exp_instruction) |>
  hist(
    main = "Distribution of Instructional Expenditure (%)",
    xlab = "Instructional Expenditure (%)",
    col = "green",
    border = "white"
  )

# staar_meets
clean_data |>
  pull(staar_meets) |>
  hist(
    main = "Distribution of STAAR Meets (%)",
    xlab = "STAAR Meets (%)",
    col = "blue",
    border = "white"
  )

# exp_stuservices
clean_data |>
  pull(exp_stuservices) |>
  hist(
    main = "Distribution of Student Services Expenditure (%)",
    xlab = "Student Services Expenditure (%)",
    col = "pink",
    border = "white"
  )

# teacher_exp
clean_data |>
  pull(teacher_exp) |>
  hist(
    main = "Distribution of Teacher Experience (Years)",
    xlab = "Average Years of Experience",
    col = "gray",
    border = "white"
  )