library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the district-level spreadsheet; the path is relative to the working
# directory.
district <- read_excel("district.xls")

# Keep only the district name and the two special-education columns that the
# questions below analyse.
new_district <- district |>
  select(DISTNAME, DPETSPEP, DPFPASPEP)

# View() opens a data viewer, which only makes sense in an interactive
# session; skip it when the document is rendered non-interactively.
if (interactive()) {
  View(new_district)
}
# Summary statistics for each special-education column; the NA's entry in the
# first output shows that DPFPASPEP has missing values.
new_district |> pull(DPFPASPEP) |> summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   5.800   8.900   9.711  12.500  49.000       5
new_district |> pull(DPETSPEP) |> summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.90   12.10   12.27   14.20   51.70

Question 4: The variable DPFPASPEP (money spent on special education) has missing values.

# Print the rows that remain once districts with a missing DPFPASPEP value
# are excluded (the result is only displayed here, not stored).
new_district |>
  filter(!is.na(DPFPASPEP))
## # A tibble: 1,202 × 3
##    DISTNAME                     DPETSPEP DPFPASPEP
##    <chr>                           <dbl>     <dbl>
##  1 CAYUGA ISD                       14.6      28.9
##  2 ELKHART ISD                      12.1       8.8
##  3 FRANKSTON ISD                    13.1       8.4
##  4 NECHES ISD                       10.5      10.1
##  5 PALESTINE ISD                    13.5       6.1
##  6 WESTWOOD ISD                     14.5       9.4
##  7 SLOCUM ISD                       14.7       9.9
##  8 ANDREWS ISD                      10.4      10.9
##  9 PINEYWOODS COMMUNITY ACADEMY     11.6       9.2
## 10 HUDSON ISD                       11.9      10.3
## # ℹ 1,192 more rows

Question 5: After removing NA observations, there are 1,202 observations left.

# Scatter plot of DPFPASPEP against DPETSPEP; rows with a missing value are
# dropped by geom_point(), which triggers the warning below.
new_district |>
  ggplot(aes(x = DPETSPEP, y = DPFPASPEP)) +
  geom_point()
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

Question 6: There seems to be some correlation between DPETSPEP (percent special education) and DPFPASPEP (money spent on special education).

# Store a copy of the data without the rows where DPFPASPEP is missing, for
# use in the correlation below.
new_DPFPASPEP <- drop_na(new_district, DPFPASPEP)

Side note: I have a feeling this was not how I was supposed to do it, but I couldn't figure out how else to keep the NAs out of the correlation below except by making an entirely new dataset with the NAs omitted and testing the correlation on that new dataset.

# Pearson correlation between the two special-education columns, computed on
# the NA-free copy of the data.
# NOTE(review): cor() also accepts use = "complete.obs", which would allow
# running this on new_district directly without a separate dataset.
with(new_DPFPASPEP, cor(DPFPASPEP, DPETSPEP))
## [1] 0.3700234

Question 8: Based on the correlation above (r ≈ 0.37), I would say that there seems to be a weak connection between the percent of special education students and the amount of money spent on special education. While one might think that schools would spend SPED dollars in proportion to the number of students in their SPED population, this result indicates that there may be other factors at play - perhaps state/local funding, overall school demographics, etc.