library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the district-level spreadsheet into a tibble.
district <- read_excel("district.xls")
# Preview the district name next to the DPETSPEP and DPFPASPEP columns.
# tibble() replaces data_frame(), which was deprecated in tibble 1.1.0 and
# emitted a lifecycle warning; the auto-generated column names are unchanged.
tibble(district$DISTNAME, district$DPETSPEP, district$DPFPASPEP)
## # A tibble: 1,207 × 3
##    `district$DISTNAME`          `district$DPETSPEP` `district$DPFPASPEP`
##    <chr>                                      <dbl>                <dbl>
##  1 CAYUGA ISD                                  14.6                 28.9
##  2 ELKHART ISD                                 12.1                  8.8
##  3 FRANKSTON ISD                               13.1                  8.4
##  4 NECHES ISD                                  10.5                 10.1
##  5 PALESTINE ISD                               13.5                  6.1
##  6 WESTWOOD ISD                                14.5                  9.4
##  7 SLOCUM ISD                                  14.7                  9.9
##  8 ANDREWS ISD                                 10.4                 10.9
##  9 PINEYWOODS COMMUNITY ACADEMY                11.6                  9.2
## 10 HUDSON ISD                                  11.9                 10.3
## # ℹ 1,197 more rows
# Five-number summary plus mean for the DPETSPEP column.
district %>%
  pull(DPETSPEP) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.90   12.10   12.27   14.20   51.70
# Same summary for DPFPASPEP; summary() also reports the NA count if any.
district %>%
  pull(DPFPASPEP) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   5.800   8.900   9.711  12.500  49.000       5

# DPFPASPEP has 5 missing values (NA's); DPETSPEP has none.

# Extract both columns as plain vectors.
DPET <- district[["DPETSPEP"]]
DPAP <- district[["DPFPASPEP"]]
# Keep only the observed (non-NA) DPFPASPEP values.
CleanDPAP <- DPAP[which(!is.na(DPAP))]

# 1,202 non-missing DPFPASPEP values are left (1,207 rows minus 5 NA's).

# Scatter plot of DPETSPEP (y) against DPFPASPEP (x), one point per row.
district %>%
  ggplot(mapping = aes(x = DPFPASPEP, y = DPETSPEP)) +
  geom_point()
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Correlation on the raw columns. With the default use = "everything",
# cor() returns NA whenever either vector contains a missing value.
with(district, cor(DPFPASPEP, DPETSPEP))
## [1] NA

# DPFPASPEP contains NA's, so they must be removed before cor() can return a value; done below.

# Drop the rows where DPFPASPEP is missing, then correlate the two columns
# on the remaining rows.
new_data <- drop_na(district, DPFPASPEP)
cor(new_data[["DPFPASPEP"]], new_data[["DPETSPEP"]])
## [1] 0.3700234

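# There is a moderate positive correlation (r ≈ 0.37, so r² ≈ 0.14). No significance test was run above; with n = 1,202 a correlation of this size is statistically significant (t ≈ 13.8), which cor.test() would confirm.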