library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Load the district spreadsheet, then keep only the district name and the two
# special-education columns (percent and spending) used in the analysis below.
district <- read_excel("district.xls")
CleanDistrict <- select(district, DISTNAME, DPETSPEP, DPFPASPEP)
# Show the column types and the first few values of the trimmed table.
glimpse(CleanDistrict)
## Rows: 1,207
## Columns: 3
## $ DISTNAME  <chr> "CAYUGA ISD", "ELKHART ISD", "FRANKSTON ISD", "NECHES ISD", …
## $ DPETSPEP  <dbl> 14.6, 12.1, 13.1, 10.5, 13.5, 14.5, 14.7, 10.4, 11.6, 11.9, …
## $ DPFPASPEP <dbl> 28.9, 8.8, 8.4, 10.1, 6.1, 9.4, 9.9, 10.9, 9.2, 10.3, 10.7, …
# Per-column summary; the "NA's" row in the output reports missing values.
CleanDistrict %>%
  summary()
##    DISTNAME            DPETSPEP       DPFPASPEP     
##  Length:1207        Min.   : 0.00   Min.   : 0.000  
##  Class :character   1st Qu.: 9.90   1st Qu.: 5.800  
##  Mode  :character   Median :12.10   Median : 8.900  
##                     Mean   :12.27   Mean   : 9.711  
##                     3rd Qu.:14.20   3rd Qu.:12.500  
##                     Max.   :51.70   Max.   :49.000  
##                                     NA's   :5
# Preview the table under more readable column names. The result is printed
# only and not assigned, so CleanDistrict keeps its original column names.
CleanDistrict %>%
  rename(
    DistrictName = DISTNAME,
    DistrictPer = DPETSPEP,
    DistrictSpend = DPFPASPEP
  )
## # A tibble: 1,207 × 3
##    DistrictName                 DistrictPer DistrictSpend
##    <chr>                              <dbl>         <dbl>
##  1 CAYUGA ISD                          14.6          28.9
##  2 ELKHART ISD                         12.1           8.8
##  3 FRANKSTON ISD                       13.1           8.4
##  4 NECHES ISD                          10.5          10.1
##  5 PALESTINE ISD                       13.5           6.1
##  6 WESTWOOD ISD                        14.5           9.4
##  7 SLOCUM ISD                          14.7           9.9
##  8 ANDREWS ISD                         10.4          10.9
##  9 PINEYWOODS COMMUNITY ACADEMY        11.6           9.2
## 10 HUDSON ISD                          11.9          10.3
## # ℹ 1,197 more rows
# Drop the districts whose spending value (DPFPASPEP) is missing. The result
# is kept under its own name so the plot below works on complete rows;
# CleanDistrict itself is left unchanged.
CompleteDistrict <- CleanDistrict %>%
  drop_na(DPFPASPEP)
# Print the filtered table; its header reports how many rows remain.
CompleteDistrict
## # A tibble: 1,202 × 3
##    DISTNAME                     DPETSPEP DPFPASPEP
##    <chr>                           <dbl>     <dbl>
##  1 CAYUGA ISD                       14.6      28.9
##  2 ELKHART ISD                      12.1       8.8
##  3 FRANKSTON ISD                    13.1       8.4
##  4 NECHES ISD                       10.5      10.1
##  5 PALESTINE ISD                    13.5       6.1
##  6 WESTWOOD ISD                     14.5       9.4
##  7 SLOCUM ISD                       14.7       9.9
##  8 ANDREWS ISD                      10.4      10.9
##  9 PINEYWOODS COMMUNITY ACADEMY     11.6       9.2
## 10 HUDSON ISD                       11.9      10.3
## # ℹ 1,192 more rows
# Scatter plot of special-education spending against percent, drawn from the
# complete rows so geom_point() has no missing values to discard.
ggplot(CompleteDistrict, aes(x = DPETSPEP, y = DPFPASPEP)) +
  geom_point() +
  labs(
    title = "Special Education Spending vs Percent",
    x = "Percent Special Ed",
    y = "Spending Special Ed"
  )

# Correlation matrix of the two special-education columns, using cor()'s
# default method; use = "complete.obs" skips rows where either value is
# missing.
cor(
  select(CleanDistrict, DPETSPEP, DPFPASPEP),
  use = "complete.obs"
)
##            DPETSPEP DPFPASPEP
## DPETSPEP  1.0000000 0.3700234
## DPFPASPEP 0.3700234 1.0000000
  1. Which variable has missing values? DPFPASPEP (district special education spending), which has 5 missing values.
  2. Remove the missing observations. How many are left overall? 1,202 districts remain after dropping the 5 rows with a missing DPFPASPEP value.
  3. Create a point graph (hint: ggplot + geom_point()) to compare DPFPASPEP and DPETSPEP. Are they correlated? Somewhat
  4. Do a mathematical check (cor()) of DPFPASPEP and DPETSPEP. What is the result? The correlation is about 0.37 (r = 0.3700234).
  5. How would you interpret these results? (No real right or wrong answer – just tell me what you see). There’s a moderate positive relationship between the two. As the percentage of special education students goes up, spending usually goes up too, but it’s not a perfect match. The correlation of about 0.37 tells me there’s a connection, but it’s not super strong, which makes sense because spending is probably influenced by other factors too.