library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
# Read the district workbook once and keep the result. The same tibble is
# previewed and then summarised, so the file is not parsed a second time.
district <- read_excel("district.xls")
district
## # A tibble: 1,207 × 137
## DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 CAYUGA ISD 001902 001 AND… 07 A 3 574 4.4
## 2 ELKHART ISD 001903 001 AND… 07 A 4 1150 4
## 3 FRANKSTON ISD 001904 001 AND… 07 A 3 808 8.5
## 4 NECHES ISD 001906 001 AND… 07 A 2 342 8.2
## 5 PALESTINE ISD 001907 001 AND… 07 B 6 3360 25.1
## 6 WESTWOOD ISD 001908 001 AND… 07 B 4 1332 19.7
## 7 SLOCUM ISD 001909 001 AND… 07 B 2 361 0.3
## 8 ANDREWS ISD 002901 002 AND… 18 B 6 4131 0.8
## 9 PINEYWOODS COMM… 003801 003 ANG… 07 A 4 995 15.7
## 10 HUDSON ISD 003902 003 ANG… 07 A 5 2799 7.2
## # ℹ 1,197 more rows
## # ℹ 129 more variables: DPETHISP <dbl>, DPETWHIP <dbl>, DPETINDP <dbl>,
## # DPETASIP <dbl>, DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>,
## # DPETLEPP <dbl>, DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>,
## # DPETGIFP <dbl>, DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>,
## # DAGC5X20R <dbl>, DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>,
## # DDA00A001S22R <dbl>, DDA00A001222R <dbl>, DDA00A001322R <dbl>, …
# Descriptive statistics (counts, range, centre, spread) for DA0912DR21R.
stat.desc(district$DA0912DR21R)
## nbr.val nbr.null nbr.na min max range
## 1095.0000000 421.0000000 112.0000000 -1.0000000 50.5000000 51.5000000
## sum median mean SE.mean CI.mean.0.95 var
## 1361.3000000 0.4000000 1.2431963 0.1003733 0.1969460 11.0319075
## std.dev coef.var
## 3.3214315 2.6716870
The variable is called “DA0912DR21R”, and it measures dropout rates for the years 2020 and 2021. It measures how many students did not continue high school in that period.
library(dplyr)
# Keep only districts with a usable value: drop missing entries and negative
# values. The summary statistics printed earlier report a minimum of -1, which
# cannot be a real rate -- presumably a masking/sentinel code; confirm against
# the data dictionary.
districtclean <- district %>%
  filter(!is.na(DA0912DR21R), DA0912DR21R >= 0)

# Distribution on the original scale. Explicit labels keep this plot
# distinguishable from the transformed one drawn afterwards.
hist(
  districtclean$DA0912DR21R,
  main = "Dropout rate (DA0912DR21R), original scale",
  xlab = "DA0912DR21R"
)

# log1p() = log(1 + x) is used rather than log() because the column contains
# exact zeros, where log() would give -Inf. The column is overwritten in
# place, so this step must run only once per freshly cleaned tibble.
districtclean <- districtclean %>%
  mutate(DA0912DR21R = log1p(DA0912DR21R))

# Distribution after the log1p transform.
hist(
  districtclean$DA0912DR21R,
  main = "Dropout rate (DA0912DR21R), log1p scale",
  xlab = "log1p(DA0912DR21R)"
)