library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
# Preview the raw district workbook. The result is not assigned, so this
# call only prints the tibble.
read_excel(path = "district.xls")
## # A tibble: 1,207 × 137
##    DISTNAME         DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP
##    <chr>            <chr>    <chr>    <chr>  <chr>       <dbl>    <dbl>    <dbl>
##  1 CAYUGA ISD       001902   001 AND… 07     A               3      574      4.4
##  2 ELKHART ISD      001903   001 AND… 07     A               4     1150      4  
##  3 FRANKSTON ISD    001904   001 AND… 07     A               3      808      8.5
##  4 NECHES ISD       001906   001 AND… 07     A               2      342      8.2
##  5 PALESTINE ISD    001907   001 AND… 07     B               6     3360     25.1
##  6 WESTWOOD ISD     001908   001 AND… 07     B               4     1332     19.7
##  7 SLOCUM ISD       001909   001 AND… 07     B               2      361      0.3
##  8 ANDREWS ISD      002901   002 AND… 18     B               6     4131      0.8
##  9 PINEYWOODS COMM… 003801   003 ANG… 07     A               4      995     15.7
## 10 HUDSON ISD       003902   003 ANG… 07     A               5     2799      7.2
## # ℹ 1,197 more rows
## # ℹ 129 more variables: DPETHISP <dbl>, DPETWHIP <dbl>, DPETINDP <dbl>,
## #   DPETASIP <dbl>, DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>,
## #   DPETLEPP <dbl>, DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>,
## #   DPETGIFP <dbl>, DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>,
## #   DAGC5X20R <dbl>, DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>,
## #   DDA00A001S22R <dbl>, DDA00A001222R <dbl>, DDA00A001322R <dbl>, …
# Load the district workbook into `district` for the analysis that follows.
district <- read_excel(path = "district.xls")

# Descriptive statistics (pastecs::stat.desc) for the DA0912DR21R column.
stat.desc(district[["DA0912DR21R"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 1095.0000000  421.0000000  112.0000000   -1.0000000   50.5000000   51.5000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 1361.3000000    0.4000000    1.2431963    0.1003733    0.1969460   11.0319075 
##      std.dev     coef.var 
##    3.3214315    2.6716870

The variable is called “DA0912DR21R”, and it measures the dropout rate for the 2020–2021 school year. It reflects the share of students who did not continue high school that year.

library(dplyr)

# Keep only rows with a usable value: drop rows where DA0912DR21R is missing
# or negative, then plot the distribution of what remains.
districtclean <- filter(
  district,
  !is.na(DA0912DR21R),
  DA0912DR21R >= 0
)
hist(districtclean$DA0912DR21R)

# Replace DA0912DR21R with log1p(DA0912DR21R), i.e. log(1 + x), which stays
# defined for values of exactly zero.
# NOTE: the column is overwritten in place, so re-running this step on an
# already transformed `districtclean` would apply the transform a second time.
districtclean <- districtclean %>%
  mutate(DA0912DR21R = log1p(DA0912DR21R))

# Label the plot explicitly. hist() builds its default title and x-axis label
# from the expression passed to it, so without these arguments the plot would
# be labelled exactly like a histogram of the untransformed column.
hist(
  districtclean$DA0912DR21R,
  main = "Histogram of log1p(DA0912DR21R)",
  xlab = "log1p(DA0912DR21R)"
)