library(readxl)
# Load the district-level data from the Excel workbook.
district <- read_excel("district.xls")

# Describe the loaded data frame itself. Passing the file-name string
# "district.xls" would hand stat.desc() a single character value, and every
# statistic would come back NA. Non-numeric columns still report NA here.
pastecs::stat.desc(district)
library(tidyr)
library(dplyr)
# Complete-case subset: drop_na() with no columns named removes every row
# that has a missing value in any column.
# NOTE(review): this is listwise deletion across all columns, not just the
# variables analysed below -- confirm that is intended.
clean_district_data <- drop_na(district)
print(clean_district_data)
## # A tibble: 323 × 137
##    DISTNAME         DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP
##    <chr>            <chr>    <chr>    <chr>  <chr>       <dbl>    <dbl>    <dbl>
##  1 PALESTINE ISD    001907   001 AND… 07     B               6     3360     25.1
##  2 HUDSON ISD       003902   003 ANG… 07     A               5     2799      7.2
##  3 LUFKIN ISD       003903   003 ANG… 07     B              17     7318     28.7
##  4 HUNTINGTON ISD   003904   003 ANG… 07     B               5     1612      2.4
##  5 ARANSAS COUNTY … 004901   004 ARA… 02     B               4     3005      1.3
##  6 PLEASANTON ISD   007905   007 ATA… 20     B               6     3374      0.7
##  7 BANDERA ISD      010902   010 BAN… 20     Not Rat…        4     2301      0.2
##  8 BASTROP ISD      011901   011 BAS… 13     C              14    11947      3  
##  9 ELGIN ISD        011902   011 BAS… 13     C               8     4985      9  
## 10 SMITHVILLE ISD   011904   011 BAS… 13     B               4     1780      6.9
## # ℹ 313 more rows
## # ℹ 129 more variables: DPETHISP <dbl>, DPETWHIP <dbl>, DPETINDP <dbl>,
## #   DPETASIP <dbl>, DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>,
## #   DPETLEPP <dbl>, DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>,
## #   DPETGIFP <dbl>, DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>,
## #   DAGC5X20R <dbl>, DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>,
## #   DDA00A001S22R <dbl>, DDA00A001222R <dbl>, DDA00A001322R <dbl>, …
# Descriptive statistics for DPSTBLFP in the complete-case data.
pastecs::stat.desc(clean_district_data[["DPSTBLFP"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
##  323.0000000   20.0000000    0.0000000    0.0000000   80.2000000   80.2000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 2580.4000000    3.9000000    7.9888545    0.6178283    1.2154898  123.2929189 
##      std.dev     coef.var 
##   11.1037345    1.3899032

# This variable measures the percentage of African American teachers within a
# given district. I will use it as my independent (explanatory) variable to
# see whether it has a positive effect on African American students' STAAR
# test scores.

# Distribution of the raw percentage. Title and axis label are set explicitly
# because hist() would otherwise label the plot with the R expression passed
# to it.
hist(
  clean_district_data$DPSTBLFP,
  main = "Percent African American teachers by district",
  xlab = "African American teachers (%)"
)

# Keep the raw variable next to its square root so the two can be compared.
# NOTE(review): the transform is presumably meant to reduce right skew --
# confirm.
clean_district_data_sqrt <- clean_district_data |>
  mutate(DPSTBLFP_SQRT = sqrt(DPSTBLFP)) |>
  select(DPSTBLFP, DPSTBLFP_SQRT)

head(clean_district_data_sqrt)
## # A tibble: 6 × 2
##   DPSTBLFP DPSTBLFP_SQRT
##      <dbl>         <dbl>
## 1      9.6         3.10 
## 2      0.5         0.707
## 3     12.1         3.48 
## 4      0           0    
## 5      1.4         1.18 
## 6      1.2         1.10
# Distribution after the square-root transform. Labelled explicitly so the
# plot does not fall back to the R expression as its title and axis label.
hist(
  clean_district_data_sqrt$DPSTBLFP_SQRT,
  main = "Square root of percent African American teachers",
  xlab = "sqrt(African American teachers (%))"
)