library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
# Load the teacher hiring / certification / turnover CSV into a tibble.
teacher_data <- readr::read_csv(
  file = "Teacher_Hiring_Certification_Turnover.csv"
)
## Rows: 33 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): REGION, distname, geotype_new, region_lea, Year
## dbl (20): district, schyr, intern, other_temp, oos_std, lag_starter, no_cert...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Five-number summary plus mean of the uncertified-hire column.
summary(teacher_data[["no_cert"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 5.00 25.00 44.73 48.00 395.00
# Extended descriptive statistics (counts of values/zeros/NAs, spread, CI).
pastecs::stat.desc(teacher_data[["no_cert"]])
## nbr.val nbr.null nbr.na min max range
## 33.000000 2.000000 0.000000 0.000000 395.000000 395.000000
## sum median mean SE.mean CI.mean.0.95 var
## 1476.000000 25.000000 44.727273 13.831800 28.174455 6313.517045
## std.dev coef.var
## 79.457643 1.776492
The `no_cert` variable is the count of newly hired teachers who lacked Texas certification, recorded each year from 2012 to 2023 for three Houston-area school districts. It measures how prevalent uncertified teachers are among those entering the Texas education system, and it may correlate with factors such as teacher retention and student outcomes.
# Keep only the rows where no_cert is present (drops any NA rows).
teacher_data <- dplyr::filter(teacher_data, !is.na(no_cert))
The variable has no missing values (`nbr.na` is 0 in the output above), so this filter does not remove any rows.
# Histogram of the raw no_cert values, requesting roughly 50 bins.
hist(
  teacher_data$no_cert,
  breaks = 50,
  main = "Histogram of Noncertified Teachers",
  xlab = "No Certification"
)
# Add a natural-log version of no_cert as a new column, then preview the
# first few transformed values.
td_log <- teacher_data %>%
  mutate(no_cert_log = log(no_cert))
head(td_log[["no_cert_log"]])
## [1] 3.091042 3.912023 3.637586 4.007333 4.304065 4.007333
# Summarise the log-transformed column.
summary(td_log[["no_cert_log"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -Inf 1.609 3.219 -Inf 3.871 5.979
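The data contain two zeros (`nbr.null` is 2 in the `stat.desc()` output above). Because log(0) is undefined, R returns -Inf for those observations, which is why both the minimum and the mean of the log-transformed variable come out as -Inf.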
# Add a square-root version of no_cert as a new column, then preview the
# first few transformed values.
td_sqrt <- teacher_data %>%
  mutate(no_cert_sqrt = sqrt(no_cert))
head(td_sqrt[["no_cert_sqrt"]])
## [1] 4.690416 7.071068 6.164414 7.416198 8.602325 7.416198
# Summarise the square-root-transformed column.
summary(td_sqrt[["no_cert_sqrt"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.236 5.000 5.300 6.928 19.875
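Because the square root is defined at zero, every observation stays finite after the transformation. For this variable, the square-root transformation is therefore a better choice than the log transformation.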
# Histogram of the square-root-transformed values, requesting roughly 50 bins.
hist(
  td_sqrt$no_cert_sqrt,
  breaks = 50,
  main = "Histogram of Noncertified Teachers (sqrt)",
  xlab = "No Certification"
)