library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
teacher_data <- read_csv("Teacher_Hiring_Certification_Turnover.csv")
## Rows: 33 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): REGION, distname, geotype_new, region_lea, Year
## dbl (20): district, schyr, intern, other_temp, oos_std, lag_starter, no_cert...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(teacher_data)
## # A tibble: 6 × 25
## district schyr REGION intern other_temp oos_std lag_starter no_cert reenterer
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 101902 2013 04 145 71 11 60 22 165
## 2 101902 2014 04 201 102 8 36 50 215
## 3 101902 2015 04 267 120 16 21 38 162
## 4 101902 2016 04 306 105 14 27 55 159
## 5 101902 2017 04 371 106 15 17 74 179
## 6 101902 2018 04 245 70 8 9 55 117
## # ℹ 16 more variables: emer <dbl>, std_all <dbl>, distname <chr>,
## # geotype_new <chr>, total_new_hires <dbl>, region_lea <chr>, Year <chr>,
## # total_teachers <dbl>, turnover_rate_teachers <dbl>, beg_year <dbl>,
## # `1-5_years` <dbl>, `6-10_years` <dbl>, `11-20_years` <dbl>,
## # over20_years <dbl>, `st-per-tch` <dbl>, num_st_mem <dbl>
turnover_model <- lm(turnover_rate_teachers~intern + other_temp + oos_std + lag_starter + no_cert + reenterer + emer + std_all + beg_year + `1-5_years` + `6-10_years` + `11-20_years` + `over20_years` + `st-per-tch`, data=teacher_data)
plot(turnover_model,which=1)
raintest(turnover_model)
##
## Rainbow test
##
## data: turnover_model
## Rain = 0.93786, df1 = 17, df2 = 1, p-value = 0.6837
durbinWatsonTest(turnover_model)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.004579254 1.77478 0.074
## Alternative hypothesis: rho != 0
plot(turnover_model,which=3)
bptest(turnover_model)
##
## studentized Breusch-Pagan test
##
## data: turnover_model
## BP = 12.186, df = 14, p-value = 0.5914
plot(turnover_model,which=2)
shapiro.test(turnover_model$residuals)
##
## Shapiro-Wilk normality test
##
## data: turnover_model$residuals
## W = 0.87576, p-value = 0.001327
vif(turnover_model)
## intern other_temp oos_std lag_starter no_cert
## 19.250431 43.882459 5.595996 3.893520 11.215370
## reenterer emer std_all beg_year `1-5_years`
## 8.923769 2.915729 80.274319 16.047797 33.021766
## `6-10_years` `11-20_years` over20_years `st-per-tch`
## 46.687250 40.442599 53.253683 2.385037
kitchen_sink_vars <- teacher_data %>% dplyr::select(intern, other_temp, oos_std,lag_starter, no_cert, reenterer, emer, std_all, beg_year,`1-5_years`, `6-10_years`, `11-20_years`, `over20_years`, `st-per-tch`)
cor(kitchen_sink_vars)
## intern other_temp oos_std lag_starter no_cert
## intern 1.00000000 0.80645667 0.6396149 0.30425355 0.09815067
## other_temp 0.80645667 1.00000000 0.6229393 0.51104315 0.23115484
## oos_std 0.63961490 0.62293932 1.0000000 0.38350871 0.34262975
## lag_starter 0.30425355 0.51104315 0.3835087 1.00000000 0.05730342
## no_cert 0.09815067 0.23115484 0.3426298 0.05730342 1.00000000
## reenterer 0.57768791 0.59818987 0.5605843 0.62990296 0.44184310
## emer 0.40351531 0.33333515 0.2029714 0.11237210 0.38136220
## std_all 0.78264794 0.95906769 0.6346246 0.63699505 0.14676882
## beg_year 0.76913394 0.78390655 0.7234533 0.37562180 0.60511265
## 1-5_years 0.88361596 0.80738743 0.5215350 0.36948541 0.21723501
## 6-10_years 0.83826020 0.90303087 0.5938319 0.59706602 0.25935390
## 11-20_years 0.83964120 0.76215101 0.5359403 0.37266141 0.27829356
## over20_years 0.79722535 0.82601821 0.4635441 0.34273789 0.32579863
## st-per-tch -0.30938909 -0.02821343 -0.4343193 0.19820649 0.01245402
## reenterer emer std_all beg_year 1-5_years
## intern 0.57768791 0.40351531 0.782647938 0.7691339 0.8836160
## other_temp 0.59818987 0.33333515 0.959067693 0.7839065 0.8073874
## oos_std 0.56058428 0.20297142 0.634624632 0.7234533 0.5215350
## lag_starter 0.62990296 0.11237210 0.636995047 0.3756218 0.3694854
## no_cert 0.44184310 0.38136220 0.146768819 0.6051126 0.2172350
## reenterer 1.00000000 0.43759712 0.690464009 0.7226507 0.6833765
## emer 0.43759712 1.00000000 0.318472049 0.4508522 0.5075710
## std_all 0.69046401 0.31847205 1.000000000 0.7555316 0.8234062
## beg_year 0.72265065 0.45085217 0.755531641 1.0000000 0.7730621
## 1-5_years 0.68337648 0.50757103 0.823406241 0.7730621 1.0000000
## 6-10_years 0.77191271 0.48013784 0.940187810 0.8094086 0.9126570
## 11-20_years 0.72456384 0.60224765 0.788301599 0.7967198 0.9577039
## over20_years 0.60808758 0.62674134 0.802438004 0.7656409 0.9396855
## st-per-tch -0.07730141 0.03802326 0.001465585 -0.1650754 -0.1689958
## 6-10_years 11-20_years over20_years st-per-tch
## intern 0.83826020 0.8396412 0.79722535 -0.309389092
## other_temp 0.90303087 0.7621510 0.82601821 -0.028213430
## oos_std 0.59383190 0.5359403 0.46354405 -0.434319277
## lag_starter 0.59706602 0.3726614 0.34273789 0.198206493
## no_cert 0.25935390 0.2782936 0.32579863 0.012454017
## reenterer 0.77191271 0.7245638 0.60808758 -0.077301407
## emer 0.48013784 0.6022477 0.62674134 0.038023257
## std_all 0.94018781 0.7883016 0.80243800 0.001465585
## beg_year 0.80940860 0.7967198 0.76564091 -0.165075395
## 1-5_years 0.91265703 0.9577039 0.93968552 -0.168995808
## 6-10_years 1.00000000 0.9086697 0.90256531 -0.059100021
## 11-20_years 0.90866972 1.0000000 0.94340000 -0.151934361
## over20_years 0.90256531 0.9434000 1.00000000 -0.075875846
## st-per-tch -0.05910002 -0.1519344 -0.07587585 1.000000000
Linearity: the residuals vs. fitted plot and high p-value (0.6837) from the rainbow test suggest that the linearity assumption is met
Independence of Errors: the p-value of the Durbin-Watson test is 0.084, suggesting that the errors are independent
Homoscedasticity: the Breusch-Pagan test p-value = 0.5914, suggesting that the homoscedasticity assumption is likely satisfied. the scale-location plot trends up slightly
Normality of Residuals: there is slight deviation in the QQ plot. the p-value (0.001327) from the Shapiro-Wilk test is below 0.05, suggesting that the residuals are significantly different from a normal distribution
No Multicollinearity: the VIF values are above 5 for most of the variables, and the correlation matrix shows high correlations among predictors like other_temp and std_all, and between various experience-year variables (1-5_years, 6-10_years, 11-20_years, and over20_years), suggesting “no multicollinearity” is violated.
The model violates the assumptions of Normality of Residuals and No Multicollinearity
log transformation to mitigate normality of residuals violation did not work, p-value = 0.001527
turnover_model_log <- lm(log(turnover_rate_teachers) ~ intern + other_temp + oos_std + lag_starter +no_cert + reenterer + emer + std_all + beg_year + `1-5_years` + `6-10_years` + `11-20_years` + `over20_years` + `st-per-tch`, data = teacher_data)
plot(turnover_model_log, which = 2)
shapiro.test(residuals(turnover_model_log))
##
## Shapiro-Wilk normality test
##
## data: residuals(turnover_model_log)
## W = 0.87831, p-value = 0.001527
square root transformation to mitigate normality of residuals violation did not work, p-value = 0.001238
turnover_model_sqrt <- lm(sqrt(turnover_rate_teachers) ~ intern + other_temp + oos_std + lag_starter + no_cert + reenterer + emer + std_all + beg_year + `1-5_years` + `6-10_years` + `11-20_years` + `over20_years` + `st-per-tch`, data = teacher_data)
plot(turnover_model_sqrt, which = 2)
shapiro.test(residuals(turnover_model_sqrt))
##
## Shapiro-Wilk normality test
##
## data: residuals(turnover_model_sqrt)
## W = 0.8745, p-value = 0.001238
to mitigate multicollinearity, I removed the years of experience variables and other_temp (One Year, Out of State Certified)
turnover_model_reduced <- lm(turnover_rate_teachers ~ intern + oos_std + lag_starter + no_cert +reenterer + emer + std_all + `st-per-tch`, data = teacher_data)
vif(turnover_model_reduced)
## intern oos_std lag_starter no_cert reenterer emer
## 4.607931 3.155704 2.935829 1.937917 3.621410 1.564944
## std_all `st-per-tch`
## 5.819113 2.023929