library(car)        
## Loading required package: carData
library(lmtest)     
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Read the district-level dataset from "r4data.csv" (path is relative to the
# working directory); readr guesses the column types, as reported below.
district_data <- read_csv("r4data.csv")
## Rows: 450 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Year, distname, geotype_new
## dbl (20): total teachers, total_new_hires, new_hires_per, std_all, std_all_p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Outcome variable, kept as a one-column tibble.
dep_var <- district_data["turnover_rate"]

# The nine percentage predictors used by the regression models.
indie_vars <- district_data[
  c(
    "std_all_per",
    "intern_per",
    "other_temp_per",
    "oos_std_per",
    "lag_starter_per",
    "no_cert_per",
    "reenterer_per",
    "emer_per",
    "new_hires_per"
  )
]

summary(dep_var)
##  turnover_rate   
##  Min.   : 6.337  
##  1st Qu.:13.100  
##  Median :15.800  
##  Mean   :17.705  
##  3rd Qu.:20.375  
##  Max.   :51.800
# Quartiles, mean and range for every predictor column.
summary(indie_vars)
##   std_all_per      intern_per     other_temp_per     oos_std_per    
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:13.04   1st Qu.: 22.92   1st Qu.: 0.6475   1st Qu.: 0.000  
##  Median :17.55   Median : 30.59   Median : 4.0250   Median : 1.800  
##  Mean   :17.47   Mean   : 31.46   Mean   : 4.3983   Mean   : 2.293  
##  3rd Qu.:22.53   3rd Qu.: 39.13   3rd Qu.: 6.5125   3rd Qu.: 3.225  
##  Max.   :51.16   Max.   :100.00   Max.   :25.0000   Max.   :25.000  
##  lag_starter_per   no_cert_per     reenterer_per      emer_per      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.0000  
##  1st Qu.: 2.365   1st Qu.: 1.417   1st Qu.:25.07   1st Qu.: 0.0000  
##  Median : 4.225   Median : 3.585   Median :31.79   Median : 0.0000  
##  Mean   : 5.516   Mean   : 5.351   Mean   :32.72   Mean   : 0.8016  
##  3rd Qu.: 7.447   3rd Qu.: 7.122   3rd Qu.:38.54   3rd Qu.: 0.3900  
##  Max.   :50.000   Max.   :66.670   Max.   :80.00   Max.   :42.8600  
##  new_hires_per   
##  Min.   : 2.480  
##  1st Qu.: 8.662  
##  Median :10.910  
##  Mean   :12.232  
##  3rd Qu.:14.168  
##  Max.   :47.240
# Outcome followed by the nine predictors, gathered in one tibble so that the
# descriptive statistics can be produced in a single call.
all_vars <- district_data[
  c(
    "turnover_rate",
    "std_all_per",
    "intern_per",
    "other_temp_per",
    "oos_std_per",
    "lag_starter_per",
    "no_cert_per",
    "reenterer_per",
    "emer_per",
    "new_hires_per"
  )
]

summary(all_vars)
##  turnover_rate     std_all_per      intern_per     other_temp_per   
##  Min.   : 6.337   Min.   : 0.00   Min.   :  0.00   Min.   : 0.0000  
##  1st Qu.:13.100   1st Qu.:13.04   1st Qu.: 22.92   1st Qu.: 0.6475  
##  Median :15.800   Median :17.55   Median : 30.59   Median : 4.0250  
##  Mean   :17.705   Mean   :17.47   Mean   : 31.46   Mean   : 4.3983  
##  3rd Qu.:20.375   3rd Qu.:22.53   3rd Qu.: 39.13   3rd Qu.: 6.5125  
##  Max.   :51.800   Max.   :51.16   Max.   :100.00   Max.   :25.0000  
##   oos_std_per     lag_starter_per   no_cert_per     reenterer_per  
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.: 2.365   1st Qu.: 1.417   1st Qu.:25.07  
##  Median : 1.800   Median : 4.225   Median : 3.585   Median :31.79  
##  Mean   : 2.293   Mean   : 5.516   Mean   : 5.351   Mean   :32.72  
##  3rd Qu.: 3.225   3rd Qu.: 7.447   3rd Qu.: 7.122   3rd Qu.:38.54  
##  Max.   :25.000   Max.   :50.000   Max.   :66.670   Max.   :80.00  
##     emer_per       new_hires_per   
##  Min.   : 0.0000   Min.   : 2.480  
##  1st Qu.: 0.0000   1st Qu.: 8.662  
##  Median : 0.0000   Median :10.910  
##  Mean   : 0.8016   Mean   :12.232  
##  3rd Qu.: 0.3900   3rd Qu.:14.168  
##  Max.   :42.8600   Max.   :47.240
# Extended descriptive statistics per column (counts of values/zeros/NAs,
# range, sum, median, mean with SE and 95% CI half-width, variance, SD, CV).
pastecs::stat.desc(all_vars)
##              turnover_rate  std_all_per   intern_per other_temp_per
## nbr.val        450.0000000  450.0000000 4.500000e+02    450.0000000
## nbr.null         0.0000000   26.0000000 9.000000e+00    112.0000000
## nbr.na           0.0000000    0.0000000 0.000000e+00      0.0000000
## min              6.3370000    0.0000000 0.000000e+00      0.0000000
## max             51.8000000   51.1600000 1.000000e+02     25.0000000
## range           45.4630000   51.1600000 1.000000e+02     25.0000000
## sum           7967.2280000 7860.7300000 1.415512e+04   1979.2200000
## median          15.8000000   17.5450000 3.059000e+01      4.0250000
## mean            17.7049511   17.4682889 3.145582e+01      4.3982667
## SE.mean          0.3262882    0.3967071 5.966893e-01      0.1846020
## CI.mean.0.95     0.6412415    0.7796332 1.172651e+00      0.3627911
## var             47.9087826   70.8194414 1.602172e+02     15.3350460
## std.dev          6.9216171    8.4154288 1.265769e+01      3.9159987
## coef.var         0.3909425    0.4817546 4.023959e-01      0.8903505
##               oos_std_per lag_starter_per  no_cert_per reenterer_per
## nbr.val       450.0000000     450.0000000  450.0000000  4.500000e+02
## nbr.null      142.0000000      65.0000000   89.0000000  3.000000e+00
## nbr.na          0.0000000       0.0000000    0.0000000  0.000000e+00
## min             0.0000000       0.0000000    0.0000000  0.000000e+00
## max            25.0000000      50.0000000   66.6700000  8.000000e+01
## range          25.0000000      50.0000000   66.6700000  8.000000e+01
## sum          1031.7900000    2482.0400000 2407.8300000  1.472253e+04
## median          1.8000000       4.2250000    3.5850000  3.179000e+01
## mean            2.2928667       5.5156444    5.3507333  3.271673e+01
## SE.mean         0.1282076       0.2575389    0.3079684  5.411812e-01
## CI.mean.0.95    0.2519614       0.5061312    0.6052384  1.063563e+00
## var             7.3967314      29.8468237   42.6800429  1.317947e+02
## std.dev         2.7196933       5.4632247    6.5329965  1.148019e+01
## coef.var        1.1861541       0.9904962    1.2209535  3.508965e-01
##                 emer_per new_hires_per
## nbr.val      450.0000000   450.0000000
## nbr.null     299.0000000     0.0000000
## nbr.na         0.0000000     0.0000000
## min            0.0000000     2.4800000
## max           42.8600000    47.2400000
## range         42.8600000    44.7600000
## sum          360.7300000  5504.2700000
## median         0.0000000    10.9100000
## mean           0.8016222    12.2317111
## SE.mean        0.1467364     0.2758708
## CI.mean.0.95   0.2883754     0.5421583
## var            9.6892052    34.2471162
## std.dev        3.1127488     5.8521036
## coef.var       3.8830620     0.4784370
# Main-effects OLS model: turnover_rate regressed on the nine `*_per`
# predictors.
# NOTE(review): the VIFs recorded for this model are on the order of 1e5-1e6
# for every predictor except new_hires_per, and those eight coefficients are
# nearly equal with very large standard errors. That pattern suggests the
# eight shares are (almost) linearly dependent -- presumably they sum to
# ~100 per district; TODO confirm. If so, drop one share as the reference
# category before interpreting individual coefficients.
lm_model <- lm(
  turnover_rate ~ std_all_per + intern_per + other_temp_per + oos_std_per +
    lag_starter_per + no_cert_per + reenterer_per + emer_per + new_hires_per,
  data = district_data
)

summary(lm_model)
## 
## Call:
## lm(formula = turnover_rate ~ std_all_per + intern_per + other_temp_per + 
##     oos_std_per + lag_starter_per + no_cert_per + reenterer_per + 
##     emer_per + new_hires_per, data = district_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.973  -2.128  -0.242   1.811  12.227 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -274.36765 1918.33604  -0.143    0.886    
## std_all_per        2.71881   19.18370   0.142    0.887    
## intern_per         2.83087   19.18392   0.148    0.883    
## other_temp_per     2.69553   19.18251   0.141    0.888    
## oos_std_per        2.75144   19.18213   0.143    0.886    
## lag_starter_per    2.89853   19.18442   0.151    0.880    
## no_cert_per        2.83716   19.18388   0.148    0.882    
## reenterer_per      2.81565   19.18345   0.147    0.883    
## emer_per           2.99491   19.18349   0.156    0.876    
## new_hires_per      0.95491    0.02972  32.129   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.31 on 440 degrees of freedom
## Multiple R-squared:  0.7758, Adjusted R-squared:  0.7712 
## F-statistic: 169.2 on 9 and 440 DF,  p-value: < 2.2e-16
# Variance inflation factors for the main-effects model (car::vif).
vif(lm_model)
##     std_all_per      intern_per  other_temp_per     oos_std_per lag_starter_per 
##    1.067775e+06    2.415721e+06    2.311846e+05    1.115055e+05    4.500476e+05 
##     no_cert_per   reenterer_per        emer_per   new_hires_per 
##    6.435180e+05    1.987075e+06    1.460853e+05    1.239437e+00
# OLS model with interactions: each of the other eight predictors is crossed
# with new_hires_per. In a formula, `a * b` expands to a + b + a:b, so all
# main effects are retained alongside the eight interaction terms.
lm_inter_model <- lm(
  turnover_rate ~ new_hires_per * std_all_per +
    new_hires_per * intern_per +
    new_hires_per * other_temp_per +
    new_hires_per * oos_std_per +
    new_hires_per * lag_starter_per +
    new_hires_per * no_cert_per +
    new_hires_per * reenterer_per +
    new_hires_per * emer_per,
  data = district_data
)

summary(lm_inter_model)
## 
## Call:
## lm(formula = turnover_rate ~ new_hires_per * std_all_per + new_hires_per * 
##     intern_per + new_hires_per * other_temp_per + new_hires_per * 
##     oos_std_per + new_hires_per * lag_starter_per + new_hires_per * 
##     no_cert_per + new_hires_per * reenterer_per + new_hires_per * 
##     emer_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9131  -2.1332  -0.3303   1.6913  13.4939 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                   -3136.235   4868.284  -0.644    0.520
## new_hires_per                   200.248    382.419   0.524    0.601
## std_all_per                      31.481     48.683   0.647    0.518
## intern_per                       31.401     48.684   0.645    0.519
## other_temp_per                   31.408     48.676   0.645    0.519
## oos_std_per                      31.301     48.687   0.643    0.521
## lag_starter_per                  31.451     48.688   0.646    0.519
## no_cert_per                      31.539     48.680   0.648    0.517
## reenterer_per                    31.411     48.682   0.645    0.519
## emer_per                         31.532     48.672   0.648    0.517
## new_hires_per:std_all_per        -2.006      3.824  -0.525    0.600
## new_hires_per:intern_per         -1.988      3.824  -0.520    0.603
## new_hires_per:other_temp_per     -2.000      3.824  -0.523    0.601
## new_hires_per:oos_std_per        -1.989      3.824  -0.520    0.603
## new_hires_per:lag_starter_per    -1.987      3.825  -0.520    0.604
## new_hires_per:no_cert_per        -1.998      3.824  -0.522    0.602
## new_hires_per:reenterer_per      -1.991      3.824  -0.521    0.603
## new_hires_per:emer_per           -1.988      3.823  -0.520    0.603
## 
## Residual standard error: 3.233 on 432 degrees of freedom
## Multiple R-squared:   0.79,  Adjusted R-squared:  0.7818 
## F-statistic: 95.61 on 17 and 432 DF,  p-value: < 2.2e-16
# Variance inflation factors for the interaction model.
# NOTE(review): car::vif() emits a message here because the model contains
# interaction terms and suggests `type = 'predictor'`; term-level VIFs for a
# model with interactions are hard to interpret, so consider following that
# suggestion (see ?vif).
vif(lm_inter_model)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                 new_hires_per                   std_all_per 
##                   215082818.9                     7207783.5 
##                    intern_per                other_temp_per 
##                    16307331.6                     1560344.5 
##                   oos_std_per               lag_starter_per 
##                      752947.3                     3038364.6 
##                   no_cert_per                 reenterer_per 
##                     4343428.7                    13413159.1 
##                      emer_per     new_hires_per:std_all_per 
##                      985709.0                     7737093.5 
##      new_hires_per:intern_per  new_hires_per:other_temp_per 
##                    41631515.0                     3592898.9 
##     new_hires_per:oos_std_per new_hires_per:lag_starter_per 
##                     2263915.2                     4988271.2 
##     new_hires_per:no_cert_per   new_hires_per:reenterer_per 
##                    17256936.3                    38100844.8 
##        new_hires_per:emer_per 
##                     5784502.3
# --- Assumption Testing ---

# Linearity
# which = 1 selects the residuals-vs-fitted diagnostic plot of plot.lm().
plot(lm_model, which = 1)

plot(lm_inter_model, which = 1)

# Rainbow test (lmtest::raintest) as a formal check of the linear
# specification, run on each model in turn.
raintest(lm_model)
## 
##  Rainbow test
## 
## data:  lm_model
## Rain = 1.2308, df1 = 225, df2 = 215, p-value = 0.06251
raintest(lm_inter_model)
## 
##  Rainbow test
## 
## data:  lm_inter_model
## Rain = 1.2256, df1 = 225, df2 = 207, p-value = 0.06853
# Independence of Errors: Durbin-Watson Test
# lmtest::dwtest() tests for first-order autocorrelation between residuals of
# consecutive rows (default alternative: autocorrelation greater than 0).
# NOTE(review): the statistic depends on the row order of district_data;
# confirm the rows are in a meaningful order (e.g. by district and year)
# before reading the result as serial correlation.
dwtest(lm_model)
## 
##  Durbin-Watson test
## 
## data:  lm_model
## DW = 1.6862, p-value = 0.0002815
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(lm_inter_model)
## 
##  Durbin-Watson test
## 
## data:  lm_inter_model
## DW = 1.7088, p-value = 0.0005942
## alternative hypothesis: true autocorrelation is greater than 0
# Homoscedasticity
# which = 3 selects the scale-location plot of plot.lm().
plot(lm_model, which = 3)

plot(lm_inter_model, which = 3)

# Normality of Residuals: Q-Q Plot
# which = 2 selects the normal Q-Q plot of the residuals.
plot(lm_model, which = 2)

plot(lm_inter_model, which = 2)

# Shapiro-Wilk test for normality, applied to the raw residuals of each model.
shapiro.test(lm_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_model$residuals
## W = 0.97656, p-value = 1.22e-06
shapiro.test(lm_inter_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_inter_model$residuals
## W = 0.96611, p-value = 1.098e-08
# Multicollinearity
# Store the variance inflation factors of both models so they can be reused,
# then print them.
vif_results1 <- vif(lm_model)
print(vif_results1)
##     std_all_per      intern_per  other_temp_per     oos_std_per lag_starter_per 
##    1.067775e+06    2.415721e+06    2.311846e+05    1.115055e+05    4.500476e+05 
##     no_cert_per   reenterer_per        emer_per   new_hires_per 
##    6.435180e+05    1.987075e+06    1.460853e+05    1.239437e+00
# The interaction model triggers car's message about higher-order terms.
vif_results2 <- vif(lm_inter_model)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(vif_results2)
##                 new_hires_per                   std_all_per 
##                   215082818.9                     7207783.5 
##                    intern_per                other_temp_per 
##                    16307331.6                     1560344.5 
##                   oos_std_per               lag_starter_per 
##                      752947.3                     3038364.6 
##                   no_cert_per                 reenterer_per 
##                     4343428.7                    13413159.1 
##                      emer_per     new_hires_per:std_all_per 
##                      985709.0                     7737093.5 
##      new_hires_per:intern_per  new_hires_per:other_temp_per 
##                    41631515.0                     3592898.9 
##     new_hires_per:oos_std_per new_hires_per:lag_starter_per 
##                     2263915.2                     4988271.2 
##     new_hires_per:no_cert_per   new_hires_per:reenterer_per 
##                    17256936.3                    38100844.8 
##        new_hires_per:emer_per 
##                     5784502.3
# MITIGATION
# Robust regression (MASS::rlm, M-estimation) refit of both specifications.
# NOTE(review): rlm() down-weights observations with large residuals, so it
# targets outliers / heavy-tailed errors. It does not model correlated errors,
# so on its own it is not a remedy for the Durbin-Watson result; the original
# heading ("independence of variables") looks mislabelled -- confirm intent.

rlm_model <- rlm(
  turnover_rate ~ std_all_per + intern_per + other_temp_per + oos_std_per +
    lag_starter_per + no_cert_per + reenterer_per + emer_per + new_hires_per,
  data = district_data
)

rlm_inter_model <- rlm(
  turnover_rate ~ new_hires_per * std_all_per +
    new_hires_per * intern_per +
    new_hires_per * other_temp_per +
    new_hires_per * oos_std_per +
    new_hires_per * lag_starter_per +
    new_hires_per * no_cert_per +
    new_hires_per * reenterer_per +
    new_hires_per * emer_per,
  data = district_data
)

summary(rlm_model)
## 
## Call: rlm(formula = turnover_rate ~ std_all_per + intern_per + other_temp_per + 
##     oos_std_per + lag_starter_per + no_cert_per + reenterer_per + 
##     emer_per + new_hires_per, data = district_data)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.8208  -1.9127  -0.1411   1.8909  12.9665 
## 
## Coefficients:
##                 Value     Std. Error t value  
## (Intercept)     -538.3054 1752.4176    -0.3072
## std_all_per        5.3751   17.5245     0.3067
## intern_per         5.4642   17.5247     0.3118
## other_temp_per     5.3080   17.5234     0.3029
## oos_std_per        5.4189   17.5231     0.3092
## lag_starter_per    5.5115   17.5251     0.3145
## no_cert_per        5.4782   17.5247     0.3126
## reenterer_per      5.4439   17.5243     0.3107
## emer_per           5.6185   17.5243     0.3206
## new_hires_per      0.9768    0.0272    35.9781
## 
## Residual standard error: 2.81 on 440 degrees of freedom
# Coefficient table for the robust interaction model.
summary(rlm_inter_model)
## 
## Call: rlm(formula = turnover_rate ~ new_hires_per * std_all_per + new_hires_per * 
##     intern_per + new_hires_per * other_temp_per + new_hires_per * 
##     oos_std_per + new_hires_per * lag_starter_per + new_hires_per * 
##     no_cert_per + new_hires_per * reenterer_per + new_hires_per * 
##     emer_per, data = district_data)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.1016  -1.8265  -0.1292   1.8063  14.4610 
## 
## Coefficients:
##                               Value      Std. Error t value   
## (Intercept)                   -5972.3160  4288.6750    -1.3926
## new_hires_per                   424.8084   336.8893     1.2610
## std_all_per                      59.8704    42.8865     1.3960
## intern_per                       59.7461    42.8877     1.3931
## other_temp_per                   59.7472    42.8809     1.3933
## oos_std_per                      59.7188    42.8903     1.3924
## lag_starter_per                  59.8198    42.8911     1.3947
## no_cert_per                      59.9069    42.8845     1.3969
## reenterer_per                    59.7565    42.8857     1.3934
## emer_per                         59.8428    42.8772     1.3957
## new_hires_per:std_all_per        -4.2537     3.3688    -1.2627
## new_hires_per:intern_per         -4.2328     3.3690    -1.2564
## new_hires_per:other_temp_per     -4.2456     3.3684    -1.2604
## new_hires_per:oos_std_per        -4.2351     3.3689    -1.2571
## new_hires_per:lag_starter_per    -4.2348     3.3692    -1.2569
## new_hires_per:no_cert_per        -4.2431     3.3685    -1.2597
## new_hires_per:reenterer_per      -4.2366     3.3689    -1.2576
## new_hires_per:emer_per           -4.2325     3.3681    -1.2566
## 
## Residual standard error: 2.699 on 432 degrees of freedom
# Mitigating heteroscedasticity (a violation of the homoscedasticity
# assumption) by transforming the outcome.

# Log transformation: add log(turnover_rate) as a new column and refit both
# specifications with it as the response.
district_data <- mutate(district_data, log_turnover_rate = log(turnover_rate))

lm_model_log <- lm(
  log_turnover_rate ~ std_all_per + intern_per + other_temp_per +
    oos_std_per + lag_starter_per + no_cert_per + reenterer_per + emer_per +
    new_hires_per,
  data = district_data
)

lm_inter_model_log <- lm(
  log_turnover_rate ~ new_hires_per * std_all_per +
    new_hires_per * intern_per +
    new_hires_per * other_temp_per +
    new_hires_per * oos_std_per +
    new_hires_per * lag_starter_per +
    new_hires_per * no_cert_per +
    new_hires_per * reenterer_per +
    new_hires_per * emer_per,
  data = district_data
)

summary(lm_model_log)
## 
## Call:
## lm(formula = log_turnover_rate ~ std_all_per + intern_per + other_temp_per + 
##     oos_std_per + lag_starter_per + no_cert_per + reenterer_per + 
##     emer_per + new_hires_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67161 -0.11884 -0.00912  0.11775  0.64258 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     8.393e-01  1.090e+02   0.008    0.994    
## std_all_per     9.846e-03  1.090e+00   0.009    0.993    
## intern_per      1.677e-02  1.090e+00   0.015    0.988    
## other_temp_per  1.122e-02  1.090e+00   0.010    0.992    
## oos_std_per     8.430e-03  1.090e+00   0.008    0.994    
## lag_starter_per 1.916e-02  1.090e+00   0.018    0.986    
## no_cert_per     1.681e-02  1.090e+00   0.015    0.988    
## reenterer_per   1.355e-02  1.090e+00   0.012    0.990    
## emer_per        1.949e-02  1.090e+00   0.018    0.986    
## new_hires_per   4.483e-02  1.689e-03  26.545   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1881 on 440 degrees of freedom
## Multiple R-squared:  0.7098, Adjusted R-squared:  0.7038 
## F-statistic: 119.6 on 9 and 440 DF,  p-value: < 2.2e-16
# Coefficient table for the log-outcome interaction model.
summary(lm_inter_model_log)
## 
## Call:
## lm(formula = log_turnover_rate ~ new_hires_per * std_all_per + 
##     new_hires_per * intern_per + new_hires_per * other_temp_per + 
##     new_hires_per * oos_std_per + new_hires_per * lag_starter_per + 
##     new_hires_per * no_cert_per + new_hires_per * reenterer_per + 
##     new_hires_per * emer_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53666 -0.12728 -0.01082  0.11647  0.73175 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                   -192.9978   275.3325  -0.701    0.484
## new_hires_per                   12.3286    21.6283   0.570    0.569
## std_all_per                      1.9503     2.7533   0.708    0.479
## intern_per                       1.9534     2.7534   0.709    0.478
## other_temp_per                   1.9535     2.7530   0.710    0.478
## oos_std_per                      1.9497     2.7536   0.708    0.479
## lag_starter_per                  1.9597     2.7536   0.712    0.477
## no_cert_per                      1.9625     2.7532   0.713    0.476
## reenterer_per                    1.9494     2.7533   0.708    0.479
## emer_per                         1.9652     2.7527   0.714    0.476
## new_hires_per:std_all_per       -0.1230     0.2163  -0.569    0.570
## new_hires_per:intern_per        -0.1227     0.2163  -0.567    0.571
## new_hires_per:other_temp_per    -0.1232     0.2163  -0.570    0.569
## new_hires_per:oos_std_per       -0.1229     0.2163  -0.568    0.570
## new_hires_per:lag_starter_per   -0.1231     0.2163  -0.569    0.570
## new_hires_per:no_cert_per       -0.1233     0.2163  -0.570    0.569
## new_hires_per:reenterer_per     -0.1226     0.2163  -0.567    0.571
## new_hires_per:emer_per          -0.1232     0.2162  -0.570    0.569
## 
## Residual standard error: 0.1829 on 432 degrees of freedom
## Multiple R-squared:  0.7308, Adjusted R-squared:  0.7202 
## F-statistic: 68.97 on 17 and 432 DF,  p-value: < 2.2e-16
# Re-check the spread of residuals after the log transformation:
# scale-location plots (which = 3) followed by the studentized Breusch-Pagan
# test (lmtest::bptest) on each log-outcome model.
plot(lm_model_log, which = 3)

plot(lm_inter_model_log, which = 3)

bptest(lm_model_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_model_log
## BP = 45.985, df = 9, p-value = 6.062e-07
bptest(lm_inter_model_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_inter_model_log
## BP = 37.544, df = 17, p-value = 0.002838
# Square-root transformation: add sqrt(turnover_rate) as a new column and
# refit both specifications with it as the response.
district_data <- mutate(district_data, sqrt_turnover_rate = sqrt(turnover_rate))

lm_model_sqrt <- lm(
  sqrt_turnover_rate ~ std_all_per + intern_per + other_temp_per +
    oos_std_per + lag_starter_per + no_cert_per + reenterer_per + emer_per +
    new_hires_per,
  data = district_data
)

lm_inter_model_sqrt <- lm(
  sqrt_turnover_rate ~ new_hires_per * std_all_per +
    new_hires_per * intern_per +
    new_hires_per * other_temp_per +
    new_hires_per * oos_std_per +
    new_hires_per * lag_starter_per +
    new_hires_per * no_cert_per +
    new_hires_per * reenterer_per +
    new_hires_per * emer_per,
  data = district_data
)

summary(lm_inter_model_sqrt)
## 
## Call:
## lm(formula = sqrt_turnover_rate ~ new_hires_per * std_all_per + 
##     new_hires_per * intern_per + new_hires_per * other_temp_per + 
##     new_hires_per * oos_std_per + new_hires_per * lag_starter_per + 
##     new_hires_per * no_cert_per + new_hires_per * reenterer_per + 
##     new_hires_per * emer_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.29888 -0.25033 -0.02642  0.22000  1.55202 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                   -376.6364   562.8777  -0.669    0.504
## new_hires_per                   23.7597    44.2159   0.537    0.591
## std_all_per                      3.7960     5.6287   0.674    0.500
## intern_per                       3.7947     5.6289   0.674    0.501
## other_temp_per                   3.7948     5.6280   0.674    0.500
## oos_std_per                      3.7857     5.6292   0.673    0.502
## lag_starter_per                  3.8046     5.6293   0.676    0.500
## no_cert_per                      3.8128     5.6285   0.677    0.499
## reenterer_per                    3.7915     5.6286   0.674    0.501
## emer_per                         3.8155     5.6275   0.678    0.498
## new_hires_per:std_all_per       -0.2375     0.4421  -0.537    0.591
## new_hires_per:intern_per        -0.2362     0.4422  -0.534    0.594
## new_hires_per:other_temp_per    -0.2373     0.4421  -0.537    0.592
## new_hires_per:oos_std_per       -0.2365     0.4422  -0.535    0.593
## new_hires_per:lag_starter_per   -0.2365     0.4422  -0.535    0.593
## new_hires_per:no_cert_per       -0.2373     0.4421  -0.537    0.592
## new_hires_per:reenterer_per     -0.2363     0.4422  -0.534    0.593
## new_hires_per:emer_per          -0.2367     0.4421  -0.535    0.593
## 
## Residual standard error: 0.3739 on 432 degrees of freedom
## Multiple R-squared:  0.764,  Adjusted R-squared:  0.7547 
## F-statistic: 82.26 on 17 and 432 DF,  p-value: < 2.2e-16
# Coefficient table for the square-root-outcome main-effects model.
summary(lm_model_sqrt)
## 
## Call:
## lm(formula = sqrt_turnover_rate ~ std_all_per + intern_per + 
##     other_temp_per + oos_std_per + lag_starter_per + no_cert_per + 
##     reenterer_per + emer_per + new_hires_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19977 -0.24619 -0.02803  0.22799  1.38152 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -14.302653 220.990824  -0.065    0.948    
## std_all_per       0.162349   2.209947   0.073    0.941    
## intern_per        0.176311   2.209973   0.080    0.936    
## other_temp_per    0.162553   2.209811   0.074    0.941    
## oos_std_per       0.162382   2.209766   0.073    0.941    
## lag_starter_per   0.182520   2.210031   0.083    0.934    
## no_cert_per       0.176534   2.209968   0.080    0.936    
## reenterer_per     0.171990   2.209918   0.078    0.938    
## emer_per          0.187533   2.209923   0.085    0.932    
## new_hires_per     0.101740   0.003424  29.715   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3814 on 440 degrees of freedom
## Multiple R-squared:  0.7499, Adjusted R-squared:  0.7448 
## F-statistic: 146.6 on 9 and 440 DF,  p-value: < 2.2e-16
# Re-check the spread of residuals after the square-root transformation:
# scale-location plots (which = 3) followed by the studentized Breusch-Pagan
# test (lmtest::bptest) on each square-root-outcome model.
plot(lm_model_sqrt, which = 3)

plot(lm_inter_model_sqrt, which = 3)

bptest(lm_model_sqrt)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_model_sqrt
## BP = 40.273, df = 9, p-value = 6.78e-06
bptest(lm_inter_model_sqrt)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_inter_model_sqrt
## BP = 34.445, df = 17, p-value = 0.007352
# Mitigating residual non-normality

# Log transformation
# The log_turnover_rate column and both log-outcome models (lm_model_log,
# lm_inter_model_log) were already created in the heteroscedasticity section
# with exactly the same formulas and data. They are reused here instead of
# being recomputed and refit, which produced identical objects.
summary(lm_model_log)
## 
## Call:
## lm(formula = log_turnover_rate ~ std_all_per + intern_per + other_temp_per + 
##     oos_std_per + lag_starter_per + no_cert_per + reenterer_per + 
##     emer_per + new_hires_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67161 -0.11884 -0.00912  0.11775  0.64258 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     8.393e-01  1.090e+02   0.008    0.994    
## std_all_per     9.846e-03  1.090e+00   0.009    0.993    
## intern_per      1.677e-02  1.090e+00   0.015    0.988    
## other_temp_per  1.122e-02  1.090e+00   0.010    0.992    
## oos_std_per     8.430e-03  1.090e+00   0.008    0.994    
## lag_starter_per 1.916e-02  1.090e+00   0.018    0.986    
## no_cert_per     1.681e-02  1.090e+00   0.015    0.988    
## reenterer_per   1.355e-02  1.090e+00   0.012    0.990    
## emer_per        1.949e-02  1.090e+00   0.018    0.986    
## new_hires_per   4.483e-02  1.689e-03  26.545   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1881 on 440 degrees of freedom
## Multiple R-squared:  0.7098, Adjusted R-squared:  0.7038 
## F-statistic: 119.6 on 9 and 440 DF,  p-value: < 2.2e-16
# Coefficient table for the log-outcome interaction model.
summary(lm_inter_model_log)
## 
## Call:
## lm(formula = log_turnover_rate ~ new_hires_per * std_all_per + 
##     new_hires_per * intern_per + new_hires_per * other_temp_per + 
##     new_hires_per * oos_std_per + new_hires_per * lag_starter_per + 
##     new_hires_per * no_cert_per + new_hires_per * reenterer_per + 
##     new_hires_per * emer_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53666 -0.12728 -0.01082  0.11647  0.73175 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                   -192.9978   275.3325  -0.701    0.484
## new_hires_per                   12.3286    21.6283   0.570    0.569
## std_all_per                      1.9503     2.7533   0.708    0.479
## intern_per                       1.9534     2.7534   0.709    0.478
## other_temp_per                   1.9535     2.7530   0.710    0.478
## oos_std_per                      1.9497     2.7536   0.708    0.479
## lag_starter_per                  1.9597     2.7536   0.712    0.477
## no_cert_per                      1.9625     2.7532   0.713    0.476
## reenterer_per                    1.9494     2.7533   0.708    0.479
## emer_per                         1.9652     2.7527   0.714    0.476
## new_hires_per:std_all_per       -0.1230     0.2163  -0.569    0.570
## new_hires_per:intern_per        -0.1227     0.2163  -0.567    0.571
## new_hires_per:other_temp_per    -0.1232     0.2163  -0.570    0.569
## new_hires_per:oos_std_per       -0.1229     0.2163  -0.568    0.570
## new_hires_per:lag_starter_per   -0.1231     0.2163  -0.569    0.570
## new_hires_per:no_cert_per       -0.1233     0.2163  -0.570    0.569
## new_hires_per:reenterer_per     -0.1226     0.2163  -0.567    0.571
## new_hires_per:emer_per          -0.1232     0.2162  -0.570    0.569
## 
## Residual standard error: 0.1829 on 432 degrees of freedom
## Multiple R-squared:  0.7308, Adjusted R-squared:  0.7202 
## F-statistic: 68.97 on 17 and 432 DF,  p-value: < 2.2e-16
# Square-root transformation of the outcome ----
# Refit the additive and interaction models with sqrt(turnover_rate) as the
# response. All predictors stay on their original scale.

district_data <- mutate(
  district_data,
  sqrt_turnover_rate = sqrt(turnover_rate)
)

# Additive model: the eight certification variables plus new_hires_per.
lm_model_sqrt <- lm(
  sqrt_turnover_rate ~ std_all_per + intern_per + other_temp_per +
    oos_std_per + lag_starter_per + no_cert_per + reenterer_per +
    emer_per + new_hires_per,
  data = district_data
)

# Interaction model: new_hires_per crossed with each certification variable.
lm_inter_model_sqrt <- lm(
  sqrt_turnover_rate ~ new_hires_per * std_all_per +
    new_hires_per * intern_per + new_hires_per * other_temp_per +
    new_hires_per * oos_std_per + new_hires_per * lag_starter_per +
    new_hires_per * no_cert_per + new_hires_per * reenterer_per +
    new_hires_per * emer_per,
  data = district_data
)

# Coefficient table for the additive sqrt-outcome model.
summary(lm_model_sqrt)
## 
## Call:
## lm(formula = sqrt_turnover_rate ~ std_all_per + intern_per + 
##     other_temp_per + oos_std_per + lag_starter_per + no_cert_per + 
##     reenterer_per + emer_per + new_hires_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19977 -0.24619 -0.02803  0.22799  1.38152 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -14.302653 220.990824  -0.065    0.948    
## std_all_per       0.162349   2.209947   0.073    0.941    
## intern_per        0.176311   2.209973   0.080    0.936    
## other_temp_per    0.162553   2.209811   0.074    0.941    
## oos_std_per       0.162382   2.209766   0.073    0.941    
## lag_starter_per   0.182520   2.210031   0.083    0.934    
## no_cert_per       0.176534   2.209968   0.080    0.936    
## reenterer_per     0.171990   2.209918   0.078    0.938    
## emer_per          0.187533   2.209923   0.085    0.932    
## new_hires_per     0.101740   0.003424  29.715   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3814 on 440 degrees of freedom
## Multiple R-squared:  0.7499, Adjusted R-squared:  0.7448 
## F-statistic: 146.6 on 9 and 440 DF,  p-value: < 2.2e-16
# Coefficient table for the sqrt-outcome interaction model.
# In the recorded output no individual coefficient is significant, and every
# certification main effect is ~3.8 with a standard error of ~5.6.
# NOTE(review): near-identical estimates and standard errors across the
# certification terms look like severe collinearity (presumably the
# percentage columns sum to ~100 -- confirm against the data).
summary(lm_inter_model_sqrt)
## 
## Call:
## lm(formula = sqrt_turnover_rate ~ new_hires_per * std_all_per + 
##     new_hires_per * intern_per + new_hires_per * other_temp_per + 
##     new_hires_per * oos_std_per + new_hires_per * lag_starter_per + 
##     new_hires_per * no_cert_per + new_hires_per * reenterer_per + 
##     new_hires_per * emer_per, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.29888 -0.25033 -0.02642  0.22000  1.55202 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                   -376.6364   562.8777  -0.669    0.504
## new_hires_per                   23.7597    44.2159   0.537    0.591
## std_all_per                      3.7960     5.6287   0.674    0.500
## intern_per                       3.7947     5.6289   0.674    0.501
## other_temp_per                   3.7948     5.6280   0.674    0.500
## oos_std_per                      3.7857     5.6292   0.673    0.502
## lag_starter_per                  3.8046     5.6293   0.676    0.500
## no_cert_per                      3.8128     5.6285   0.677    0.499
## reenterer_per                    3.7915     5.6286   0.674    0.501
## emer_per                         3.8155     5.6275   0.678    0.498
## new_hires_per:std_all_per       -0.2375     0.4421  -0.537    0.591
## new_hires_per:intern_per        -0.2362     0.4422  -0.534    0.594
## new_hires_per:other_temp_per    -0.2373     0.4421  -0.537    0.592
## new_hires_per:oos_std_per       -0.2365     0.4422  -0.535    0.593
## new_hires_per:lag_starter_per   -0.2365     0.4422  -0.535    0.593
## new_hires_per:no_cert_per       -0.2373     0.4421  -0.537    0.592
## new_hires_per:reenterer_per     -0.2363     0.4422  -0.534    0.593
## new_hires_per:emer_per          -0.2367     0.4421  -0.535    0.593
## 
## Residual standard error: 0.3739 on 432 degrees of freedom
## Multiple R-squared:  0.764,  Adjusted R-squared:  0.7547 
## F-statistic: 82.26 on 17 and 432 DF,  p-value: < 2.2e-16
# Residual normality diagnostics for the transformed-outcome models ----
# Normal Q-Q plots (plot.lm panel 2) for the two additive models.
# lm_model_log is fitted earlier in the file.
plot(lm_model_log, which = 2)

plot(lm_model_sqrt, which = 2)

# Shapiro-Wilk tests on the residuals of the additive and interaction models.
# In the recorded output only lm_model_log fails to reject normality at the
# 5% level (p = 0.080); the other three models reject it.
shapiro.test(lm_model_log$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_model_log$residuals
## W = 0.99413, p-value = 0.08043
shapiro.test(lm_model_sqrt$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_model_sqrt$residuals
## W = 0.987, p-value = 0.0004792
shapiro.test(lm_inter_model_log$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_inter_model_log$residuals
## W = 0.99116, p-value = 0.008649
shapiro.test(lm_inter_model_sqrt$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_inter_model_sqrt$residuals
## W = 0.98011, p-value = 7.771e-06
# Addressing multicollinearity among the predictors ----
# Start with the pairwise correlations between the nine predictors used in
# the models above.

# select() is called as dplyr::select() because MASS::select() masks it.
kitchen_sink_vars <- dplyr::select(
  district_data,
  std_all_per,
  intern_per,
  other_temp_per,
  oos_std_per,
  lag_starter_per,
  no_cert_per,
  reenterer_per,
  emer_per,
  new_hires_per
)

# Pairwise-complete observations: each correlation uses every row where both
# columns are non-missing.
cor_matrix <- cor(kitchen_sink_vars, use = "pairwise.complete.obs")
print(cor_matrix)
##                  std_all_per  intern_per other_temp_per oos_std_per
## std_all_per      1.000000000 -0.22554188   -0.008392639  0.11767473
## intern_per      -0.225541884  1.00000000   -0.169005675 -0.18674578
## other_temp_per  -0.008392639 -0.16900567    1.000000000  0.15897515
## oos_std_per      0.117674730 -0.18674578    0.158975152  1.00000000
## lag_starter_per -0.038265217 -0.37848198    0.018435987 -0.07900271
## no_cert_per     -0.279407667 -0.04999227   -0.125190444 -0.15649815
## reenterer_per   -0.299354683 -0.59903772   -0.118744720 -0.07960925
## emer_per        -0.121070194 -0.10239384   -0.018563343  0.12835520
## new_hires_per   -0.299816472  0.10006796    0.104916438  0.03966198
##                 lag_starter_per no_cert_per reenterer_per    emer_per
## std_all_per         -0.03826522 -0.27940767   -0.29935468 -0.12107019
## intern_per          -0.37848198 -0.04999227   -0.59903772 -0.10239384
## other_temp_per       0.01843599 -0.12519044   -0.11874472 -0.01856334
## oos_std_per         -0.07900271 -0.15649815   -0.07960925  0.12835520
## lag_starter_per      1.00000000 -0.14677829    0.09547726 -0.11088175
## no_cert_per         -0.14677829  1.00000000   -0.16329654  0.01399734
## reenterer_per        0.09547726 -0.16329654    1.00000000 -0.04874726
## emer_per            -0.11088175  0.01399734   -0.04874726  1.00000000
## new_hires_per        0.02683779  0.18791220   -0.11894367  0.23440918
##                 new_hires_per
## std_all_per       -0.29981647
## intern_per         0.10006796
## other_temp_per     0.10491644
## oos_std_per        0.03966198
## lag_starter_per    0.02683779
## no_cert_per        0.18791220
## reenterer_per     -0.11894367
## emer_per           0.23440918
## new_hires_per      1.00000000
# Combine the certification variables into three groups ----
#   cert   = std_all_per + oos_std_per + lag_starter_per  (standard certs)
#   alt    = intern_per + other_temp_per                  (alternative certs)
#   uncert = no_cert_per + emer_per            (uncertified or temporary)
# reenterer_per is deliberately left out: conceptually, re-enterers are not
# "new" teachers -- they are experienced teachers coming back to the field.

district_data <- mutate(
  district_data,
  cert = std_all_per + oos_std_per + lag_starter_per,
  alt = intern_per + other_temp_per,
  uncert = no_cert_per + emer_per
)

# Additive model on the combined groups.
lm_model_combined <- lm(
  turnover_rate ~ cert + alt + uncert + new_hires_per,
  data = district_data
)

# Interaction model: new_hires_per crossed with each combined group.
lm_inter_model_combined <- lm(
  turnover_rate ~ new_hires_per * cert + new_hires_per * alt +
    new_hires_per * uncert,
  data = district_data
)

# Coefficient table for the additive combined model.
summary(lm_model_combined)
## 
## Call:
## lm(formula = turnover_rate ~ cert + alt + uncert + new_hires_per, 
##     data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.6573  -2.1480  -0.4508   1.7160  13.9036 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.55897    1.02665   7.363 8.79e-13 ***
## cert          -0.06438    0.01947  -3.306  0.00102 ** 
## alt           -0.01459    0.01516  -0.963  0.33622    
## uncert         0.03833    0.02611   1.468  0.14293    
## new_hires_per  0.98601    0.02932  33.634  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.442 on 445 degrees of freedom
## Multiple R-squared:  0.7549, Adjusted R-squared:  0.7527 
## F-statistic: 342.7 on 4 and 445 DF,  p-value: < 2.2e-16
# Coefficient table for the combined-group interaction model.
# In the recorded output, apart from new_hires_per itself, only uncert and
# new_hires_per:uncert are significant at the 1% level; adjusted R-squared
# rises only from 0.7527 (additive) to 0.7552.
summary(lm_inter_model_combined)
## 
## Call:
## lm(formula = turnover_rate ~ new_hires_per * cert + new_hires_per * 
##     alt + new_hires_per * uncert, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9249  -2.1440  -0.4746   1.7226  13.7419 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           4.342547   1.876000   2.315  0.02108 *  
## new_hires_per         1.245757   0.131485   9.475  < 2e-16 ***
## cert                 -0.012084   0.040658  -0.297  0.76644    
## alt                   0.021852   0.030317   0.721  0.47141    
## uncert                0.125220   0.041745   3.000  0.00286 ** 
## new_hires_per:cert   -0.004195   0.003089  -1.358  0.17510    
## new_hires_per:alt    -0.003095   0.002056  -1.505  0.13297    
## new_hires_per:uncert -0.005705   0.002108  -2.707  0.00705 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.425 on 442 degrees of freedom
## Multiple R-squared:  0.759,  Adjusted R-squared:  0.7552 
## F-statistic: 198.8 on 7 and 442 DF,  p-value: < 2.2e-16
# Variance inflation factors (car::vif) for the additive combined model.
# All recorded values are below 1.6.
vif_lm_model_combined <- vif(lm_model_combined)
print(vif_lm_model_combined)
##          cert           alt        uncert new_hires_per 
##      1.546264      1.382419      1.368248      1.115453
# Same check for the interaction model. vif() emits the message recorded
# below because the model contains interaction terms.
# NOTE(review): term-wise VIFs of uncentered main effects and their products
# are typically large by construction. Consider `type = "predictor"` (as the
# message suggests) or mean-centering the predictors before treating these
# values as evidence of a collinearity problem.
vif_lm_inter_model_combined <- vif(lm_inter_model_combined)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(vif_lm_inter_model_combined)
##        new_hires_per                 cert                  alt 
##            22.663134             6.807575             5.586699 
##               uncert   new_hires_per:cert    new_hires_per:alt 
##             3.531244            10.133858            12.508046 
## new_hires_per:uncert 
##             6.180599
# Log transformation of the combined predictors ----
# log1p(x) computes log(1 + x): a value of 0 maps to 0 rather than -Inf, and
# it is more accurate than log(x + 1) when x is small. The outcome
# (turnover_rate) stays on its original scale.
district_data <- mutate(
  district_data,
  cert_log = log1p(cert),
  alt_log = log1p(alt),
  uncert_log = log1p(uncert),
  new_hires_per_log = log1p(new_hires_per),
  # Explicit product columns. The models below build their interactions
  # through the formula and do not reference these columns; they are kept
  # so that district_data has the same columns as before.
  log_new_hires_cert = new_hires_per_log * cert_log,
  log_new_hires_alt = new_hires_per_log * alt_log,
  log_new_hires_uncert = new_hires_per_log * uncert_log
)

# Additive model on the log-transformed predictors.
lm_model_log2 <- lm(
  turnover_rate ~ cert_log + alt_log + uncert_log + new_hires_per_log,
  data = district_data
)

# Interaction model: new_hires_per_log crossed with each log-transformed group.
lm_inter_model_log2 <- lm(
  turnover_rate ~ new_hires_per_log * cert_log +
    new_hires_per_log * alt_log + new_hires_per_log * uncert_log,
  data = district_data
)

# Coefficient table for the additive log-predictor model.
summary(lm_model_log2)
## 
## Call:
## lm(formula = turnover_rate ~ cert_log + alt_log + uncert_log + 
##     new_hires_per_log, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4326  -2.5171  -0.6843   2.2400  14.6057 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.5960     1.8990  -1.894   0.0589 .  
## cert_log           -1.9886     0.2751  -7.229 2.14e-12 ***
## alt_log            -2.4568     0.3230  -7.607 1.69e-13 ***
## uncert_log         -0.2873     0.2032  -1.414   0.1581    
## new_hires_per_log  14.6286     0.4684  31.230  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.694 on 445 degrees of freedom
## Multiple R-squared:  0.7177, Adjusted R-squared:  0.7152 
## F-statistic: 282.9 on 4 and 445 DF,  p-value: < 2.2e-16
# Coefficient table for the log-predictor interaction model.
# In the recorded output only new_hires_per_log is significant, and adjusted
# R-squared barely moves (0.7152 additive vs 0.7164 here).
summary(lm_inter_model_log2)
## 
## Call:
## lm(formula = turnover_rate ~ new_hires_per_log * cert_log + new_hires_per_log * 
##     alt_log + new_hires_per_log * uncert_log, data = district_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8370 -2.5064 -0.6714  2.1811 14.8261 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -13.2670     7.1950  -1.844   0.0659 .  
## new_hires_per_log             18.3214     2.8114   6.517 1.96e-10 ***
## cert_log                       0.2880     1.4788   0.195   0.8457    
## alt_log                       -1.2852     1.1926  -1.078   0.2818    
## uncert_log                    -0.8889     1.0027  -0.887   0.3758    
## new_hires_per_log:cert_log    -0.9189     0.5940  -1.547   0.1226    
## new_hires_per_log:alt_log     -0.4192     0.4425  -0.947   0.3440    
## new_hires_per_log:uncert_log   0.2485     0.3986   0.623   0.5334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.686 on 442 degrees of freedom
## Multiple R-squared:  0.7208, Adjusted R-squared:  0.7164 
## F-statistic:   163 on 7 and 442 DF,  p-value: < 2.2e-16
# VIFs for the additive log-predictor model; recorded values are all near 1.
vif(lm_model_log2)
##          cert_log           alt_log        uncert_log new_hires_per_log 
##          1.104417          1.066457          1.144414          1.106576
# VIFs for the log-predictor interaction model (recorded values range from
# about 15 to 43).
# NOTE(review): these are term-wise VIFs for uncentered product terms; see
# the `type = "predictor"` hint in the message recorded below.
vif(lm_inter_model_log2)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##            new_hires_per_log                     cert_log 
##                     40.03352                     32.05083 
##                      alt_log                   uncert_log 
##                     14.60348                     27.99157 
##   new_hires_per_log:cert_log    new_hires_per_log:alt_log 
##                     42.96771                     27.71856 
## new_hires_per_log:uncert_log 
##                     33.89596
# Square-root transformation of the combined predictors ----
# The outcome (turnover_rate) stays on its original scale.

district_data <- mutate(
  district_data,
  cert_sqrt = sqrt(cert),
  alt_sqrt = sqrt(alt),
  uncert_sqrt = sqrt(uncert),
  new_hires_per_sqrt = sqrt(new_hires_per),
  # Explicit product columns. The models below form their interactions via
  # the formula and do not reference these columns directly.
  sqrt_new_hires_cert = new_hires_per_sqrt * cert_sqrt,
  sqrt_new_hires_alt = new_hires_per_sqrt * alt_sqrt,
  sqrt_new_hires_uncert = new_hires_per_sqrt * uncert_sqrt
)

# Additive model on the sqrt-transformed predictors.
lm_model_sqrt2 <- lm(
  turnover_rate ~ cert_sqrt + alt_sqrt + uncert_sqrt + new_hires_per_sqrt,
  data = district_data
)

# Interaction model: new_hires_per_sqrt crossed with each transformed group.
lm_inter_model_sqrt2 <- lm(
  turnover_rate ~ new_hires_per_sqrt * cert_sqrt +
    new_hires_per_sqrt * alt_sqrt + new_hires_per_sqrt * uncert_sqrt,
  data = district_data
)

# Coefficient table for the additive sqrt-predictor model.
summary(lm_model_sqrt2)
## 
## Call:
## lm(formula = turnover_rate ~ cert_sqrt + alt_sqrt + uncert_sqrt + 
##     new_hires_per_sqrt, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8802  -2.3148  -0.5312   1.9763  13.7126 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.6283     1.5398   0.408    0.683    
## cert_sqrt           -0.9269     0.1468  -6.313 6.64e-10 ***
## alt_sqrt            -0.7611     0.1448  -5.258 2.27e-07 ***
## uncert_sqrt         -0.1046     0.1358  -0.771    0.441    
## new_hires_per_sqrt   7.6916     0.2290  33.592  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.493 on 445 degrees of freedom
## Multiple R-squared:  0.7476, Adjusted R-squared:  0.7453 
## F-statistic: 329.5 on 4 and 445 DF,  p-value: < 2.2e-16
# Coefficient table for the sqrt-predictor interaction model.
# In the recorded output only new_hires_per_sqrt is significant at the 5%
# level, and adjusted R-squared is essentially unchanged (0.7453 additive vs
# 0.7457 here).
summary(lm_inter_model_sqrt2)
## 
## Call:
## lm(formula = turnover_rate ~ new_hires_per_sqrt * cert_sqrt + 
##     new_hires_per_sqrt * alt_sqrt + new_hires_per_sqrt * uncert_sqrt, 
##     data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5240  -2.3719  -0.6452   2.0435  14.0328 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -7.91650    4.74118  -1.670   0.0957 .  
## new_hires_per_sqrt             10.19253    1.33748   7.621 1.55e-13 ***
## cert_sqrt                       0.01635    0.58621   0.028   0.9778    
## alt_sqrt                       -0.17429    0.45012  -0.387   0.6988    
## uncert_sqrt                     0.40050    0.45128   0.887   0.3753    
## new_hires_per_sqrt:cert_sqrt   -0.28286    0.17118  -1.652   0.0992 .  
## new_hires_per_sqrt:alt_sqrt    -0.16421    0.11856  -1.385   0.1667    
## new_hires_per_sqrt:uncert_sqrt -0.15132    0.12628  -1.198   0.2314    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.491 on 442 degrees of freedom
## Multiple R-squared:  0.7496, Adjusted R-squared:  0.7457 
## F-statistic: 189.1 on 7 and 442 DF,  p-value: < 2.2e-16
# VIFs for the additive sqrt-predictor model; recorded values are all below 1.3.
vif(lm_model_sqrt2)
##          cert_sqrt           alt_sqrt        uncert_sqrt new_hires_per_sqrt 
##           1.276592           1.175454           1.221371           1.105554
# VIFs for the sqrt-predictor interaction model (recorded values range from
# about 11 to 38).
# NOTE(review): these are term-wise VIFs for uncentered product terms; see
# the `type = "predictor"` hint in the message recorded below.
vif(lm_inter_model_sqrt2)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##             new_hires_per_sqrt                      cert_sqrt 
##                       37.77730                       20.37647 
##                       alt_sqrt                    uncert_sqrt 
##                       11.38176                       13.51409 
##   new_hires_per_sqrt:cert_sqrt    new_hires_per_sqrt:alt_sqrt 
##                       27.65513                       22.16217 
## new_hires_per_sqrt:uncert_sqrt 
##                       18.89142