library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
  1. Load your preferred dataset into R studio
teacher_data <- read_csv("Teacher_Hiring_Certification_Turnover.csv")
## Rows: 33 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): REGION, distname, geotype_new, region_lea, Year
## dbl (20): district, schyr, intern, other_temp, oos_std, lag_starter, no_cert...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(teacher_data)
## # A tibble: 6 × 25
##   district schyr REGION intern other_temp oos_std lag_starter no_cert reenterer
##      <dbl> <dbl> <chr>   <dbl>      <dbl>   <dbl>       <dbl>   <dbl>     <dbl>
## 1   101902  2013 04        145         71      11          60      22       165
## 2   101902  2014 04        201        102       8          36      50       215
## 3   101902  2015 04        267        120      16          21      38       162
## 4   101902  2016 04        306        105      14          27      55       159
## 5   101902  2017 04        371        106      15          17      74       179
## 6   101902  2018 04        245         70       8           9      55       117
## # ℹ 16 more variables: emer <dbl>, std_all <dbl>, distname <chr>,
## #   geotype_new <chr>, total_new_hires <dbl>, region_lea <chr>, Year <chr>,
## #   total_teachers <dbl>, turnover_rate_teachers <dbl>, beg_year <dbl>,
## #   `1-5_years` <dbl>, `6-10_years` <dbl>, `11-20_years` <dbl>,
## #   over20_years <dbl>, `st-per-tch` <dbl>, num_st_mem <dbl>
  1. Create a linear model “lm()” from the variables, with a continuous dependent variable as the outcome
turnover_model <- lm(turnover_rate_teachers~intern + other_temp + oos_std + lag_starter + no_cert + reenterer + emer + std_all + beg_year + `1-5_years` + `6-10_years` + `11-20_years` + `over20_years` + `st-per-tch`, data=teacher_data)
  1. Check the following assumptions:
  1. Linearity (plot and raintest)
plot(turnover_model,which=1)

raintest(turnover_model)
## 
##  Rainbow test
## 
## data:  turnover_model
## Rain = 0.93786, df1 = 17, df2 = 1, p-value = 0.6837
  1. Independence of errors (durbin-watson)
durbinWatsonTest(turnover_model)
##  lag Autocorrelation D-W Statistic p-value
##    1    -0.004579254       1.77478   0.074
##  Alternative hypothesis: rho != 0
  1. Homoscedasticity (plot, bptest)
plot(turnover_model,which=3)

bptest(turnover_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  turnover_model
## BP = 12.186, df = 14, p-value = 0.5914
  1. Normality of residuals (QQ plot, shapiro test)
plot(turnover_model,which=2)

shapiro.test(turnover_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  turnover_model$residuals
## W = 0.87576, p-value = 0.001327
  1. No multicolinarity (VIF, cor)
vif(turnover_model)
##        intern    other_temp       oos_std   lag_starter       no_cert 
##     19.250431     43.882459      5.595996      3.893520     11.215370 
##     reenterer          emer       std_all      beg_year   `1-5_years` 
##      8.923769      2.915729     80.274319     16.047797     33.021766 
##  `6-10_years` `11-20_years`  over20_years  `st-per-tch` 
##     46.687250     40.442599     53.253683      2.385037
kitchen_sink_vars <- teacher_data %>% dplyr::select(intern, other_temp, oos_std,lag_starter, no_cert, reenterer, emer, std_all, beg_year,`1-5_years`, `6-10_years`, `11-20_years`, `over20_years`, `st-per-tch`)
cor(kitchen_sink_vars)
##                   intern  other_temp    oos_std lag_starter    no_cert
## intern        1.00000000  0.80645667  0.6396149  0.30425355 0.09815067
## other_temp    0.80645667  1.00000000  0.6229393  0.51104315 0.23115484
## oos_std       0.63961490  0.62293932  1.0000000  0.38350871 0.34262975
## lag_starter   0.30425355  0.51104315  0.3835087  1.00000000 0.05730342
## no_cert       0.09815067  0.23115484  0.3426298  0.05730342 1.00000000
## reenterer     0.57768791  0.59818987  0.5605843  0.62990296 0.44184310
## emer          0.40351531  0.33333515  0.2029714  0.11237210 0.38136220
## std_all       0.78264794  0.95906769  0.6346246  0.63699505 0.14676882
## beg_year      0.76913394  0.78390655  0.7234533  0.37562180 0.60511265
## 1-5_years     0.88361596  0.80738743  0.5215350  0.36948541 0.21723501
## 6-10_years    0.83826020  0.90303087  0.5938319  0.59706602 0.25935390
## 11-20_years   0.83964120  0.76215101  0.5359403  0.37266141 0.27829356
## over20_years  0.79722535  0.82601821  0.4635441  0.34273789 0.32579863
## st-per-tch   -0.30938909 -0.02821343 -0.4343193  0.19820649 0.01245402
##                reenterer       emer     std_all   beg_year  1-5_years
## intern        0.57768791 0.40351531 0.782647938  0.7691339  0.8836160
## other_temp    0.59818987 0.33333515 0.959067693  0.7839065  0.8073874
## oos_std       0.56058428 0.20297142 0.634624632  0.7234533  0.5215350
## lag_starter   0.62990296 0.11237210 0.636995047  0.3756218  0.3694854
## no_cert       0.44184310 0.38136220 0.146768819  0.6051126  0.2172350
## reenterer     1.00000000 0.43759712 0.690464009  0.7226507  0.6833765
## emer          0.43759712 1.00000000 0.318472049  0.4508522  0.5075710
## std_all       0.69046401 0.31847205 1.000000000  0.7555316  0.8234062
## beg_year      0.72265065 0.45085217 0.755531641  1.0000000  0.7730621
## 1-5_years     0.68337648 0.50757103 0.823406241  0.7730621  1.0000000
## 6-10_years    0.77191271 0.48013784 0.940187810  0.8094086  0.9126570
## 11-20_years   0.72456384 0.60224765 0.788301599  0.7967198  0.9577039
## over20_years  0.60808758 0.62674134 0.802438004  0.7656409  0.9396855
## st-per-tch   -0.07730141 0.03802326 0.001465585 -0.1650754 -0.1689958
##               6-10_years 11-20_years over20_years   st-per-tch
## intern        0.83826020   0.8396412   0.79722535 -0.309389092
## other_temp    0.90303087   0.7621510   0.82601821 -0.028213430
## oos_std       0.59383190   0.5359403   0.46354405 -0.434319277
## lag_starter   0.59706602   0.3726614   0.34273789  0.198206493
## no_cert       0.25935390   0.2782936   0.32579863  0.012454017
## reenterer     0.77191271   0.7245638   0.60808758 -0.077301407
## emer          0.48013784   0.6022477   0.62674134  0.038023257
## std_all       0.94018781   0.7883016   0.80243800  0.001465585
## beg_year      0.80940860   0.7967198   0.76564091 -0.165075395
## 1-5_years     0.91265703   0.9577039   0.93968552 -0.168995808
## 6-10_years    1.00000000   0.9086697   0.90256531 -0.059100021
## 11-20_years   0.90866972   1.0000000   0.94340000 -0.151934361
## over20_years  0.90256531   0.9434000   1.00000000 -0.075875846
## st-per-tch   -0.05910002  -0.1519344  -0.07587585  1.000000000
  1. does your model meet those assumptions? You don’t have to be perfectly right, just make a good case.

Linearity: the residuals vs. fitted plot and high p-value (0.6837) from the rainbow test suggest that the linearity assumption is met

Independence of Errors: the p-value of the Durbin-Watson test is 0.084, suggesting that the errors are independent

Homoscedasticity: the Breusch-Pagan test p-value = 0.5914, suggesting that the homoscedasticity assumption is likely satisfied. the scale-location plot trends up slightly

Normality of Residuals: there is slight deviation in the QQ plot. the p-value (0.001327) from the Shapiro-Wilk test is below 0.05, suggesting that the residuals are significantly different from a normal distribution

No Multicollinearity: the VIF values are above 5 for most of the variables, and the correlation matrix shows high correlations among predictors like other_temp and std_all, and between various experience-year variables (1-5_years, 6-10_years, 11-20_years, and over20_years), suggesting “no multicollinearity” is violated.

  1. If your model violates an assumption, which one?

The model violates the assumptions of Normality of Residuals and No Multicollinearity

  1. What would you do to mitigate this assumption? Show your work.

log transformation to mitigate normality of residuals violation did not work, p-value = 0.001527

turnover_model_log <- lm(log(turnover_rate_teachers) ~ intern + other_temp + oos_std + lag_starter +no_cert + reenterer + emer + std_all + beg_year + `1-5_years` + `6-10_years` + `11-20_years` + `over20_years` + `st-per-tch`, data = teacher_data)

plot(turnover_model_log, which = 2) 

shapiro.test(residuals(turnover_model_log))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(turnover_model_log)
## W = 0.87831, p-value = 0.001527

square root transformation to mitigate normality of residuals violation did not work, p-value = 0.001238

turnover_model_sqrt <- lm(sqrt(turnover_rate_teachers) ~ intern + other_temp + oos_std + lag_starter + no_cert + reenterer + emer + std_all + beg_year + `1-5_years` + `6-10_years` + `11-20_years` + `over20_years` + `st-per-tch`, data = teacher_data)

plot(turnover_model_sqrt, which = 2) 

shapiro.test(residuals(turnover_model_sqrt))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(turnover_model_sqrt)
## W = 0.8745, p-value = 0.001238

to mitigate multicollinearity, I removed the years of experience variables and other_temp (One Year, Out of State Certified)

turnover_model_reduced <- lm(turnover_rate_teachers ~ intern + oos_std + lag_starter + no_cert +reenterer + emer + std_all + `st-per-tch`, data = teacher_data)


vif(turnover_model_reduced)
##       intern      oos_std  lag_starter      no_cert    reenterer         emer 
##     4.607931     3.155704     2.935829     1.937917     3.621410     1.564944 
##      std_all `st-per-tch` 
##     5.819113     2.023929