Load in a few packages

library(tidyverse)
library(psych)

Load in the data

library(haven)
els <- read_dta("ELS_Logistic.dta")

Quick Data Cleaning and Coding Lesson! Using case_when

glimpse(els)
Rows: 16,197
Columns: 55
$ stu_id   <dbl> 101101, 101102, 101104, 101105, 101106, 101107, 101108, 101109, 101110, 1011…
$ sch_id   <dbl> 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011…
$ strat_id <dbl> 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 1…
$ psu      <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ f1sch_id <dbl+lbl> 1011, 1011, 1011, 1011, 1011, 1011, 1011,   -8,   -8, 1011,   -8, 1011, …
$ f1univ1  <dbl+lbl> 101, 101, 101, 101, 101, 101, 101, 107, 107, 101, 101, 101, 101, 101, 10…
$ f1univ2a <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ f1univ2b <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 7, 5, 1, 1, 1, 1, 1, …
$ f2univ_p <dbl+lbl> 103, 101, 101, 101, 101, 101, 101, 104, 105, 101, 101, 102, 102, 101, 10…
$ bystuwt  <dbl+lbl> 178.9513,  28.2951, 589.7248, 235.7822, 178.9513, 256.9656, 192.4304, 62…
$ bysex    <dbl+lbl> 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, …
$ byrace   <dbl+lbl> 5, 2, 7, 3, 4, 4, 4, 7, 4, 3, 3, 4, 3, 2, 2, 3, 3, 4, 7, 2, 4, 4, 5, 5, …
$ bystlang <dbl+lbl> 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, …
$ byses1   <dbl+lbl> -0.25,  0.58, -0.85, -0.80, -1.41, -1.07,  0.27, -0.16, -1.00, -1.22, -0…
$ byses1qu <dbl+lbl> 2, 4, 1, 1, 1, 1, 3, 2, 1, 1, 2, 2, 1, 2, 4, 3, 3, 1, 1, 4, 1, 1, 1, 1, …
$ bygrdrpt <dbl+lbl>  0,  0,  0, 98,  0,  0, 98,  0,  0, 98,  0,  0,  1,  0, 98,  0,  1, 98, …
$ byhomlit <dbl+lbl> 0, 3, 2, 1, 1, 2, 1, 3, 0, 2, 0, 3, 2, 1, 2, 0, 1, 0, 2, 2, 0, 2, 1, 2, …
$ byparasp <dbl+lbl> 5, 7, 7, 6, 2, 3, 5, 5, 5, 2, 5, 3, 6, 5, 6, 3, 5, 7, 3, 6, 7, 3, 6, 6, …
$ byiepflg <dbl+lbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ bytxcstd <dbl+lbl> 56.21, 57.66, 66.50, 46.46, 36.17, 30.72, 45.46, 68.39, 42.07, 43.17, 58…
$ bytxcqu  <dbl+lbl> 3, 4, 4, 2, 1, 1, 2, 4, 1, 2, 4, 2, 1, 3, 4, 1, 1, 3, 2, 4, 2, 2, 1, 1, …
$ bywrtnga <dbl+lbl>  1.191,  1.191,  0.996, -0.137, -0.435, -0.630, -0.246,  0.704, -1.116, …
$ byxtracu <dbl+lbl> 1, 3, 2, 0, 0, 0, 1, 2, 1, 2, 0, 0, 2, 1, 2, 0, 0, 0, 0, 3, 0, 0, 0, 1, …
$ byhmwrk  <dbl+lbl>  7,  5, -9, 11, 10, 13, 12, 12, -9, 23,  9,  4,  7, 11,  3,  1,  3,  3, …
$ bytvvigm <dbl+lbl> 99,  4,  1, 99,  4,  3,  2,  3, -9,  3,  8,  1,  4,  8,  1, 99, 99, 99, …
$ f1qwt    <dbl+lbl>  152.9769,   25.3577,  709.4246,  199.7193,  152.9769,  205.2692,  156.5…
$ f1pnlwt  <dbl+lbl>  155.6312,   25.4906,  725.6926,  205.1919,  155.6312,  211.4690,  159.5…
$ f1trscwt <dbl+lbl> 284.6529,  42.0937, 730.2420, 280.0837, 287.2398, 324.1828, 308.8419, 60…
$ f2qtscwt <dbl+lbl>   0.0000,  50.3749, 755.4681, 245.5287, 265.5541, 304.4922, 211.9139, 75…
$ f2qwt    <dbl+lbl>   0.0000,  28.2772, 660.0555, 198.5202, 153.7132, 213.8219, 168.9149, 67…
$ f2f1wt   <dbl+lbl>   0.0000,  29.8462, 713.1807, 227.0875, 163.9503, 221.4855, 178.8221,   …
$ f2bywt   <dbl+lbl>   0.0000,  27.8257, 662.1222, 212.3764, 158.2162, 214.4298, 168.4906, 67…
$ bys14    <dbl+lbl> 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, …
$ bys15    <dbl+lbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, …
$ bys24d   <dbl+lbl>  1,  1,  1,  1,  1,  2,  2,  1,  2,  1,  1,  1,  2,  1,  2,  2,  3,  2, …
$ bys24e   <dbl+lbl>  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  2,  2,  1, …
$ bys24f   <dbl+lbl>  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,  2,  1, …
$ bys33a   <dbl+lbl>  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, …
$ bys33b   <dbl+lbl>  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, …
$ bys43    <dbl+lbl>  2,  2,  3,  5,  2, 12, -9,  0, -9,  2,  0,  2,  0,  2,  0,  0,  0,  0, …
$ bys62a   <dbl+lbl> -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,  0, …
$ bys62g   <dbl+lbl> -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,  0, …
$ bys66a   <dbl+lbl>  4,  1,  1,  1,  1,  6,  1,  1,  1,  4, -9,  1,  1,  1,  1,  2, -6,  1, …
$ bys66b   <dbl+lbl>  6,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  2, -9, -1, …
$ bys66c   <dbl+lbl>  3, -1, -1,  6,  1, -3, -1,  6,  1,  6,  1,  1,  1,  2, -1, -3, -1, -1, …
$ bys66f   <dbl+lbl>  1,  1, -1,  1,  1, -3,  1,  1,  1,  1,  1,  1, -3, -1,  7, -3,  1, -3, …
$ bys66g   <dbl+lbl> -3, -3, -1,  1, -3, -9,  1,  1, -3, -1,  1,  1, -3, -1,  7, -3, -3, -3, …
$ bys67    <dbl+lbl> 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, …
$ bys71d   <dbl+lbl>  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0, …
$ bys71e   <dbl+lbl>  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0, …
$ bys83a   <dbl+lbl>  2,  5,  2,  2,  1, -9, -1, -1, -1,  2, -1,  6,  2, -1, -1,  2,  2,  1, …
$ bys83b   <dbl+lbl>  4,  7,  4,  2,  1, -9, -1, -1, -1,  2, -1,  2,  5, -1, -1,  2, -3,  1, …
$ bys87a   <dbl+lbl>  3,  2,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  4,  3,  1,  2, …
$ bys87e   <dbl+lbl>  2,  2,  1,  2,  2, -9,  2,  2,  2,  2,  1,  2,  2,  3,  2,  3,  2,  2, …
$ bys87f   <dbl+lbl>  3,  2,  3,  1,  2,  1,  3,  1,  2,  1,  2,  2,  2,  2,  4,  3,  3,  4, …

First, let’s create an outcome to use, whether a student has an IEP. This is an important measure in disproportionality research (are certain groups of students disproportionately identified as needing special education services). Let’s check the variable labels and get a breakdown of values:

table(els$byiepflg)

  -9   -8   -4    0    1 
7314  179  648 7053 1003 
str(els$byiepflg)
 dbl+lbl [1:16197]  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,...
 @ label       : chr "Base year Individualized Education Plan"
 @ format.stata: chr "%12.0f"
 @ labels      : Named num [1:5] -9 -8 -4 0 1
  ..- attr(*, "names")= chr [1:5] "{Missing}" "{Survey component legitimate skip/NA}" "{Nonrespondent}" "No" ...

Now, after creating a new variable, iep, we are going to clean up iep using case_when by recoding it to only have two values- a student has an IEP(= 1) or not (= 0).

els.clean <- els %>%
  na_if(., -9) %>%
  na_if(., -8) %>% 
  na_if(., -4) %>%
  mutate(.,
         iep = case_when(
           byiepflg == 1 ~ 1,
           byiepflg == 0 ~ 0),
         comptest = bytxcstd,
         iep.fac = as_factor(iep),
         race.fac = as_factor(byrace),
         sex.fac = as_factor(bysex),
         comptest.cent = comptest - mean(comptest, na.rm = TRUE)) %>%
  glimpse()
Rows: 16,197
Columns: 61
$ stu_id        <dbl> 101101, 101102, 101104, 101105, 101106, 101107, 10…
$ sch_id        <dbl> 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 10…
$ strat_id      <dbl> 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, …
$ psu           <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ f1sch_id      <dbl+lbl> 1011, 1011, 1011, 1011, 1011, 1011, 1011,   NA…
$ f1univ1       <dbl+lbl> 101, 101, 101, 101, 101, 101, 101, 107, 107, 1…
$ f1univ2a      <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ f1univ2b      <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 1, 1…
$ f2univ_p      <dbl+lbl> 103, 101, 101, 101, 101, 101, 101, 104, 105, 1…
$ bystuwt       <dbl+lbl> 178.9513,  28.2951, 589.7248, 235.7822, 178.95…
$ bysex         <dbl+lbl> 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1…
$ byrace        <dbl+lbl> 5, 2, 7, 3, 4, 4, 4, 7, 4, 3, 3, 4, 3, 2, 2, 3…
$ bystlang      <dbl+lbl> 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1…
$ byses1        <dbl+lbl> -0.25,  0.58, -0.85, -0.80, -1.41, -1.07,  0.2…
$ byses1qu      <dbl+lbl> 2, 4, 1, 1, 1, 1, 3, 2, 1, 1, 2, 2, 1, 2, 4, 3…
$ bygrdrpt      <dbl+lbl>  0,  0,  0, 98,  0,  0, 98,  0,  0, 98,  0,  0…
$ byhomlit      <dbl+lbl> 0, 3, 2, 1, 1, 2, 1, 3, 0, 2, 0, 3, 2, 1, 2, 0…
$ byparasp      <dbl+lbl> 5, 7, 7, 6, 2, 3, 5, 5, 5, 2, 5, 3, 6, 5, 6, 3…
$ byiepflg      <dbl+lbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
$ bytxcstd      <dbl+lbl> 56.21, 57.66, 66.50, 46.46, 36.17, 30.72, 45.4…
$ bytxcqu       <dbl+lbl> 3, 4, 4, 2, 1, 1, 2, 4, 1, 2, 4, 2, 1, 3, 4, 1…
$ bywrtnga      <dbl+lbl>  1.191,  1.191,  0.996, -0.137, -0.435, -0.630…
$ byxtracu      <dbl+lbl> 1, 3, 2, 0, 0, 0, 1, 2, 1, 2, 0, 0, 2, 1, 2, 0…
$ byhmwrk       <dbl+lbl>  7,  5, NA, 11, 10, 13, 12, 12, NA, 23,  9,  4…
$ bytvvigm      <dbl+lbl> 99,  4,  1, 99,  4,  3,  2,  3, NA,  3,  8,  1…
$ f1qwt         <dbl+lbl>  152.9769,   25.3577,  709.4246,  199.7193,  1…
$ f1pnlwt       <dbl+lbl>  155.6312,   25.4906,  725.6926,  205.1919,  1…
$ f1trscwt      <dbl+lbl> 284.6529,  42.0937, 730.2420, 280.0837, 287.23…
$ f2qtscwt      <dbl+lbl>   0.0000,  50.3749, 755.4681, 245.5287, 265.55…
$ f2qwt         <dbl+lbl>   0.0000,  28.2772, 660.0555, 198.5202, 153.71…
$ f2f1wt        <dbl+lbl>   0.0000,  29.8462, 713.1807, 227.0875, 163.95…
$ f2bywt        <dbl+lbl>   0.0000,  27.8257, 662.1222, 212.3764, 158.21…
$ bys14         <dbl+lbl> 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1…
$ bys15         <dbl+lbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0…
$ bys24d        <dbl+lbl>  1,  1,  1,  1,  1,  2,  2,  1,  2,  1,  1,  1…
$ bys24e        <dbl+lbl>  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1…
$ bys24f        <dbl+lbl>  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1…
$ bys33a        <dbl+lbl>  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0…
$ bys33b        <dbl+lbl>  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0…
$ bys43         <dbl+lbl>  2,  2,  3,  5,  2, 12, NA,  0, NA,  2,  0,  2…
$ bys62a        <dbl+lbl> -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3…
$ bys62g        <dbl+lbl> -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3…
$ bys66a        <dbl+lbl>  4,  1,  1,  1,  1,  6,  1,  1,  1,  4, NA,  1…
$ bys66b        <dbl+lbl>  6,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1…
$ bys66c        <dbl+lbl>  3, -1, -1,  6,  1, -3, -1,  6,  1,  6,  1,  1…
$ bys66f        <dbl+lbl>  1,  1, -1,  1,  1, -3,  1,  1,  1,  1,  1,  1…
$ bys66g        <dbl+lbl> -3, -3, -1,  1, -3, NA,  1,  1, -3, -1,  1,  1…
$ bys67         <dbl+lbl> 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1…
$ bys71d        <dbl+lbl>  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0…
$ bys71e        <dbl+lbl>  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1…
$ bys83a        <dbl+lbl>  2,  5,  2,  2,  1, NA, -1, -1, -1,  2, -1,  6…
$ bys83b        <dbl+lbl>  4,  7,  4,  2,  1, NA, -1, -1, -1,  2, -1,  2…
$ bys87a        <dbl+lbl>  3,  2,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2…
$ bys87e        <dbl+lbl>  2,  2,  1,  2,  2, NA,  2,  2,  2,  2,  1,  2…
$ bys87f        <dbl+lbl>  3,  2,  3,  1,  2,  1,  3,  1,  2,  1,  2,  2…
$ iep           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ comptest      <dbl+lbl> 56.21, 57.66, 66.50, 46.46, 36.17, 30.72, 45.4…
$ iep.fac       <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ race.fac      <fct> "Hispanic, race specified", "Asian, Hawaii/Pac. Is…
$ sex.fac       <fct> Female, Female, Female, Female, Female, Male, Male…
$ comptest.cent <dbl> 5.552232, 7.002232, 15.842232, -4.197768, -14.4877…

Let’s take a look at the race/ethnicity measure. This is going to be one of our predictors of interest: ## Breakdown of race/ethnicity measure

summary(els.clean$race.fac)
   {Survey component legitimate skip/NA}                          {Nonrespondent} 
                                       0                                        0 
Amer. Indian/Alaska Native, non-Hispanic Asian, Hawaii/Pac. Islander,non-Hispanic 
                                     130                                     1460 
 Black or African American, non-Hispanic              Hispanic, no race specified 
                                    2020                                      996 
                Hispanic, race specified         More than one race, non-Hispanic 
                                    1221                                      735 
                     White, non-Hispanic                                     NA's 
                                    8682                                      953 

Breakdown of sex measure

summary(els.clean$sex.fac)
{Survey component legitimate skip/NA}                       {Nonrespondent} 
                                    0                                     0 
                                 Male                                Female 
                                 7653                                  7717 
                                 NA's 
                                  827 

Summary statistics for comptest

And here is comptest, which will be our second predictor of interest. This is a standardized composite achievement test.

describe(els.clean$comptest)

Density plot of comptest scores

kdensity1 <- ggplot(els.clean, aes(x=comptest)) +
geom_density() +
stat_function(fun = dnorm, n = 15892, args = list(mean = 51, sd = 10), color = "red", linetype = "dashed") +
labs(title="Distribution of Achievement Scores",x="Achievement Scores", y = "Density", caption = "N = 15892. The red line indicates a normal distribution. The black lines represents the observed distribution.")
kdensity1

Part 1: Running and interpreting a logistic regression

Running and interpreting the basic model

Do the odds of having an IEP differ by student sex? Do males and females have different odds of having an IEP? Here is the default/ basic command for running a logistic regression. This code will present the results as logit (log odds) coefficients:

logistic1 <- glm(iep.fac ~ sex.fac, family = binomial, data = els.clean)
summary(logistic1)

Call:
glm(formula = iep.fac ~ sex.fac, family = binomial, data = els.clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.5966  -0.5966  -0.4260  -0.4260   2.2112  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -1.63586    0.04287  -38.16   <2e-16 ***
sex.facFemale -0.71808    0.07029  -10.22   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6054.9  on 8055  degrees of freedom
Residual deviance: 5946.1  on 8054  degrees of freedom
  (8141 observations deleted due to missingness)
AIC: 5950.1

Number of Fisher Scoring iterations: 5

Going deeper with odds ratios

These estimates are logit coefficients. Long story short, negative numbers suggest lower odds relative to the reference group (here - male), and positive numbers suggest higher odds. So here, we would say that female students have significantly lower odds of having an IEP compared to male students. But that’s about all we can say without some more math.

We can exponentiate this coefficient to get an odds ratio- this makes things a little easier to interpret.

female.odds.ratio <- exp(-0.71808)
female.odds.ratio
[1] 0.4876877

When the OR is less than one (.49 in this case), we subtract (1 - OR) which equals .51. This means that for female students, the odds of having an IEP are 51% lower than for male children.

Getting Odds Ratios for a Whole Model

To make things easier when you have more than one predictor, we can exponentiate an entire model. Here, we get ORs and confidence intervals. Any interval that doesn’t contain 1 (notice, not 0) is statistically significant at the .05 level.

exp(cbind(OR = coef(logistic1), confint(logistic1)))
Waiting for profiling to be done...
                     OR     2.5 %    97.5 %
(Intercept)   0.1947857 0.1789419 0.2116958
sex.facFemale 0.4876878 0.4245970 0.5593280

A little fancier: calculating predicted probabilities

We can also get predicted probabilities associated with each estimate. To do this, we need to add up the logits and plug them into the formula below:

# Predicted probability for female student (logit = -1.63586 + -0.71808 = -2.35394)

exp(-2.35394)/(1+exp(-2.35394))
[1] 0.08675311

So, for a female student, the predicted probability of having an IEP is 8.7%.

What about for a male student?

# Predicted probability for a male student (logit = -1.63586)

exp(-1.63586)/(1+exp(-1.63586))
[1] 0.1630292

For a male student, the predicted probability of having an IEP is 16.3%.

Part 2: Adding more predictors

The previous model just included one categorical predictor (sex). Now, let’s try to add a continuous predictor, comptest. We get a slightly different research question: Do the odds of having an IEP differ by student sex, after adjusting for achievement scores in math and reading?

logistic2 <- glm(iep.fac ~ sex.fac + comptest.cent, family = binomial, data = els.clean)
summary(logistic2)

Call:
glm(formula = iep.fac ~ sex.fac + comptest.cent, family = binomial, 
    data = els.clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0584  -0.4500  -0.2471  -0.1336   3.6207  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -2.494531   0.066336 -37.604   <2e-16 ***
sex.facFemale -0.775766   0.081790  -9.485   <2e-16 ***
comptest.cent -0.156279   0.004982 -31.370   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5814.0  on 7980  degrees of freedom
Residual deviance: 4233.4  on 7978  degrees of freedom
  (8216 observations deleted due to missingness)
AIC: 4239.4

Number of Fisher Scoring iterations: 6

We can still get odds ratios for the whole model

exp(cbind(OR = coef(logistic2), confint(logistic2)))
Waiting for profiling to be done...
                      OR      2.5 %     97.5 %
(Intercept)   0.08253511 0.07231859 0.09380125
sex.facFemale 0.46035105 0.39182702 0.53997576
comptest.cent 0.85532086 0.84689421 0.86359899

And we can do predicted probabilities

# Predicted probability for female student with average comptest scores (logit = -2.494531 + -0.775766 = -3.270297)

exp(-3.270297)/(1+exp(-3.270297))
[1] 0.03660435

So, for a female student with average comptest scores, the predicted probability of having an IEP is 3.7%.

What about for a male student?

# Predicted probability for a male student with average comptest scores (logit = -2.494531)

exp(-2.494531)/(1+exp(-2.494531))
[1] 0.07624247

For a male student with average comptest scores, the predicted probability of having an IEP is 7.6%.

Part 3: Interpreting a multi-category predictor

Now, let’s try the same thing and add on a factor variable with more categories (race.fac): Do the odds of having an IEP differ by student race/ethnicity?

logistic3 <- glm(iep.fac ~ race.fac, family = binomial, data = els.clean)
summary(logistic3)

Call:
glm(formula = iep.fac ~ race.fac, family = binomial, data = els.clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.6332  -0.5313  -0.4642  -0.4642   2.3824  

Coefficients:
                                                 Estimate Std. Error
(Intercept)                                       -1.7228     0.3433
race.facAsian, Hawaii/Pac. Islander,non-Hispanic  -1.0547     0.3818
race.facBlack or African American, non-Hispanic    0.2024     0.3541
race.facHispanic, no race specified               -0.1638     0.3694
race.facHispanic, race specified                   0.2177     0.3590
race.facMore than one race, non-Hispanic           0.1071     0.3696
race.facWhite, non-Hispanic                       -0.4508     0.3465
                                                 z value Pr(>|z|)    
(Intercept)                                       -5.018 5.22e-07 ***
race.facAsian, Hawaii/Pac. Islander,non-Hispanic  -2.762  0.00574 ** 
race.facBlack or African American, non-Hispanic    0.572  0.56766    
race.facHispanic, no race specified               -0.444  0.65740    
race.facHispanic, race specified                   0.606  0.54424    
race.facMore than one race, non-Hispanic           0.290  0.77198    
race.facWhite, non-Hispanic                       -1.301  0.19325    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5814.0  on 7980  degrees of freedom
Residual deviance: 5716.5  on 7974  degrees of freedom
  (8216 observations deleted due to missingness)
AIC: 5730.5

Number of Fisher Scoring iterations: 5

Get odds ratios for the whole model…

exp(cbind(OR = coef(logistic3), confint(logistic3)))
Waiting for profiling to be done...
                                                        OR      2.5 %
(Intercept)                                      0.1785714 0.08575462
race.facAsian, Hawaii/Pac. Islander,non-Hispanic 0.3482815 0.17026338
race.facBlack or African American, non-Hispanic  1.2242915 0.63827452
race.facHispanic, no race specified              0.8488998 0.42739814
race.facHispanic, race specified                 1.2432000 0.64100252
race.facMore than one race, non-Hispanic         1.1130435 0.56019712
race.facWhite, non-Hispanic                      0.6370876 0.33784686
                                                    97.5 %
(Intercept)                                      0.3343003
race.facAsian, Hawaii/Pac. Islander,non-Hispanic 0.7720402
race.facBlack or African American, non-Hispanic  2.5949552
race.facHispanic, no race specified              1.8446732
race.facHispanic, race specified                 2.6562457
race.facMore than one race, non-Hispanic         2.4196913
race.facWhite, non-Hispanic                      1.3336940

Using relevel to choose a new reference category for a nominal variable

For equity or disproportionality research, it would usually be better to have white students as the reference category. Often in equity research, we want to know if the rates or odds of some event (IEP, disciplinary referral, etc.) differ for a group of interest compared to the dominant student group. R uses the lowest number (1 in this case) as the reference category by default. To change this, we would use relevel to specify a new reference category.This would make “White, non-Hispanic” your reference group.

els.clean$race.fac = relevel(els.clean$race.fac, ref="White, non-Hispanic")
logistic4 <- glm(iep.fac~race.fac, family = binomial, data = els.clean)
summary(logistic4)

Call:
glm(formula = iep.fac ~ race.fac, family = binomial, data = els.clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.6332  -0.5313  -0.4642  -0.4642   2.3824  

Coefficients:
                                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                                      -2.17361    0.04720 -46.054  < 2e-16 ***
race.facAmer. Indian/Alaska Native, non-Hispanic  0.45085    0.34653   1.301 0.193249    
race.facAsian, Hawaii/Pac. Islander,non-Hispanic -0.60390    0.17372  -3.476 0.000509 ***
race.facBlack or African American, non-Hispanic   0.65321    0.09874   6.615 3.71e-11 ***
race.facHispanic, no race specified               0.28703    0.14423   1.990 0.046575 *  
race.facHispanic, race specified                  0.66854    0.11505   5.811 6.22e-09 ***
race.facMore than one race, non-Hispanic          0.55795    0.14477   3.854 0.000116 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5814.0  on 7980  degrees of freedom
Residual deviance: 5716.5  on 7974  degrees of freedom
  (8216 observations deleted due to missingness)
AIC: 5730.5

Number of Fisher Scoring iterations: 5

Odds ratios

exp(cbind(OR = coef(logistic4), confint(logistic4)))
Waiting for profiling to be done...
                                                        OR     2.5 %    97.5 %
(Intercept)                                      0.1137656 0.1035940 0.1246522
race.facAmer. Indian/Alaska Native, non-Hispanic 1.5696429 0.7497972 2.9599209
race.facAsian, Hawaii/Pac. Islander,non-Hispanic 0.5466776 0.3829221 0.7580073
race.facBlack or African American, non-Hispanic  1.9217004 1.5800196 2.3273378
race.facHispanic, no race specified              1.3324694 0.9961699 1.7549678
race.facHispanic, race specified                 1.9513800 1.5516099 2.4367763
race.facMore than one race, non-Hispanic         1.7470807 1.3056327 2.3049727

Now, we can put it all together: all predictors we’ve tested before, in one model:

logistic5 <- glm(iep.fac ~ sex.fac + race.fac + comptest.cent, family = binomial, data = els.clean)
summary(logistic5)

Call:
glm(formula = iep.fac ~ sex.fac + race.fac + comptest.cent, family = binomial, 
    data = els.clean)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9758  -0.4431  -0.2446  -0.1286   3.8841  

Coefficients:
                                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                                      -2.314676   0.071147 -32.534  < 2e-16 ***
sex.facFemale                                    -0.781161   0.082563  -9.461  < 2e-16 ***
race.facAmer. Indian/Alaska Native, non-Hispanic -0.770781   0.400588  -1.924   0.0543 .  
race.facAsian, Hawaii/Pac. Islander,non-Hispanic -0.913587   0.190602  -4.793 1.64e-06 ***
race.facBlack or African American, non-Hispanic  -0.492579   0.116565  -4.226 2.38e-05 ***
race.facHispanic, no race specified              -0.826649   0.166245  -4.972 6.61e-07 ***
race.facHispanic, race specified                 -0.534474   0.136900  -3.904 9.46e-05 ***
race.facMore than one race, non-Hispanic          0.175067   0.177079   0.989   0.3228    
comptest.cent                                    -0.166120   0.005343 -31.091  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5814.0  on 7980  degrees of freedom
Residual deviance: 4166.7  on 7972  degrees of freedom
  (8216 observations deleted due to missingness)
AIC: 4184.7

Number of Fisher Scoring iterations: 6

Odds ratios

exp(cbind(OR = coef(logistic5), confint(logistic5)))
Waiting for profiling to be done...
                                                         OR      2.5 %    97.5 %
(Intercept)                                      0.09879824 0.08576459 0.1133597
sex.facFemale                                    0.45787431 0.38912331 0.5378791
race.facAmer. Indian/Alaska Native, non-Hispanic 0.46265139 0.20067124 0.9756694
race.facAsian, Hawaii/Pac. Islander,non-Hispanic 0.40108303 0.27227987 0.5757340
race.facBlack or African American, non-Hispanic  0.61104854 0.48506604 0.7661531
race.facHispanic, no race specified              0.43751303 0.31354525 0.6020302
race.facHispanic, race specified                 0.58597715 0.44646030 0.7637728
race.facMore than one race, non-Hispanic         1.19132614 0.83651352 1.6757887
comptest.cent                                    0.84694441 0.83799676 0.8557379

Part 4: Checking for Model Fit, Checking Assumptions

Ok, now how about some assumptions? There are a few important assumptions to check when running LR models. We also want to see how well our model “fits” - are our predictions any good? ## Checking for model fit The Hosmer-Lemeshow Goodness of Fit Test is the standard test of model fit. It works ok, but is not always entirely accurate! Notics that first we create a data frame called diagnostics using the same augment command that we learned about last week for checking regression assumptions.

Hosmer-Lemeshow test

library(broom)
diagnostics <- augment(logistic5) %>%
  mutate(.,
         fitted2cat = as_factor(case_when(
           .fitted < 0 ~ 0,
           .fitted > 0 ~ 1
         )),
         ID = row_number())

generalhoslem::logitgof(obs = diagnostics$iep.fac, exp = fitted(logistic5))

Sensitivity and specificity are also really important statistics to pay attention to. This shows how balanced your model is in its predictions.

Sensitivity

library(caret)
sensitivity(diagnostics$fitted2cat, reference = diagnostics$iep.fac)

Specificity

specificity(diagnostics$fitted2cat, reference = diagnostics$iep.fac)

Using a confusion matrix to represent model fit

conf_matrix<-table(diagnostics$fitted2cat,diagnostics$iep.fac)
conf_matrix

No Multicollinearity: same as a regular regression, use car::vif:

car::vif(logistic5)

To check for regression outliers:

ggplot(data = diagnostics, mapping = aes(x = .fitted, y = .cooksd, label = ID)) +
geom_point() + geom_text(nudge_x = .25) +
labs(title = "Cook's Distance Plot for Model",
x = "Predicted Value",
y = "Cook's Distance") +

geom_hline(yintercept = 4/7981, linetype = "dotted", color = "red")
---
title: "Multivariate Statistics: Module 6 - Introduction to Logistic Regression"
output: html_notebook
---

# Load in a few packages
```{r}
library(tidyverse)
library(psych)
```
# Load in the data
```{r}
library(haven)
els <- read_dta("ELS_Logistic.dta")
```

# Quick Data Cleaning and Coding Lesson! Using `case_when`
```{r}
glimpse(els)
```

First, let's create an outcome to use, whether a student has an IEP. This is an important measure in disproportionality research (are certain groups of students disproportionately identified as needing special education services). Let's check the variable labels and get a breakdown of values:
```{r}
table(els$byiepflg)
str(els$byiepflg)
```

Now, after creating a new variable, `iep`, we are going to clean up `iep` using `case_when` by recoding it to only have two values- a student has an IEP(= 1) or not (= 0).
```{r}
els.clean <- els %>%
  na_if(., -9) %>%
  na_if(., -8) %>% 
  na_if(., -4) %>%
  mutate(.,
         iep = case_when(
           byiepflg == 1 ~ 1,
           byiepflg == 0 ~ 0),
         comptest = bytxcstd,
         iep.fac = as_factor(iep),
         race.fac = as_factor(byrace),
         sex.fac = as_factor(bysex),
         comptest.cent = comptest - mean(comptest, na.rm = TRUE)) %>%
  glimpse()
```

Let's take a look at the race/ethnicity measure. This is going to be one of our predictors of interest:
## Breakdown of race/ethnicity measure
```{r}
summary(els.clean$race.fac)
```

## Breakdown of sex measure
```{r}
summary(els.clean$sex.fac)
```

## Summary statistics for `comptest`
And here is `comptest`, which will be our second predictor of interest. This is a standardized composite achievement test.
```{r}
describe(els.clean$comptest)
```
## Density plot of `comptest` scores
```{r}
kdensity1 <- ggplot(els.clean, aes(x=comptest)) +
geom_density() +
stat_function(fun = dnorm, n = 15892, args = list(mean = 51, sd = 10), color = "red", linetype = "dashed") +
labs(title="Distribution of Achievement Scores",x="Achievement Scores", y = "Density", caption = "N = 15892. The red line indicates a normal distribution. The black lines represents the observed distribution.")
kdensity1
```

# Part 1: Running and interpreting a logistic regression
## Running and interpreting the basic model

Do the odds of having an IEP differ by student sex? Do males and females have different odds of having an IEP? Here is the default/ basic command for running a logistic regression. This code will present the results as logit (log odds) coefficients:
```{r}
logistic1 <- glm(iep.fac ~ sex.fac, family = binomial, data = els.clean)
summary(logistic1)
```

## Going deeper with odds ratios
These estimates are logit coefficients. Long story short, negative numbers suggest lower odds relative to the reference group (here - male), and positive numbers suggest higher odds. So here, we would say that female students have significantly lower odds of having an IEP compared to male students. But that's about all we can say without some more math.

We can exponentiate this coefficient to get an odds ratio- this makes things a little easier to interpret.

```{r}
female.odds.ratio <- exp(-0.71808)
female.odds.ratio
```

When the OR is less than one (.49 in this case), we subtract (1 - OR) which equals .51. This means that for female students, the odds of having an IEP are 51% lower than for male children.

## Getting Odds Ratios for a Whole Model
To make things easier when you have more than one predictor, we can exponentiate an entire model. Here, we get ORs and confidence intervals. Any interval that doesn't contain 1 (notice, not 0) is statistically significant at the .05 level.
```{r}
exp(cbind(OR = coef(logistic1), confint(logistic1)))
```

## A little fancier: calculating predicted probabilities
We can also get *predicted probabilities* associated with each estimate. To do this, we need to add up the logits and plug them into the formula below:
```{r}
# Predicted probability for female student (logit = -1.63586 + -0.71808 = -2.35394)

exp(-2.35394)/(1+exp(-2.35394))
```
So, for a female student, the predicted probability of having an IEP is 8.7%.

What about for a male student?
```{r}
# Predicted probability for a male student (logit = -1.63586)

exp(-1.63586)/(1+exp(-1.63586))
```
For a male student, the predicted probability of having an IEP is 16.3%.

# Part 2: Adding more predictors
The previous model just included one categorical predictor (`sex`). Now, let's try to add a continuous predictor, `comptest`. We get a slightly different research question: Do the odds of having an IEP differ by student sex, after adjusting for achievement scores in math and reading?
```{r}
logistic2 <- glm(iep.fac ~ sex.fac + comptest.cent, family = binomial, data = els.clean)
summary(logistic2)
```

## We can still get odds ratios for the whole model
```{r}
exp(cbind(OR = coef(logistic2), confint(logistic2)))
```

## And we can do predicted probabilities
```{r}
# Predicted probability for female student with average comptest scores (logit = -2.494531 + -0.775766 = -3.270297)

exp(-3.270297)/(1+exp(-3.270297))
```
So, for a female student with average `comptest` scores, the predicted probability of having an IEP is 3.7%.

What about for a male student?
```{r}
# Predicted probability for a male student with average comptest scores (logit = -2.494531)

exp(-2.494531)/(1+exp(-2.494531))
```
For a male student with average `comptest` scores, the predicted probability of having an IEP is 7.6%.

# Part 3: Interpreting a multi-category predictor
Now, let's try the same thing and add on a factor variable with more categories (`race.fac`): Do the odds of having an IEP differ by student race/ethnicity?
```{r}
logistic3 <- glm(iep.fac ~ race.fac, family = binomial, data = els.clean)
summary(logistic3)
```

## Get odds ratios for the whole model...
```{r}
exp(cbind(OR = coef(logistic3), confint(logistic3)))
```

## Using `relevel` to choose a new reference category for a nominal variable
For equity or disproportionality research, it would usually be better to have white students as the reference category. Often in equity research, we want to know if the rates or odds of some event (IEP, disciplinary referral, etc.) differ for a group of interest compared to the dominant student group. `R` uses the lowest number (1 in this case) as the reference category by default. To change this, we would use `relevel` to specify a new reference category.This would make "White, non-Hispanic" your reference group.
```{r}
els.clean$race.fac = relevel(els.clean$race.fac, ref="White, non-Hispanic")
```

```{r}
logistic4 <- glm(iep.fac~race.fac, family = binomial, data = els.clean)
summary(logistic4)
```

# Odds ratios
```{r}
exp(cbind(OR = coef(logistic4), confint(logistic4)))
```

# Now, we can put it all together: all predictors we've tested before, in one model:
```{r}
logistic5 <- glm(iep.fac ~ sex.fac + race.fac + comptest.cent, family = binomial, data = els.clean)
summary(logistic5)
```

## Odds ratios
```{r}
exp(cbind(OR = coef(logistic5), confint(logistic5)))
```

# Part 4: Checking for Model Fit, Checking Assumptions
Ok, now how about some assumptions? There are a few important assumptions to check when running LR models. We also want to see how well our model "fits" - are our predictions any good?
## Checking for model fit
The Hosmer-Lemeshow Goodness of Fit Test is the standard test of model fit. It works ok, but is not always entirely accurate! Notics that first we create a data frame called `diagnostics` using the same `augment` command that we learned about last week for checking regression assumptions.

### Hosmer-Lemeshow test
```{r}
library(broom)
diagnostics <- augment(logistic5) %>%
  mutate(.,
         fitted2cat = as_factor(case_when(
           .fitted < 0 ~ 0,
           .fitted > 0 ~ 1
         )),
         ID = row_number())

generalhoslem::logitgof(obs = diagnostics$iep.fac, exp = fitted(logistic5))
```

Sensitivity and specificity are also really important statistics to pay attention to. This shows how balanced your model is in its predictions.

### Sensitivity
```{r}
library(caret)
sensitivity(diagnostics$fitted2cat, reference = diagnostics$iep.fac)
```

### Specificity
```{r}
specificity(diagnostics$fitted2cat, reference = diagnostics$iep.fac)
```

### Using a confusion matrix to represent model fit
```{r}
conf_matrix<-table(diagnostics$fitted2cat,diagnostics$iep.fac)
conf_matrix
```

## No Multicollinearity: same as a regular regression, use `car::vif`:
```{r}
car::vif(logistic5)
```

## To check for regression outliers:
```{r}
ggplot(data = diagnostics, mapping = aes(x = .fitted, y = .cooksd, label = ID)) +
geom_point() + geom_text(nudge_x = .25) +
labs(title = "Cook's Distance Plot for Model",
x = "Predicted Value",
y = "Cook's Distance") +

geom_hline(yintercept = 4/7981, linetype = "dotted", color = "red")
```