Objective: Determine how entering high school students make program choices among general program, vocational program and academic program. Their choice might be modeled using their writing score and their social economic status.

Approach: Using multinomial logistic regression to predict high schooler program choice based on ses, write.

## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2016 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1.) Quick data exploration

summary(Data_orig)
##        id            female        ses         schtyp          prog    
##  Min.   :  1.00   male  : 91   low   :47   public :168   general : 45  
##  1st Qu.: 50.75   female:109   middle:95   private: 32   academic:105  
##  Median :100.50                high  :58                 vocation: 50  
##  Mean   :100.50                                                        
##  3rd Qu.:150.25                                                        
##  Max.   :200.00                                                        
##       read           write            math          science     
##  Min.   :28.00   Min.   :31.00   Min.   :33.00   Min.   :26.00  
##  1st Qu.:44.00   1st Qu.:45.75   1st Qu.:45.00   1st Qu.:44.00  
##  Median :50.00   Median :54.00   Median :52.00   Median :53.00  
##  Mean   :52.23   Mean   :52.77   Mean   :52.65   Mean   :51.85  
##  3rd Qu.:60.00   3rd Qu.:60.00   3rd Qu.:59.00   3rd Qu.:58.00  
##  Max.   :76.00   Max.   :67.00   Max.   :75.00   Max.   :74.00  
##      socst                honors        awards          cid       
##  Min.   :26.00   not enrolled:147   Min.   :0.00   Min.   : 1.00  
##  1st Qu.:46.00   enrolled    : 53   1st Qu.:0.00   1st Qu.: 5.00  
##  Median :52.00                      Median :1.00   Median :10.50  
##  Mean   :52.41                      Mean   :1.67   Mean   :10.43  
##  3rd Qu.:61.00                      3rd Qu.:2.00   3rd Qu.:15.00  
##  Max.   :71.00                      Max.   :7.00   Max.   :20.00
# Check for missing values per variable
sapply(Data_orig, function(x) sum(is.na(x)))
##      id  female     ses  schtyp    prog    read   write    math science 
##       0       0       0       0       0       0       0       0       0 
##   socst  honors  awards     cid 
##       0       0       0       0
# Check for unique values per variable
sapply(Data_orig, function(x) length(unique(x)))
##      id  female     ses  schtyp    prog    read   write    math science 
##     200       2       3       2       3      30      29      40      34 
##   socst  honors  awards     cid 
##      22       2       7      20
# A much nicer and conlidated approach
missmap(Data_orig, main = "Missing Values vs. Observed Values")

# Number of observations between [ses] and [prog] in table format {
## great for Nominal data type
with(Data_orig, table(ses, prog))
##         prog
## ses      general academic vocation
##   low         16       19       12
##   middle      20       44       31
##   high         9       42        7
# Mean and Std-Dev of [write] compared to [prog]
## great for Ordinal data type
with(Data_orig, do.call(rbind, tapply(write, prog, function(x) c(Mean = mean(x), SD = sd(x)))))
##              Mean       SD
## general  51.33333 9.397775
## academic 56.25714 7.943343
## vocation 46.76000 9.318754

2.) Data Cleansing / Massaging / Leveling

# Filter out useful data (keep only [prog], [ses], [write])
Data_master <- Data_orig %>% 
  select(prog, ses, write)

# Check for missing values per variable
sapply(Data_master, function(x) sum(is.na(x)))
##  prog   ses write 
##     0     0     0
# Check for default Levels for [prog]
levels(Data_master$prog)
## [1] "general"  "academic" "vocation"
# Re-Level [prog] as [prog2] with "academic" as baseline
Data_master$prog2 <- relevel(Data_orig$prog, ref = "academic")

# Check for default levels for [ses]
levels(Data_master$ses)
## [1] "low"    "middle" "high"

3.) Fit a Multinomial Logistics model

model1 <- multinom(prog2 ~ ses + write, data = Data_master)
## # weights:  15 (8 variable)
## initial  value 219.722458 
## iter  10 value 179.982880
## final  value 179.981726 
## converged
summary(model1)
## Call:
## multinom(formula = prog2 ~ ses + write, data = Data_master)
## 
## Coefficients:
##          (Intercept)  sesmiddle    seshigh      write
## general     2.852198 -0.5332810 -1.1628226 -0.0579287
## vocation    5.218260  0.2913859 -0.9826649 -0.1136037
## 
## Std. Errors:
##          (Intercept) sesmiddle   seshigh      write
## general     1.166441 0.4437323 0.5142196 0.02141097
## vocation    1.163552 0.4763739 0.5955665 0.02221996
## 
## Residual Deviance: 359.9635 
## AIC: 375.9635
# Gauge model's fit
z_test <- summary(model1)$coefficients / summary(model1)$standard.errors
z_test
##          (Intercept)  sesmiddle   seshigh     write
## general     2.445214 -1.2018081 -2.261334 -2.705562
## vocation    4.484769  0.6116747 -1.649967 -5.112689
p_value <- (1 - pnorm(abs(z_test), 0, 1)) * 2
p_value
##           (Intercept) sesmiddle    seshigh        write
## general  0.0144766100 0.2294379 0.02373856 6.818902e-03
## vocation 0.0000072993 0.5407530 0.09894976 3.176045e-07

RESULT

4.) Calculate Risk Ratio - extract the coefficients from the model and exponentiate

Ratio of the probability of choosing one outcome category over the probability of choosing the baseline category is often referred as relative risk (and it is also sometimes referred as odds as we have just used to described the regression parameters above).

exp(coef(model1))
##          (Intercept) sesmiddle   seshigh     write
## general     17.32582 0.5866769 0.3126026 0.9437172
## vocation   184.61262 1.3382809 0.3743123 0.8926116

RESULT

5.) Calculate Predicted Probabilities for each outcome levels

head(pp <- fitted(model1))
##    academic   general  vocation
## 1 0.1482764 0.3382454 0.5134781
## 2 0.1202017 0.1806283 0.6991700
## 3 0.4186747 0.2368082 0.3445171
## 4 0.1726885 0.3508384 0.4764731
## 5 0.1001231 0.1689374 0.7309395
## 6 0.3533566 0.2377976 0.4088458

Examine change in Predicted Probabilities between [write] and [ses]

d_prob_ses <- data.frame(ses = c("low", "middle", "high"), write = mean(Data_master$write))
pp.ses <- predict(model1, newdata = d_prob_ses, type = "probs")
pp.ses
##    academic   general  vocation
## 1 0.4396845 0.3581917 0.2021238
## 2 0.4777488 0.2283353 0.2939159
## 3 0.7009007 0.1784939 0.1206054

An alternative way to look at Predicted Probabilities is to look at the averaged predicted probabilities for different values of the continuous predictor variable write within each level of ses.

d_prob_write <- data.frame(ses = rep(c("low", "middle", "high"), each = 41), write = rep(c(30:70), 3))


# store the predicted probabilities for each value of ses and write
pp.write <- cbind(d_prob_write, predict(model1, newdata = d_prob_write, type = "probs", se = TRUE))

# calculate the mean probabilities of prog levels within each level of ses
by(pp.write[, 3:5], pp.write$ses, colMeans)
## pp.write$ses: high
##  academic   general  vocation 
## 0.6164315 0.1808037 0.2027648 
## -------------------------------------------------------- 
## pp.write$ses: low
##  academic   general  vocation 
## 0.3972977 0.3278174 0.2748849 
## -------------------------------------------------------- 
## pp.write$ses: middle
##  academic   general  vocation 
## 0.4256198 0.2010864 0.3732938

6.) melt data set to long for ggplot2

lpp <- melt(pp.write, id.vars = c("ses", "write"), value.name = "probability")
head(lpp)
##   ses write variable probability
## 1 low    30 academic  0.09843588
## 2 low    31 academic  0.10716868
## 3 low    32 academic  0.11650390
## 4 low    33 academic  0.12645834
## 5 low    34 academic  0.13704576
## 6 low    35 academic  0.14827643
# Plot predicted probabilities across write values for each level of ses facetted by program type
ggplot(lpp, aes(x = write, y = probability, colour = ses)) +
  geom_line() +
  facet_grid(variable ~ ., scales="free")

Things to consider: