## Data analysis example 1: Students' Program Choices ####################################################

# Entering high school students make program choices among ‘general’, ‘vocational’ and ‘academic’ program.
# Outcome variables: general vs. academic vs. vocation
# Predictors: social economic status (categorical) and writing score (continuous)
# The number of observation is 200

# Response variables: Program choice
  # Academic is the reference category, j = 0
  # General is the j = 1 category
  # Vocation is the j = 2 category

# Explanatory variables
  # Write: continuous variable X1
  # Social economic status (ses): categorical variable
    # ses low (omitted reference category)
    # ses middle (X2=1)
    # ses high (X3=1) (Go back to slide 19)

# Load packages
require(foreign)
## Loading required package: foreign
require(nnet)
## Loading required package: nnet
require(ggplot2)
## Loading required package: ggplot2
require(reshape2)
## Loading required package: reshape2
# Read data
ml <- read.dta("https://stats.idre.ucla.edu/stat/data/hsbdemo.dta")
## Description of Data
with(ml, table(ses, prog))
##         prog
## ses      general academic vocation
##   low         16       19       12
##   middle      20       44       31
##   high         9       42        7
with(ml, do.call(rbind, tapply(write, prog, function(x) c(M = mean(x), SD = sd(x)))))
##                 M       SD
## general  51.33333 9.397775
## academic 56.25714 7.943343
## vocation 46.76000 9.318754
# Set reference level
ml$prog2 <- relevel(ml$prog, ref = "academic")
# Run multinomial model
test <- multinom(prog2 ~ ses + write, data = ml)
## # weights:  15 (8 variable)
## initial  value 219.722458 
## iter  10 value 179.982880
## final  value 179.981726 
## converged
summary(test)
## Call:
## multinom(formula = prog2 ~ ses + write, data = ml)
## 
## Coefficients:
##          (Intercept)  sesmiddle    seshigh      write
## general     2.852198 -0.5332810 -1.1628226 -0.0579287
## vocation    5.218260  0.2913859 -0.9826649 -0.1136037
## 
## Std. Errors:
##          (Intercept) sesmiddle   seshigh      write
## general     1.166441 0.4437323 0.5142196 0.02141097
## vocation    1.163552 0.4763739 0.5955665 0.02221996
## 
## Residual Deviance: 359.9635 
## AIC: 375.9635
# Interpretation of beta coefficients
 # A one-unit increase in the variable 'write' is associated with the decrease in the log odds of being in 'general program vs. academic program' in the amount of .058 .
# --> The higher the 'write' score is, the less the students are likely to choose 'general' compared to 'academic'

 # A one-unit increase in the variable 'write' is associated with the decrease in the log odds of being in 'vocation program' vs. 'academic program' in the amount of .1136 .

 # The log odds of being in 'general program' vs. in 'academic program' will decrease by 1.163 if moving from ses="low" to ses="high".
# --> The students in 'ses = high' compared to 'ses=low (reference level)' are less likely to choose 'general' compared to 'academic (reference level)'

 # The log odds of being in general program vs. in academic program will decrease by 0.533 if moving from ses="low"to ses="middle", although this coefficient is not significant.

 # The log odds of being in vocation program vs. in academic program will decrease by 0.983 if moving from ses="low" to ses="high".

 # The log odds of being in vocation program vs. in academic program will increase by 0.291 if moving from ses="low" to ses="middle", although this coefficient is not significant
## z test statistics to find p-values
z <- summary(test)$coefficients/summary(test)$standard.errors
z
##          (Intercept)  sesmiddle   seshigh     write
## general     2.445214 -1.2018081 -2.261334 -2.705562
## vocation    4.484769  0.6116747 -1.649967 -5.112689
# 2-tailed z test: p-values
p <-  2*pnorm(-abs(z))
p
##           (Intercept) sesmiddle    seshigh        write
## general  0.0144766100 0.2294379 0.02373856 6.818902e-03
## vocation 0.0000072993 0.5407530 0.09894976 3.176045e-07
# Odds (relative risks): the ratio of the probability of choosing one outcome category over the probability of choosing the baseline

## Exponentiate the coefficients to get risk ratios
exp(coef(test))
##          (Intercept) sesmiddle   seshigh     write
## general     17.32582 0.5866769 0.3126026 0.9437172
## vocation   184.61262 1.3382809 0.3743123 0.8926116
### Interpretation of odds (relative risks)
## For 'write' in general caterory
# The relative risk ratio for a one-unit increase in the variable 'write' is .9437 for being in 'general program' versus 'academic program'

## For ses(high) in general caterory
# The relative risk ratio (odds) switching from 'ses = 1(low)' to '3(high)' is .3126 for being in 'general program' vs 'academic program'
## Predicted probabilities for each of the outcome levels (each obs)
# (Go back slide or white board)
head(pp <- fitted(test))
##    academic   general  vocation
## 1 0.1482764 0.3382454 0.5134781
## 2 0.1202017 0.1806283 0.6991700
## 3 0.4186747 0.2368082 0.3445171
## 4 0.1726885 0.3508384 0.4764731
## 5 0.1001231 0.1689374 0.7309395
## 6 0.3533566 0.2377976 0.4088458
## Fitted probabilities
# 'write' at its mean
mean(ml$write)
## [1] 52.775
# Fitted probabilities for categorical variable
dses <- data.frame(ses = c("low", "middle", "high"), write = mean(ml$write))
predict(test, newdata = dses, "probs")
##    academic   general  vocation
## 1 0.4396845 0.3581917 0.2021238
## 2 0.4777488 0.2283353 0.2939159
## 3 0.7009007 0.1784939 0.1206054
## Interpretation
# At the write mean level, the probability of chossing academic in 'low ses' is 43.9%
## Probability varying the levels or values of each variable
dwrite <- data.frame(ses = rep(c("low", "middle", "high"), each = 41), write = rep(c(30:70),
    3))

# store the predicted probabilities for each value of ses and write
pp.write <- cbind(dwrite, predict(test, newdata = dwrite, type = "probs", se = TRUE))

lpp <- melt(pp.write, id.vars = c("ses", "write"), value.name = "probability")
head(lpp)  # view first few rows
##   ses write variable probability
## 1 low    30 academic  0.09843588
## 2 low    31 academic  0.10716868
## 3 low    32 academic  0.11650390
## 4 low    33 academic  0.12645834
## 5 low    34 academic  0.13704576
## 6 low    35 academic  0.14827643
## Calculate the mean probabilities within each level of ses
by(pp.write[, 3:5], pp.write$ses, colMeans)
## pp.write$ses: high
##  academic   general  vocation 
## 0.6164315 0.1808037 0.2027648 
## -------------------------------------------------------- 
## pp.write$ses: low
##  academic   general  vocation 
## 0.3972977 0.3278174 0.2748849 
## -------------------------------------------------------- 
## pp.write$ses: middle
##  academic   general  vocation 
## 0.4256198 0.2010864 0.3732938
## Plot predicted probabilities across write values for each level of ses
## facetted by program type
ggplot(lpp, aes(x = write, y = probability, colour = ses)) + geom_line() + facet_grid(variable ~
    ., scales = "free")

# (Going back to slide p.31)
## Likelihood ratio test 1
# with write vs without write
# model
mod2 <- multinom(prog2 ~ ses + write, data = ml)
## # weights:  15 (8 variable)
## initial  value 219.722458 
## iter  10 value 179.982880
## final  value 179.981726 
## converged
logLik(mod2)
## 'log Lik.' -179.9817 (df=8)
# Nested model
mod <- multinom(prog2 ~ ses, data = ml)
## # weights:  12 (6 variable)
## initial  value 219.722458 
## iter  10 value 195.705189
## iter  10 value 195.705188
## iter  10 value 195.705188
## final  value 195.705188 
## converged
logLik(mod)
## 'log Lik.' -195.7052 (df=6)
# Degree of freedom
df <- length(coef(mod2)) - length(coef(mod)) 
df
## [1] 2
# Test statistics and p-value
teststat<--2*(as.numeric(logLik(mod))-as.numeric(logLik(mod2)))
teststat
## [1] 31.44692
pchisq(teststat,df=2,lower.tail=FALSE)
## [1] 1.483841e-07
# Reject H0: keep write in the model
## Likelihood ratio test 2
# with ses vs without ses
# model
mod2 <- multinom(prog2 ~ ses + write, data = ml)
## # weights:  15 (8 variable)
## initial  value 219.722458 
## iter  10 value 179.982880
## final  value 179.981726 
## converged
logLik(mod2)
## 'log Lik.' -179.9817 (df=8)
# Nested model
mod <- multinom(prog2 ~ write, data = ml)
## # weights:  9 (4 variable)
## initial  value 219.722458 
## final  value 185.510837 
## converged
logLik(mod)
## 'log Lik.' -185.5108 (df=4)
# Degree of freedom
df <- length(coef(mod2)) - length(coef(mod)) 
df
## [1] 4
# Test statistics and p-value
teststat<--2*(as.numeric(logLik(mod))-as.numeric(logLik(mod2)))
teststat
## [1] 11.05822
pchisq(teststat,df=2,lower.tail=FALSE)
## [1] 0.003969516
#vReject H0: keep ses in the model