##爸爸職業等級對高中畢業後選擇的預測
#multinomial logit model through Poisson link
#因Poisson合適單位內隨機事件發生的次數的機率分佈

pacman::p_load(MASS, lattice, tidyverse)

#
options(digits = 4, show.signif.stars = FALSE)

# lattice options
ltheme <- canonical.theme(color = FALSE)     ## in-built B&W theme
ltheme$strip.background$col <- "transparent" ## change strip bg
lattice.options(default.theme = ltheme)      ## set as default

#hs :high school rank"L", "M" and "U" for lower
#phs :post high school status Enrolled in college, ("C"), enrolled in   non-collegiate school,("N")
      #employed full-time, ("E") and other, ("O").
#fol :father's occupational level, ("F1", "F2", ..., "F7").
#sex :factor with levels"F" or "M".
#f :frequency.
# 選取中等表現的女性 
dta <- subset(minn38, hs == "M" & sex == "F")

# 擷取不同post high school status的次數
# Enrolled in college, ("C")
# enrolled in non-collegiate school, ("N")
# employed full-time("E") 
# other, ("O")
dtac <- subset(dta, phs == "C")[,5]
dtae <- subset(dta, phs == "E")[,5]
dtan <- subset(dta, phs == "N")[,5]
dtao <- subset(dta, phs == "O")[,5]

# 擷取父親職業等級1-7,O選擇每個父親職業等級都有,所以從這抓
# (不過事實上每個畢業選擇都有1-7的父親職業等級)
dtaf <- subset(dta, phs == "O")[,3]

#
dta <- as.data.frame(cbind(dtaf, dtac, dtae, dtan, dtao))

# assign variable names
names(dta) <- c("FOL", "C", "E", "N", "O")

# sum to Total frequency
dta <- transform(dta, T = C + E + N + O)

# 算出每一個post-high school choice的比例
dta <- transform(dta, pc = C/T, pe = E/T, pn = N/T, po = O/T)

#不同職業等級爸爸 V.S. 不同post high school status比例
xyplot(pc + pe + pn + po ~ FOL, data = dta, cex = 1.5,
       xlab = "Occupation levels of father", 
       ylab = "Proportion of post-high school choice", 
       type = c("b", "g"), pch = c("c", "e", "n", "o"), lty = 1:4)

# convert wide to long format
  #每一種爸等級.不同畢業選擇.次數 7*4=28
dtaL <- dta %>% 
  gather(key = Choice, value = Count, 2:5) %>%
  mutate(Choice = relevel(as.factor(Choice), ref = "O"),
         OL = as.factor(FOL))
# use poisson-multinomial relationship to fit a multinomial model
# null multinomial model                         
m0 <- glm(Count ~ OL + Choice, data = dtaL, family = poisson)

# add years as predictor
m1 <- update(m0, . ~ . + Choice:FOL)

# use square-root(level) as predictor
m2 <- update(m0, . ~ . + Choice:sqrt(FOL))

# comparing models
anova(m0, m1, m2)
## Analysis of Deviance Table
## 
## Model 1: Count ~ OL + Choice
## Model 2: Count ~ OL + Choice + Choice:FOL
## Model 3: Count ~ OL + Choice + Choice:sqrt(FOL)
##   Resid. Df Resid. Dev Df Deviance
## 1        18      253.2            
## 2        15      105.4  3    147.9
## 3        15       86.1  0     19.3
# m2殘差標準差小,佳

# 預測值
dtaL$yhat <- fitted(m2)

# augment data with fitted values in proportions
dtaL <- transform(dtaL, phat = yhat/T, obsp = Count / T)

# 觀察值的點
p1 <- xyplot(obsp ~ FOL, data = dtaL, groups = Choice,
             xlab = 'Father Occupation Level', 
             ylab = 'Proportion post-high school choice', 
             pch = c("c", "e", "n", "o"), cex = 1.5)

# 預測值的線             
update(p1, 
       panel = function(x,y,...){
         panel.xyplot(x,y,...)
         panel.xyplot(x, dtaL$phat, type = c("l","g"), lwd = 1.5, ...)
       })

# residual plot
# 標記outlier
plot(m2, which = 1)
grid()

#
# Understand the meaning of parameter estimates
# 實際與參照組比較下的機率
dta <- transform(dta, lcn = log(pc/pn), len = log(pe/pn), lon = log(po/pn))

# 爸爸職業等級對高教選擇的預測 
# 取斜率
c(coef(lm(lon ~ sqrt(FOL), data = dta))[2],
  coef(lm(lcn ~ sqrt(FOL), data = dta))[2],
  coef(lm(len ~ sqrt(FOL), data = dta))[2])
## sqrt(FOL) sqrt(FOL) sqrt(FOL) 
##    0.8791   -0.4423    0.4088
# 對照一下我們做出來的multinomial model 參數
m2
## 
## Call:  glm(formula = Count ~ OL + Choice + Choice:sqrt(FOL), family = poisson, 
##     data = dtaL)
## 
## Coefficients:
##       (Intercept)                OL2                OL3  
##             4.138              0.139              0.952  
##               OL4                OL5                OL6  
##             0.168             -0.787             -0.933  
##               OL7            ChoiceC            ChoiceE  
##            -1.143              1.352             -0.972  
##           ChoiceN  ChoiceO:sqrt(FOL)  ChoiceC:sqrt(FOL)  
##            -0.726              0.782             -0.578  
## ChoiceE:sqrt(FOL)  ChoiceN:sqrt(FOL)  
##             0.407                 NA  
## 
## Degrees of Freedom: 27 Total (i.e. Null);  15 Residual
## Null Deviance:       3250 
## Residual Deviance: 86.1  AIC: 279
coef(m2)[11:13]
## ChoiceO:sqrt(FOL) ChoiceC:sqrt(FOL) ChoiceE:sqrt(FOL) 
##            0.7822           -0.5780            0.4066
# 與參照組比較下的機率 V.s.爸爸職業等級開根號
# 只會有三條線,因為是以N為對照組的發生機率
xyplot(lcn + len + lon ~ sqrt(FOL), data = dta, cex = 1.5,
       xlab = "Occupation levels of father (in square-root unit)", 
       ylab = "Log-Odds of post-high school status (rel. N))", 
       type = c("r", "g", "p"), pch = c("c", "e", "o"), lty = 1:3)