##爸爸職業等級對高中畢業後選擇的預測
#multinomial logit model through Poisson link
#因Poisson合適單位內隨機事件發生的次數的機率分佈
pacman::p_load(MASS, lattice, tidyverse)
#
options(digits = 4, show.signif.stars = FALSE)
# lattice options
ltheme <- canonical.theme(color = FALSE) ## in-built B&W theme
ltheme$strip.background$col <- "transparent" ## change strip bg
lattice.options(default.theme = ltheme) ## set as default
#hs :high school rank"L", "M" and "U" for lower
#phs :post high school status Enrolled in college, ("C"), enrolled in non-collegiate school,("N")
#employed full-time, ("E") and other, ("O").
#fol :father's occupational level, ("F1", "F2", ..., "F7").
#sex :factor with levels"F" or "M".
#f :frequency.
# 選取中等表現的女性
dta <- subset(minn38, hs == "M" & sex == "F")
# 擷取不同post high school status的次數
# Enrolled in college, ("C")
# enrolled in non-collegiate school, ("N")
# employed full-time("E")
# other, ("O")
dtac <- subset(dta, phs == "C")[,5]
dtae <- subset(dta, phs == "E")[,5]
dtan <- subset(dta, phs == "N")[,5]
dtao <- subset(dta, phs == "O")[,5]
# 擷取父親職業等級1-7,O選擇每個父親職業等級都有,所以從這抓
# (不過事實上每個畢業選擇都有1-7的父親職業等級)
dtaf <- subset(dta, phs == "O")[,3]
#
dta <- as.data.frame(cbind(dtaf, dtac, dtae, dtan, dtao))
# assign variable names
names(dta) <- c("FOL", "C", "E", "N", "O")
# sum to Total frequency
dta <- transform(dta, T = C + E + N + O)
# 算出每一個post-high school choice的比例
dta <- transform(dta, pc = C/T, pe = E/T, pn = N/T, po = O/T)
#不同職業等級爸爸 V.S. 不同post high school status比例
xyplot(pc + pe + pn + po ~ FOL, data = dta, cex = 1.5,
xlab = "Occupation levels of father",
ylab = "Proportion of post-high school choice",
type = c("b", "g"), pch = c("c", "e", "n", "o"), lty = 1:4)

# convert wide to long format
#每一種爸等級.不同畢業選擇.次數 7*4=28
dtaL <- dta %>%
gather(key = Choice, value = Count, 2:5) %>%
mutate(Choice = relevel(as.factor(Choice), ref = "O"),
OL = as.factor(FOL))
# use poisson-multinomial relationship to fit a multinomial model
# null multinomial model
m0 <- glm(Count ~ OL + Choice, data = dtaL, family = poisson)
# add years as predictor
m1 <- update(m0, . ~ . + Choice:FOL)
# use square-root(level) as predictor
m2 <- update(m0, . ~ . + Choice:sqrt(FOL))
# comparing models
anova(m0, m1, m2)
## Analysis of Deviance Table
##
## Model 1: Count ~ OL + Choice
## Model 2: Count ~ OL + Choice + Choice:FOL
## Model 3: Count ~ OL + Choice + Choice:sqrt(FOL)
## Resid. Df Resid. Dev Df Deviance
## 1 18 253.2
## 2 15 105.4 3 147.9
## 3 15 86.1 0 19.3
# m2殘差標準差小,佳
# 預測值
dtaL$yhat <- fitted(m2)
# augment data with fitted values in proportions
dtaL <- transform(dtaL, phat = yhat/T, obsp = Count / T)
# 觀察值的點
p1 <- xyplot(obsp ~ FOL, data = dtaL, groups = Choice,
xlab = 'Father Occupation Level',
ylab = 'Proportion post-high school choice',
pch = c("c", "e", "n", "o"), cex = 1.5)
# 預測值的線
update(p1,
panel = function(x,y,...){
panel.xyplot(x,y,...)
panel.xyplot(x, dtaL$phat, type = c("l","g"), lwd = 1.5, ...)
})

# residual plot
# 標記outlier
plot(m2, which = 1)
grid()

#
# Understand the meaning of parameter estimates
# 實際與參照組比較下的機率
dta <- transform(dta, lcn = log(pc/pn), len = log(pe/pn), lon = log(po/pn))
# 爸爸職業等級對高教選擇的預測
# 取斜率
c(coef(lm(lon ~ sqrt(FOL), data = dta))[2],
coef(lm(lcn ~ sqrt(FOL), data = dta))[2],
coef(lm(len ~ sqrt(FOL), data = dta))[2])
## sqrt(FOL) sqrt(FOL) sqrt(FOL)
## 0.8791 -0.4423 0.4088
# 對照一下我們做出來的multinomial model 參數
m2
##
## Call: glm(formula = Count ~ OL + Choice + Choice:sqrt(FOL), family = poisson,
## data = dtaL)
##
## Coefficients:
## (Intercept) OL2 OL3
## 4.138 0.139 0.952
## OL4 OL5 OL6
## 0.168 -0.787 -0.933
## OL7 ChoiceC ChoiceE
## -1.143 1.352 -0.972
## ChoiceN ChoiceO:sqrt(FOL) ChoiceC:sqrt(FOL)
## -0.726 0.782 -0.578
## ChoiceE:sqrt(FOL) ChoiceN:sqrt(FOL)
## 0.407 NA
##
## Degrees of Freedom: 27 Total (i.e. Null); 15 Residual
## Null Deviance: 3250
## Residual Deviance: 86.1 AIC: 279
coef(m2)[11:13]
## ChoiceO:sqrt(FOL) ChoiceC:sqrt(FOL) ChoiceE:sqrt(FOL)
## 0.7822 -0.5780 0.4066
# 與參照組比較下的機率 V.s.爸爸職業等級開根號
# 只會有三條線,因為是以N為對照組的發生機率
xyplot(lcn + len + lon ~ sqrt(FOL), data = dta, cex = 1.5,
xlab = "Occupation levels of father (in square-root unit)",
ylab = "Log-Odds of post-high school status (rel. N))",
type = c("r", "g", "p"), pch = c("c", "e", "o"), lty = 1:3)
