Reproduce the results of analysis reported in Chapter 11 of Behavioral data analysis with R (Cheng & Sheu, 2015) with this R script and the married dataset .
pacman::p_load(knitr, simstudy, tidyverse, nlme, rms, ggplot2, lme4, binomTools)
# input data
dta <- read.table('data/married.txt', header = T, fileEncoding = "Big-5")
#
head(dta)
教育 性別 婚姻 年齡
1 國小以下 女 未婚 32.5
2 專科 男 有偶 57.5
3 國中 男 有偶 42.5
4 國小以下 男 有偶 42.5
5 高中 女 喪偶 62.5
6 高中 女 有偶 52.5
summary(dta)
教育 性別 婚姻 年齡
Length:10000 Length:10000 Length:10000 Min. :32.5
Class :character Class :character Class :character 1st Qu.:37.5
Mode :character Mode :character Mode :character Median :47.5
Mean :46.6
3rd Qu.:52.5
Max. :62.5
dta$教育 <- factor(dta$教育, levels = c('國小以下', '國中', '高中',
'專科', '大學以上'))
#重排婚姻,由多到少
dta$婚姻 <- factor(dta$婚姻, levels = c('有偶', '未婚', '離婚', '喪偶'))
#將年齡換成年次
dta$年次 <- (102-dta$年齡)
#將婚姻狀況分為未婚與結過婚
dta$結過婚 <- ifelse(dta$婚姻 == '未婚', 0, 1)
apply(dta, 2, function(x) table(x)/dim(dta)[1])
$教育
x
大學以上 高中 國小以下 國中 專科
0.1070 0.3297 0.2406 0.1360 0.1867
$性別
x
男 女
0.4927 0.5073
$婚姻
x
離婚 喪偶 未婚 有偶
0.1143 0.0341 0.1966 0.6550
$年齡
x
32.5 37.5 42.5 47.5 52.5 57.5 62.5
0.1619 0.1516 0.1435 0.1434 0.1558 0.1306 0.1132
$年次
x
39.5 44.5 49.5 54.5 59.5 64.5 69.5
0.1132 0.1306 0.1558 0.1434 0.1435 0.1516 0.1619
$結過婚
x
0 1
0.1966 0.8034
#單一變項與結過婚與否的列聯表
#程式報表11.4
ftable(dta, row.vars = '教育', col.vars = '結過婚')
結過婚 0 1
教育
國小以下 726 1680
國中 273 1087
高中 584 2713
專科 311 1556
大學以上 72 998
ftable(dta, row.vars = '性別', col.vars = '結過婚')
結過婚 0 1
性別
男 1072 3855
女 894 4179
ftable(dta, row.vars = '年次', col.vars = '結過婚')
結過婚 0 1
年次
39.5 64 1068
44.5 105 1201
49.5 136 1422
54.5 201 1233
59.5 263 1172
64.5 422 1094
69.5 775 844
#兩個變項與結過婚與否的列聯表
#程式報表11.5
ftable(dta, row.vars = c('教育', '性別'), col.vars = '結過婚')
結過婚 0 1
教育 性別
國小以下 男 327 906
女 399 774
國中 男 118 563
女 155 524
高中 男 349 1241
女 235 1472
專科 男 226 810
女 85 746
大學以上 男 52 335
女 20 663
ftable(dta, row.vars = c('教育', '年次'), col.vars = '結過婚')
結過婚 0 1
教育 年次
國小以下 39.5 7 123
44.5 16 150
49.5 32 200
54.5 51 250
59.5 76 286
64.5 154 350
69.5 390 321
國中 39.5 5 85
44.5 10 119
49.5 17 166
54.5 34 172
59.5 50 204
64.5 70 223
69.5 87 118
高中 39.5 23 236
44.5 37 368
49.5 42 500
54.5 65 497
59.5 90 458
64.5 132 366
69.5 195 288
專科 39.5 14 146
44.5 21 288
49.5 37 425
54.5 41 263
59.5 43 204
64.5 57 134
69.5 98 96
大學以上 39.5 15 478
44.5 21 276
49.5 8 131
54.5 10 51
59.5 4 20
64.5 9 21
69.5 5 21
ftable(dta, row.vars = c('性別', '年次'), col.vars = '結過婚')
結過婚 0 1
性別 年次
男 39.5 21 492
44.5 56 604
49.5 74 714
54.5 108 630
59.5 140 567
64.5 235 521
69.5 438 327
女 39.5 43 576
44.5 49 597
49.5 62 708
54.5 93 603
59.5 123 605
64.5 187 573
69.5 337 517
#三個變項與結過婚與否的列聯表,也可以換成比率
ftable(dta, row.vars = c(1, 2, 5), col.vars = '結過婚')
結過婚 0 1
教育 性別 年次
國小以下 男 39.5 1 76
44.5 4 92
49.5 9 114
54.5 23 146
59.5 23 162
64.5 67 195
69.5 200 121
女 39.5 6 47
44.5 12 58
49.5 23 86
54.5 28 104
59.5 53 124
64.5 87 155
69.5 190 200
國中 男 39.5 0 53
44.5 5 81
49.5 7 98
54.5 13 101
59.5 21 97
64.5 32 88
69.5 40 45
女 39.5 5 32
44.5 5 38
49.5 10 68
54.5 21 71
59.5 29 107
64.5 38 135
69.5 47 73
高中 男 39.5 8 119
44.5 18 182
49.5 25 243
54.5 36 225
59.5 60 203
64.5 83 149
69.5 119 120
女 39.5 15 117
44.5 19 186
49.5 17 257
54.5 29 272
59.5 30 255
64.5 49 217
69.5 76 168
專科 男 39.5 3 68
44.5 14 164
49.5 27 225
54.5 28 135
59.5 34 99
64.5 45 80
69.5 75 39
女 39.5 11 78
44.5 7 124
49.5 10 200
54.5 13 128
59.5 9 105
64.5 12 54
69.5 23 57
大學以上 男 39.5 9 176
44.5 15 85
49.5 6 34
54.5 8 23
59.5 2 6
64.5 8 9
69.5 4 2
女 39.5 6 302
44.5 6 191
49.5 2 97
54.5 2 28
59.5 2 14
64.5 1 12
69.5 1 19
prop.table(ftable(dta, row.vars=c(1, 2, 5), col.vars = '結過婚'), 1)
結過婚 0 1
教育 性別 年次
國小以下 男 39.5 0.012987 0.987013
44.5 0.041667 0.958333
49.5 0.073171 0.926829
54.5 0.136095 0.863905
59.5 0.124324 0.875676
64.5 0.255725 0.744275
69.5 0.623053 0.376947
女 39.5 0.113208 0.886792
44.5 0.171429 0.828571
49.5 0.211009 0.788991
54.5 0.212121 0.787879
59.5 0.299435 0.700565
64.5 0.359504 0.640496
69.5 0.487179 0.512821
國中 男 39.5 0.000000 1.000000
44.5 0.058140 0.941860
49.5 0.066667 0.933333
54.5 0.114035 0.885965
59.5 0.177966 0.822034
64.5 0.266667 0.733333
69.5 0.470588 0.529412
女 39.5 0.135135 0.864865
44.5 0.116279 0.883721
49.5 0.128205 0.871795
54.5 0.228261 0.771739
59.5 0.213235 0.786765
64.5 0.219653 0.780347
69.5 0.391667 0.608333
高中 男 39.5 0.062992 0.937008
44.5 0.090000 0.910000
49.5 0.093284 0.906716
54.5 0.137931 0.862069
59.5 0.228137 0.771863
64.5 0.357759 0.642241
69.5 0.497908 0.502092
女 39.5 0.113636 0.886364
44.5 0.092683 0.907317
49.5 0.062044 0.937956
54.5 0.096346 0.903654
59.5 0.105263 0.894737
64.5 0.184211 0.815789
69.5 0.311475 0.688525
專科 男 39.5 0.042254 0.957746
44.5 0.078652 0.921348
49.5 0.107143 0.892857
54.5 0.171779 0.828221
59.5 0.255639 0.744361
64.5 0.360000 0.640000
69.5 0.657895 0.342105
女 39.5 0.123596 0.876404
44.5 0.053435 0.946565
49.5 0.047619 0.952381
54.5 0.092199 0.907801
59.5 0.078947 0.921053
64.5 0.181818 0.818182
69.5 0.287500 0.712500
大學以上 男 39.5 0.048649 0.951351
44.5 0.150000 0.850000
49.5 0.150000 0.850000
54.5 0.258065 0.741935
59.5 0.250000 0.750000
64.5 0.470588 0.529412
69.5 0.666667 0.333333
女 39.5 0.019481 0.980519
44.5 0.030457 0.969543
49.5 0.020202 0.979798
54.5 0.066667 0.933333
59.5 0.125000 0.875000
64.5 0.076923 0.923077
69.5 0.050000 0.950000
#換種方式看
#程式報表11.6
ftable(dta, row.vars = c(1, 2), col.vars = c('年次','結過婚'))
年次 39.5 44.5 49.5 54.5 59.5 64.5 69.5
結過婚 0 1 0 1 0 1 0 1 0 1 0 1 0 1
教育 性別
國小以下 男 1 76 4 92 9 114 23 146 23 162 67 195 200 121
女 6 47 12 58 23 86 28 104 53 124 87 155 190 200
國中 男 0 53 5 81 7 98 13 101 21 97 32 88 40 45
女 5 32 5 38 10 68 21 71 29 107 38 135 47 73
高中 男 8 119 18 182 25 243 36 225 60 203 83 149 119 120
女 15 117 19 186 17 257 29 272 30 255 49 217 76 168
專科 男 3 68 14 164 27 225 28 135 34 99 45 80 75 39
女 11 78 7 124 10 200 13 128 9 105 12 54 23 57
大學以上 男 9 176 15 85 6 34 8 23 2 6 8 9 4 2
女 6 302 6 191 2 97 2 28 2 14 1 12 1 19
ftable(xtabs(結過婚 ~ 性別 + 教育 + as.factor(年次), data = dta))
as.factor(年次) 39.5 44.5 49.5 54.5 59.5 64.5 69.5
性別 教育
男 國小以下 76 92 114 146 162 195 121
國中 53 81 98 101 97 88 45
高中 119 182 243 225 203 149 120
專科 68 164 225 135 99 80 39
大學以上 176 85 34 23 6 9 2
女 國小以下 47 58 86 104 124 155 200
國中 32 38 68 71 107 135 73
高中 117 186 257 272 255 217 168
專科 78 124 200 128 105 54 57
大學以上 302 191 97 28 14 12 19
#圖示不同年次、性別與教育程度的結過婚比率,加上信賴區間
#圖11.1
ggplot(data = dta, aes(x = 年次, y = 結過婚, shape = 性別, color = 性別)) +
stat_summary(fun.data = mean_se, size = 1) +
facet_grid(. ~ 教育 ) +
labs(x = '年次', y = '結過婚百分比') +
theme(legend.position = c (.95, .1), text=element_text(family = "BiauKai"))
## Model 1: Simple model - binomial
#簡單邏輯迴歸
#程式報表11.7
summary(m1 <- glm(結過婚 ~ 年次, data = dta, family = binomial) )
Call:
glm(formula = 結過婚 ~ 年次, family = binomial, data = dta)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.550 0.281 0.450 0.704 1.054
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 7.04825 0.19472 36.2 <2e-16
年次 -0.09713 0.00319 -30.4 <2e-16
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9913.1 on 9999 degrees of freedom
Residual deviance: 8766.2 on 9998 degrees of freedom
AIC: 8770
Number of Fisher Scoring iterations: 5
#勝算比的信賴區間
#程式報表11.8
exp(cbind(coef=coef(m1), confint(m1)))
coef 2.5 % 97.5 %
(Intercept) 1150.84599 788.68296 1692.12851
年次 0.90743 0.90173 0.91308
#邏輯迴歸,加入性別與教育程度
summary(m_full <- glm(結過婚 ~ 年次 * 性別 * 教育 , data = dta, family = binomial))
Call:
glm(formula = 結過婚 ~ 年次 * 性別 * 教育, family = binomial,
data = dta)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.083 0.233 0.461 0.658 1.392
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 11.2557 0.8070 13.95 < 2e-16
年次 -0.1649 0.0125 -13.21 < 2e-16
性別女 -6.4813 0.9560 -6.78 1.2e-11
教育國中 -2.5651 1.2280 -2.09 0.03672
教育高中 -4.0707 0.9437 -4.31 1.6e-05
教育專科 -3.1143 1.0069 -3.09 0.00198
教育大學以上 -4.4320 1.1943 -3.71 0.00021
年次:性別女 0.0980 0.0149 6.58 4.7e-11
年次:教育國中 0.0436 0.0195 2.23 0.02565
年次:教育高中 0.0629 0.0149 4.24 2.3e-05
年次:教育專科 0.0434 0.0161 2.69 0.00708
年次:教育大學以上 0.0596 0.0216 2.75 0.00587
性別女:教育國中 2.0453 1.5184 1.35 0.17800
性別女:教育高中 4.6353 1.1876 3.90 9.5e-05
性別女:教育專科 3.7434 1.3544 2.76 0.00571
性別女:教育大學以上 5.5344 1.7651 3.14 0.00172
年次:性別女:教育國中 -0.0276 0.0243 -1.14 0.25609
年次:性別女:教育高中 -0.0569 0.0189 -3.00 0.00267
年次:性別女:教育專科 -0.0355 0.0223 -1.59 0.11076
年次:性別女:教育大學以上 -0.0440 0.0336 -1.31 0.19018
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9913.1 on 9999 degrees of freedom
Residual deviance: 8489.7 on 9980 degrees of freedom
AIC: 8530
Number of Fisher Scoring iterations: 6
#看看那個效果可刪除
drop1(m_full, test = 'Chisq')
Single term deletions
Model:
結過婚 ~ 年次 * 性別 * 教育
Df Deviance AIC LRT Pr(>Chi)
<none> 8490 8530
年次:性別:教育 4 8499 8531 9.46 0.051
summary(m_drop1 <- update(m_full, ~ . - 年次:性別:教育) )
Call:
glm(formula = 結過婚 ~ 年次 + 性別 + 教育 + 年次:性別 +
年次:教育 + 性別:教育, family = binomial, data = dta)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.896 0.235 0.454 0.644 1.423
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.78209 0.52788 18.53 < 2e-16
年次 -0.14189 0.00818 -17.34 < 2e-16
性別女 -4.30373 0.46279 -9.30 < 2e-16
教育國中 -1.33640 0.72972 -1.83 0.06704
教育高中 -1.92286 0.57381 -3.35 0.00081
教育專科 -1.61166 0.66309 -2.43 0.01508
教育大學以上 -2.79477 0.86989 -3.21 0.00131
年次:性別女 0.06382 0.00712 8.96 < 2e-16
年次:教育國中 0.02460 0.01156 2.13 0.03331
年次:教育高中 0.02879 0.00903 3.19 0.00143
年次:教育專科 0.01991 0.01068 1.86 0.06223
年次:教育大學以上 0.03330 0.01616 2.06 0.03941
性別女:教育國中 0.27061 0.17276 1.57 0.11725
性別女:教育高中 1.10442 0.13947 7.92 2.4e-15
性別女:教育專科 1.49109 0.17829 8.36 < 2e-16
性別女:教育大學以上 2.88247 0.31489 9.15 < 2e-16
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9913.1 on 9999 degrees of freedom
Residual deviance: 8499.1 on 9984 degrees of freedom
AIC: 8531
Number of Fisher Scoring iterations: 6
#邏輯迴歸模型診斷,將預測值與實際觀測值放在一起
#圖11.2
ggplot(data = dta, aes(x = 年次, y = 結過婚, shape = 性別, color = 性別)) +
stat_summary(fun.data = mean_se, size = 1) +
stat_summary(aes(y = fitted(m_drop1)), fun.y = mean, geom = 'line', size = 1) +
facet_grid(. ~ 教育) +
scale_x_continuous(breaks = seq(30, 70, by = 5)) +
labs(x= '年次', y = '結過婚百分比') +
theme(legend.position = c (.9, .1), text=element_text(family = "BiauKai"))
#修改邏輯迴歸,加入分散參數。載入 dispmod 套件,預備處理此事
library(dispmod)
#將資料整理成 dispmod 要的形式
dta_f <- data.frame(ftable(dta, row.vars = c(1, 2, 5), col.vars = '結過婚'))
dta_f1 <- subset(dta_f , 結過婚 == 1)
dta_f1$Tot <- dta_f[1:70, 5] + dta_f[71:140, 5]
dta_f1 <- dta_f1[,-4]
#將年次設定為連續變項
dta_f1$年次 <- as.numeric(as.vector(dta_f1$年次))
#看看資料
#程式報表11.9
head(dta_f1)
教育 性別 年次 Freq Tot
71 國小以下 男 39.5 76 77
72 國中 男 39.5 53 53
73 高中 男 39.5 119 127
74 專科 男 39.5 68 71
75 大學以上 男 39.5 176 185
76 國小以下 女 39.5 47 53
#以整理過資料重新分析,確認與前面相同
summary(m_last <- glm(cbind(Freq, Tot-Freq) ~ 年次 * 性別 * 教育 - 年次:性別:教育,
data = dta_f1, family = binomial))
Call:
glm(formula = cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 -
年次:性別:教育, family = binomial, data = dta_f1)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.726 -0.857 0.020 0.587 3.198
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.78209 0.52788 18.53 < 2e-16
年次 -0.14189 0.00818 -17.34 < 2e-16
性別女 -4.30373 0.46279 -9.30 < 2e-16
教育國中 -1.33640 0.72972 -1.83 0.06704
教育高中 -1.92286 0.57381 -3.35 0.00081
教育專科 -1.61166 0.66309 -2.43 0.01508
教育大學以上 -2.79477 0.86989 -3.21 0.00131
年次:性別女 0.06382 0.00712 8.96 < 2e-16
年次:教育國中 0.02460 0.01156 2.13 0.03331
年次:教育高中 0.02879 0.00903 3.19 0.00143
年次:教育專科 0.01991 0.01068 1.86 0.06223
年次:教育大學以上 0.03330 0.01617 2.06 0.03941
性別女:教育國中 0.27061 0.17276 1.57 0.11725
性別女:教育高中 1.10442 0.13947 7.92 2.4e-15
性別女:教育專科 1.49109 0.17829 8.36 < 2e-16
性別女:教育大學以上 2.88247 0.31489 9.15 < 2e-16
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1530.13 on 69 degrees of freedom
Residual deviance: 116.19 on 54 degrees of freedom
AIC: 446.3
Number of Fisher Scoring iterations: 4
#加入分散參數
#程式報表11.10, 11.11
summary(m_lastd <- glm.binomial.disp(m_last))
Binomial overdispersed logit model fitting...
Iter. 1 phi: 0.0091443
Iter. 2 phi: 0.0068854
Iter. 3 phi: 0.0072398
Iter. 4 phi: 0.0071797
Iter. 5 phi: 0.0071898
Iter. 6 phi: 0.0071881
Iter. 7 phi: 0.0071883
Iter. 8 phi: 0.0071883
Iter. 9 phi: 0.0071883
Iter. 10 phi: 0.0071883
Iter. 11 phi: 0.0071883
Converged after 11 iterations.
Estimated dispersion parameter: 0.0071883
Call:
glm(formula = cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 -
年次:性別:教育, family = binomial, data = dta_f1, weights = disp.weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3421 -0.5824 0.0167 0.4920 1.9120
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.4963 0.7537 12.60 < 2e-16
年次 -0.1369 0.0119 -11.52 < 2e-16
性別女 -4.4153 0.6469 -6.82 8.8e-12
教育國中 -1.0035 0.9652 -1.04 0.299
教育高中 -1.8341 0.8506 -2.16 0.031
教育專科 -1.3231 0.9283 -1.43 0.154
教育大學以上 -2.6254 1.1439 -2.30 0.022
年次:性別女 0.0650 0.0102 6.41 1.5e-10
年次:教育國中 0.0188 0.0153 1.23 0.220
年次:教育高中 0.0269 0.0136 1.98 0.047
年次:教育專科 0.0149 0.0150 0.99 0.320
年次:教育大學以上 0.0307 0.0204 1.51 0.132
性別女:教育國中 0.2893 0.2487 1.16 0.245
性別女:教育高中 1.1327 0.2277 4.98 6.5e-07
性別女:教育專科 1.4829 0.2546 5.82 5.7e-09
性別女:教育大學以上 2.8741 0.4143 6.94 4.0e-12
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 633.454 on 69 degrees of freedom
Residual deviance: 52.675 on 54 degrees of freedom
AIC: 236.8
Number of Fisher Scoring iterations: 4
Call:
glm(formula = cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 -
年次:性別:教育, family = binomial, data = dta_f1, weights = disp.weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3421 -0.5824 0.0167 0.4920 1.9120
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.4963 0.7537 12.60 < 2e-16
年次 -0.1369 0.0119 -11.52 < 2e-16
性別女 -4.4153 0.6469 -6.82 8.8e-12
教育國中 -1.0035 0.9652 -1.04 0.299
教育高中 -1.8341 0.8506 -2.16 0.031
教育專科 -1.3231 0.9283 -1.43 0.154
教育大學以上 -2.6254 1.1439 -2.30 0.022
年次:性別女 0.0650 0.0102 6.41 1.5e-10
年次:教育國中 0.0188 0.0153 1.23 0.220
年次:教育高中 0.0269 0.0136 1.98 0.047
年次:教育專科 0.0149 0.0150 0.99 0.320
年次:教育大學以上 0.0307 0.0204 1.51 0.132
性別女:教育國中 0.2893 0.2487 1.16 0.245
性別女:教育高中 1.1327 0.2277 4.98 6.5e-07
性別女:教育專科 1.4829 0.2546 5.82 5.7e-09
性別女:教育大學以上 2.8741 0.4143 6.94 4.0e-12
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 633.454 on 69 degrees of freedom
Residual deviance: 52.675 on 54 degrees of freedom
AIC: 236.8
Number of Fisher Scoring iterations: 4
#看看分散參數估計值
m_lastd$dispersion
[1] 0.0071883
#看看模型是否改善
anova(m_last, m_lastd)
Analysis of Deviance Table
Model 1: cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 - 年次:性別:教育
Model 2: cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 - 年次:性別:教育
Resid. Df Resid. Dev Df Deviance
1 54 116.2
2 54 52.7 0 63.5
#準備再一次模型診斷,收集資料,畫圖
#圖11.3
dta_mlastd <- data.frame(dta_f1, phat = fitted(m_lastd))
ggplot(data = dta_mlastd, aes(x = 年次, y = Freq/Tot, color = 性別, shape = 性別)) +
geom_point(size = 3) +
stat_smooth(aes(x = 年次, y = phat), method = 'loess', se = F, size = 1) +
facet_grid(. ~ 教育) +
scale_x_continuous(breaks = seq(30, 70, by = 5)) +
labs(x= '年次', y = '結過婚百分比') +
theme(legend.position = 'NONE' )
#準備繪圖。為了繪製方便,模型中去掉截距
summary(m_lastd0 <- update(m_lastd, . ~ . - 1))
Call:
glm(formula = cbind(Freq, Tot - Freq) ~ 年次 + 性別 + 教育 +
年次:性別 + 年次:教育 + 性別:教育 - 1, family = binomial,
data = dta_f1, weights = disp.weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3421 -0.5824 0.0167 0.4920 1.9120
Coefficients:
Estimate Std. Error z value Pr(>|z|)
年次 -0.1369 0.0119 -11.52 < 2e-16
性別男 9.4963 0.7537 12.60 < 2e-16
性別女 5.0810 0.6358 7.99 1.3e-15
教育國中 -1.0035 0.9652 -1.04 0.299
教育高中 -1.8341 0.8506 -2.16 0.031
教育專科 -1.3231 0.9283 -1.43 0.154
教育大學以上 -2.6254 1.1439 -2.30 0.022
年次:性別女 0.0650 0.0102 6.41 1.5e-10
年次:教育國中 0.0188 0.0153 1.23 0.220
年次:教育高中 0.0269 0.0136 1.98 0.047
年次:教育專科 0.0149 0.0150 0.99 0.320
年次:教育大學以上 0.0307 0.0204 1.51 0.132
性別女:教育國中 0.2893 0.2487 1.16 0.245
性別女:教育高中 1.1327 0.2277 4.98 6.5e-07
性別女:教育專科 1.4829 0.2546 5.82 5.7e-09
性別女:教育大學以上 2.8741 0.4143 6.94 4.0e-12
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2504.909 on 70 degrees of freedom
Residual deviance: 52.675 on 54 degrees of freedom
AIC: 236.8
Number of Fisher Scoring iterations: 4
library(coefplot)
#繪製估計值
#圖11.4
coefplot(m_lastd0) + labs(x = '估計值', y = '變項', title = '')
##多分類邏輯迴歸
#先看看年代與婚姻列聯表
#程式報表11.12
with(dta, prop.table(table(年次, 婚姻), 1))
婚姻
年次 有偶 未婚 離婚 喪偶
39.5 0.7199647 0.0565371 0.1139576 0.1095406
44.5 0.7281776 0.0803982 0.1294028 0.0620214
49.5 0.7137356 0.0872914 0.1553273 0.0436457
54.5 0.7168759 0.1401674 0.1206416 0.0223152
59.5 0.6710801 0.1832753 0.1358885 0.0097561
64.5 0.6220317 0.2783641 0.0930079 0.0065963
69.5 0.4558369 0.4786905 0.0580605 0.0074120
#載入 mlogit 套件,準備執行多分類邏輯迴歸
library(mlogit)
#將資料轉成 mlogit 需要格式
dta_w <- mlogit.data(dta, shape = 'wide', choice = '婚姻')
#看看資料
#程式報表11.13
head(dta_w)
~~~~~~~
first 10 observations out of 40000
~~~~~~~
教育 性別 婚姻 年齡 年次 結過婚 chid alt idx
1 國小以下 女 FALSE 32.5 69.5 0 1 離婚 1:離婚
2 國小以下 女 FALSE 32.5 69.5 0 1 喪偶 1:喪偶
3 國小以下 女 TRUE 32.5 69.5 0 1 未婚 1:未婚
4 國小以下 女 FALSE 32.5 69.5 0 1 有偶 1:有偶
5 專科 男 FALSE 57.5 44.5 1 2 離婚 2:離婚
6 專科 男 FALSE 57.5 44.5 1 2 喪偶 2:喪偶
7 專科 男 FALSE 57.5 44.5 1 2 未婚 2:未婚
8 專科 男 TRUE 57.5 44.5 1 2 有偶 2:有偶
9 國中 男 FALSE 42.5 59.5 1 3 離婚 3:離婚
10 國中 男 FALSE 42.5 59.5 1 3 喪偶 3:喪偶
~~~ indexes ~~~~
chid alt
1 1 離婚
2 1 喪偶
3 1 未婚
4 1 有偶
5 2 離婚
6 2 喪偶
7 2 未婚
8 2 有偶
9 3 離婚
10 3 喪偶
indexes: 1, 2
#只展示年代效果
#程式報表11.14
summary(ml0 <- mlogit(婚姻 ~ 0 | 年次, data = dta_w ) )
Call:
mlogit(formula = 婚姻 ~ 0 | 年次, data = dta_w, method = "nr")
Frequencies of alternatives:choice
離婚 喪偶 未婚 有偶
0.1143 0.0341 0.1966 0.6550
nr method
7 iterations, 0h:0m:2s
g'(-H)^-1g = 0.000103
successive function values within tolerance limits
Coefficients :
Estimate Std. Error z-value Pr(>|z|)
(Intercept):喪偶 3.29682 0.38273 8.61 < 2e-16
(Intercept):未婚 -5.26867 0.25073 -21.01 < 2e-16
(Intercept):有偶 1.38675 0.18329 7.57 3.9e-14
年次:喪偶 -0.09035 0.00789 -11.45 < 2e-16
年次:未婚 0.10024 0.00431 23.25 < 2e-16
年次:有偶 0.00666 0.00336 1.98 0.048
Log-Likelihood: -8920
McFadden R^2: 0.071
Likelihood ratio test : chisq = 1360 (p.value = <2e-16)
#勝算比信賴區間
cbind(b_hat = exp(coef(ml0)), b_ci = exp(confint(ml0)))
b_hat 2.5 % 97.5 %
(Intercept):喪偶 27.0266067 12.7646651 57.2233947
(Intercept):未婚 0.0051504 0.0031508 0.0084191
(Intercept):有偶 4.0018180 2.7940692 5.7316215
年次:喪偶 0.9136097 0.8995914 0.9278465
年次:未婚 1.1054378 1.0961378 1.1148166
年次:有偶 1.0066846 1.0000731 1.0133398
#準備畫圖,先得到預測值
newdta <- with(dta, data.frame(
年次 = rep(seq(from = 39.5, to = 69.5, length = 7), 4),
婚姻 = factor(rep(c('有偶', '未婚', '離婚', '喪偶'), each = 7) ) ) )
newdta_w <- mlogit.data(newdta, shape = 'wide', choice = '婚姻')
ml0_phat <- as.data.frame(predict(ml0, newdta_w)[1:7, ])
#需轉換資料形式,載入 reshape 套件
library(reshape2)
#製造年次婚姻列聯表、將預測值轉成需要格式、彙整在一起
obs_p <- data.frame(with(dta, prop.table(table(年次, 婚姻), 1)))
dta_op <- data.frame(obs_p, melt(ml0_phat))
#繪圖
#圖11.5
ggplot(data = dta_op, aes(x = 年次, y = Freq, shape = 婚姻)) +
geom_point(size = 3) +
stat_smooth(aes(x = 年次, y = value, group = 婚姻), method = 'loess', se = F, size = 1) +
labs(x = '年次', y = '分類百分比') +
theme(legend.position = c(.92, .88), text = element_text(family = "BiauKai"))
The data set days absent contains attendance data on 314 high school juniors from high schools. The response variable of interest is days absent from school. The standardized math score for each student was given as well as the program variable indicating the type of instructional program in which the student was enrolled.
Column 1: Student ID Column 2: Gender ID Column 3: Program ID Column 4: Standardized mathematics score Column 5: Days absent from school
The interest is to model the number of days of absence from school using student’s gender, the standardized test in math, and the type of program in which the student is enrolled.
dta <- read.csv("data/days_absent.csv", header=T)
dta$Gender <- factor(dta$Gender, levels = c("female", "male"),
labels = c("Female", "Male"))
head(dta)
ID Gender Program Math Days
1 S1001 Male Academic 63 4
2 S1002 Male Academic 27 4
3 S1003 Female Academic 20 2
4 S1004 Female Academic 16 3
5 S1005 Female Academic 2 3
6 S1006 Female Academic 71 13
ggplot(dta,
aes(Days)) +
geom_histogram(binwidth = 1, fill=8, col=1)+
facet_grid(Gender ~ Program) +
labs(x="Days absent",
y="Frequency") +
theme_minimal()+
theme(legend.position = c(.95, .9))
aggregate(cbind(Days, Math) ~ Gender+Program, dta, mean)
Gender Program Days Math
1 Female Academic 7.7590 43.699
2 Male Academic 6.1190 41.500
3 Female General 12.3636 44.409
4 Male General 8.5556 45.333
5 Female Vocational 2.7091 58.000
6 Male Vocational 2.6346 58.846
aggregate(cbind(Days, Math) ~ Gender+Program, dta, var)
Gender Program Days Math
1 Female Academic 68.551 586.73
2 Male Academic 41.817 689.82
3 Female General 84.338 804.54
4 Male General 41.673 923.53
5 Female Vocational 10.284 512.48
6 Male Vocational 18.080 364.92
ggplot(dta,
aes(Math, Days,
color=Gender)) +
stat_smooth(method="glm",
formula=y ~ x,
method.args=list(family=poisson),
size=rel(.5)) +
geom_point(pch=20, alpha=.5) +
facet_grid(. ~ Program) +
labs(y="Days absent",
x="Math score") +
theme_minimal()+
theme(legend.position = c(.95, .9))
## Poisson Model
sjPlot::tab_model(m0 <- glm(Days ~ Program*Gender*Math, data=dta,
family=poisson(link=log)), show.se=T, show.r2=F, show.obs=F)
| Days | ||||
|---|---|---|---|---|
| Predictors | Incidence Rate Ratios | std. Error | CI | p |
| (Intercept) | 10.43 | 0.08 | 8.94 – 12.14 | <0.001 |
| Program [General] | 1.47 | 0.13 | 1.13 – 1.91 | 0.004 |
| Program [Vocational] | 0.23 | 0.25 | 0.14 – 0.36 | <0.001 |
| Gender [Male] | 0.85 | 0.11 | 0.69 – 1.06 | 0.146 |
| Math | 0.99 | 0.00 | 0.99 – 1.00 | <0.001 |
|
Program [General] * Gender [Male] |
0.97 | 0.20 | 0.65 – 1.43 | 0.863 |
|
Program [Vocational] * Gender [Male] |
1.59 | 0.37 | 0.76 – 3.27 | 0.214 |
| Program [General] * Math | 1.00 | 0.00 | 1.00 – 1.01 | 0.479 |
|
Program [Vocational] * Math |
1.01 | 0.00 | 1.00 – 1.02 | 0.019 |
| Gender [Male] * Math | 1.00 | 0.00 | 0.99 – 1.00 | 0.274 |
|
(Program [General] Gender [Male]) Math |
1.00 | 0.00 | 0.99 – 1.01 | 0.692 |
|
(Program [Vocational] Gender [Male]) Math |
1.00 | 0.01 | 0.98 – 1.01 | 0.634 |
sjPlot::tab_model(m1 <- update(m0, . ~ . - Program:Gender:Math, family=poisson(link=log)),
show.se=T, show.r2=F, show.obs=F)
| Days | ||||
|---|---|---|---|---|
| Predictors | Incidence Rate Ratios | std. Error | CI | p |
| (Intercept) | 10.27 | 0.07 | 8.90 – 11.82 | <0.001 |
| Program [General] | 1.52 | 0.11 | 1.22 – 1.89 | <0.001 |
| Program [Vocational] | 0.24 | 0.20 | 0.16 – 0.35 | <0.001 |
| Gender [Male] | 0.88 | 0.09 | 0.73 – 1.05 | 0.166 |
| Math | 0.99 | 0.00 | 0.99 – 1.00 | <0.001 |
|
Program [General] * Gender [Male] |
0.91 | 0.12 | 0.72 – 1.14 | 0.396 |
|
Program [Vocational] * Gender [Male] |
1.36 | 0.14 | 1.03 – 1.78 | 0.027 |
| Program [General] * Math | 1.00 | 0.00 | 1.00 – 1.01 | 0.560 |
|
Program [Vocational] * Math |
1.01 | 0.00 | 1.00 – 1.01 | 0.008 |
| Gender [Male] * Math | 1.00 | 0.00 | 0.99 – 1.00 | 0.066 |
sjPlot::tab_model(m2 <- update(m1, . ~ . - Gender:Math, family=poisson(link=log)),
show.se=T, show.r2=F, show.obs=F)
| Days | ||||
|---|---|---|---|---|
| Predictors | Incidence Rate Ratios | std. Error | CI | p |
| (Intercept) | 10.98 | 0.06 | 9.71 – 12.39 | <0.001 |
| Program [General] | 1.50 | 0.11 | 1.21 – 1.86 | <0.001 |
| Program [Vocational] | 0.25 | 0.19 | 0.17 – 0.36 | <0.001 |
| Gender [Male] | 0.77 | 0.06 | 0.69 – 0.87 | <0.001 |
| Math | 0.99 | 0.00 | 0.99 – 0.99 | <0.001 |
|
Program [General] * Gender [Male] |
0.90 | 0.12 | 0.71 – 1.13 | 0.369 |
|
Program [Vocational] * Gender [Male] |
1.26 | 0.13 | 0.97 – 1.63 | 0.080 |
| Program [General] * Math | 1.00 | 0.00 | 1.00 – 1.01 | 0.474 |
|
Program [Vocational] * Math |
1.01 | 0.00 | 1.00 – 1.01 | 0.006 |
sjPlot::tab_model(m3 <- update(m2, . ~ . - Program:Gender, family=poisson(link=log)),
show.se=T, show.r2=F, show.obs=F)
| Days | ||||
|---|---|---|---|---|
| Predictors | Incidence Rate Ratios | std. Error | CI | p |
| (Intercept) | 10.91 | 0.06 | 9.70 – 12.25 | <0.001 |
| Program [General] | 1.45 | 0.10 | 1.19 – 1.76 | <0.001 |
| Program [Vocational] | 0.27 | 0.18 | 0.19 – 0.39 | <0.001 |
| Gender [Male] | 0.78 | 0.05 | 0.71 – 0.86 | <0.001 |
| Math | 0.99 | 0.00 | 0.99 – 0.99 | <0.001 |
| Program [General] * Math | 1.00 | 0.00 | 1.00 – 1.01 | 0.479 |
|
Program [Vocational] * Math |
1.01 | 0.00 | 1.00 – 1.01 | 0.005 |
anova(m3, m2, m1, m0, test="Chisq")
Analysis of Deviance Table
Model 1: Days ~ Program + Gender + Math + Program:Math
Model 2: Days ~ Program + Gender + Math + Program:Gender + Program:Math
Model 3: Days ~ Program + Gender + Math + Program:Gender + Program:Math +
Gender:Math
Model 4: Days ~ Program * Gender * Math
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 307 1739
2 305 1734 2 4.82 0.090
3 304 1731 1 3.39 0.066
4 302 1730 2 0.31 0.855
anova(m3, m1, test="Chisq")
Analysis of Deviance Table
Model 1: Days ~ Program + Gender + Math + Program:Math
Model 2: Days ~ Program + Gender + Math + Program:Gender + Program:Math +
Gender:Math
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 307 1739
2 304 1731 3 8.21 0.042
(m1$null.deviance - m1$deviance)/m1$null.deviance
[1] 0.2196
Results revealed by poisson model showed that Program and Math were significantly interacted with Days absent, especially in Vocational program. Program and Math were significantly associted with Days absent, such effect was not observed in Gender.
pchisq(deviance(m1), df=m1$df.residual, lower.tail=FALSE)
[1] 7.0591e-198
dta_m1 <- data.frame(dta, predict(m1, se.fit=TRUE)) %>%
mutate(yhat=exp(fit),
lb=exp(fit - 1.96 * se.fit),
ub=exp(fit + 1.96 * se.fit))
ggplot(dta_m1,
aes(Math, yhat, color=Gender)) +
geom_point(aes(Math, Days), pch=20) +
geom_ribbon(aes(ymin=lb, ymax=ub),
alpha=0.2) +
geom_line() +
facet_grid(. ~ Program) +
labs(x="Math Score",
y="Days Absent")+
theme_minimal()+
theme(legend.position = c(.9, .85))
Use the General health questionnaire example as a template to guide your analysis of the American national election survey data.
This dataset is based on 2012 American National Election Survey. The response variable of interest is whether the respondent had voted in 2008. There are 2 covariates: age and the marital status of the respondent.
Source: nes{poliscidata}
Column 1: Age (in year) Column 2: Marital status, 1 = Married, 0 = Not married Column 3: Number of respondents who did not vote in 2008 Column 4: Number of respondents voted in 2008
dta <- read.csv("data/nes_2008.csv", header = T)
head(dta)
Age Married No Yes
1 18 0 164 11
2 21 0 126 153
3 25 0 83 179
4 30 0 85 134
5 36 0 51 128
6 40 0 38 148
#
dta <- dta %>%
mutate(Total = Yes + No)
dta$Married <- factor(dta$Married, levels = c('0', '1'),
labels = c('Married', 'Not married'))
#
str(dta)
'data.frame': 26 obs. of 5 variables:
$ Age : int 18 21 25 30 36 40 45 50 55 60 ...
$ Married: Factor w/ 2 levels "Married","Not married": 1 1 1 1 1 1 1 1 1 1 ...
$ No : int 164 126 83 85 51 38 47 63 53 41 ...
$ Yes : int 11 153 179 134 128 148 138 220 228 198 ...
$ Total : int 175 279 262 219 179 186 185 283 281 239 ...
# numerical summaries
# Age effect
aggregate(cbind(Yes, No) ~ Age, data = dta, sum)
Age Yes No
1 18 11 172
2 21 173 155
3 25 276 146
4 30 315 139
5 36 298 103
6 40 393 84
7 45 378 86
8 50 529 110
9 55 562 105
10 60 510 74
11 65 469 49
12 70 293 38
13 75 332 29
# Married effect
aggregate(cbind(Yes, No) ~ Married, data = dta, sum)
Married Yes No
1 Married 1861 814
2 Not married 2678 476
# set black and white theme
ot <- theme_set(theme_bw())
#
ggplot(dta, aes(Age, Yes/Total, color = Married)) +
geom_jitter() +
stat_smooth(method = "glm", formula = y ~ x,
method.args = list(family = binomial),
aes(weight = Total), se = FALSE) +
labs(x = "Age (in years)", y = "Proportion of Vote") +
theme(legend.position = c(.9, .15))
#logistic regression models
# slopes and intercepts dependent on gender
summary(m0 <- glm(cbind(Yes, No) ~ Age*Married, data = dta,
family = "binomial"))
Call:
glm(formula = cbind(Yes, No) ~ Age * Married, family = "binomial",
data = dta)
Deviance Residuals:
Min 1Q Median 3Q Max
-11.798 -1.256 0.081 1.105 4.756
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.91879 0.11649 -7.89 3.1e-15
Age 0.04299 0.00281 15.28 < 2e-16
MarriedNot married 0.35860 0.20525 1.75 0.081
Age:MarriedNot married 0.00508 0.00460 1.11 0.269
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 882.94 on 25 degrees of freedom
Residual deviance: 229.51 on 22 degrees of freedom
AIC: 368.3
Number of Fisher Scoring iterations: 4
summary(m1 <- glm(cbind(Yes, No) ~ Age+Married, data = dta,
family = "binomial"))
Call:
glm(formula = cbind(Yes, No) ~ Age + Married, family = "binomial",
data = dta)
Deviance Residuals:
Min 1Q Median 3Q Max
-11.566 -1.154 -0.051 1.142 4.959
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.99247 0.09587 -10.35 <2e-16
Age 0.04492 0.00223 20.15 <2e-16
MarriedNot married 0.57248 0.06929 8.26 <2e-16
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 882.94 on 25 degrees of freedom
Residual deviance: 230.73 on 23 degrees of freedom
AIC: 367.6
Number of Fisher Scoring iterations: 4
summary(m2 <- glm(cbind(Yes, No) ~ Age, data = dta,
family = "binomial"))
Call:
glm(formula = cbind(Yes, No) ~ Age, family = "binomial", data = dta)
Deviance Residuals:
Min 1Q Median 3Q Max
-12.53 -2.15 0.34 1.59 4.34
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.91168 0.09490 -9.61 <2e-16
Age 0.04942 0.00218 22.64 <2e-16
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 882.94 on 25 degrees of freedom
Residual deviance: 299.48 on 24 degrees of freedom
AIC: 434.3
Number of Fisher Scoring iterations: 4
anova(m0, m1, m2, test = "Chi")
Analysis of Deviance Table
Model 1: cbind(Yes, No) ~ Age * Married
Model 2: cbind(Yes, No) ~ Age + Married
Model 3: cbind(Yes, No) ~ Age
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 22 230
2 23 231 -1 -1.2 0.27
3 24 300 -1 -68.7 <2e-16
summary(m3 <- glm(cbind(Yes, No) ~ Married, data = dta,
family = "binomial"))
Call:
glm(formula = cbind(Yes, No) ~ Married, family = "binomial",
data = dta)
Deviance Residuals:
Min 1Q Median 3Q Max
-17.778 -3.044 0.959 3.209 5.367
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.8269 0.0420 19.7 <2e-16
MarriedNot married 0.9005 0.0651 13.8 <2e-16
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 882.94 on 25 degrees of freedom
Residual deviance: 684.88 on 24 degrees of freedom
AIC: 819.7
Number of Fisher Scoring iterations: 4
anova(m1, m3, test = "Chi")
Analysis of Deviance Table
Model 1: cbind(Yes, No) ~ Age + Married
Model 2: cbind(Yes, No) ~ Married
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 23 231
2 24 685 -1 -454 <2e-16
sjPlot::tab_model(m1, show.se=T, show.r2=F, show.obs=F)
| cbind(Yes, No) | ||||
|---|---|---|---|---|
| Predictors | Odds Ratios | std. Error | CI | p |
| (Intercept) | 0.37 | 0.10 | 0.31 – 0.45 | <0.001 |
| Age | 1.05 | 0.00 | 1.04 – 1.05 | <0.001 |
| Married [Not married] | 1.77 | 0.07 | 1.55 – 2.03 | <0.001 |
Results revealed by GLM showed that Age and Marrige were postively associted with the proportion of Vote. Conditional on Age, one year increases in Age is associated with the increase of 4% to 5% in the odds for having a vote. In addition to Marrige, getting Married is associated with the increase of 55% to 103% in the odds for voting.
# fortify data with proportions and predicted probabilities
dta_m1 <- dta %>%
mutate(Prop = Yes/(Total),
phat = predict(m1, newdata = dta[, c(1,2)],
type = "response"))
# plot
ggplot(dta_m1, aes(Age, Prop, color = Married)) +
geom_jitter() +
geom_line(aes(Age, phat, color = Married)) +
labs(x = "Age (in years)", y = "Proportion of Vote") +
theme(legend.position = c(.9, .15))
dta_m <- broom::augment(m1)
# residual plot
ggplot(dta_m,
aes(Age, .std.resid,
color=Married)) +
geom_point(alpha=.5) +
scale_y_continuous(breaks=seq(-3, 3, by=1),
limits=c(-3.3, 3.3))+
labs(x = "Age (in years)", y = "Proportion of Vote") +
theme_minimal()+
theme(legend.position = c(.9, .9))