1 Homework 1:

Reproduce the results of analysis reported in Chapter 11 of Behavioral data analysis with R (Cheng & Sheu, 2015) with this R script and the married dataset .

1.1 Load data file

pacman::p_load(knitr, simstudy, tidyverse, nlme, rms, ggplot2, lme4, binomTools)

# input data
dta <- read.table('data/married.txt', header = T, fileEncoding = "Big-5")

#
head(dta)
      教育 性別 婚姻 年齡
1 國小以下   女 未婚 32.5
2     專科   男 有偶 57.5
3     國中   男 有偶 42.5
4 國小以下   男 有偶 42.5
5     高中   女 喪偶 62.5
6     高中   女 有偶 52.5
summary(dta)
     教育               性別               婚姻                年齡     
 Length:10000       Length:10000       Length:10000       Min.   :32.5  
 Class :character   Class :character   Class :character   1st Qu.:37.5  
 Mode  :character   Mode  :character   Mode  :character   Median :47.5  
                                                          Mean   :46.6  
                                                          3rd Qu.:52.5  
                                                          Max.   :62.5  

1.2 Order and Relabel

dta$教育 <- factor(dta$教育, levels = c('國小以下', '國中', '高中',
                                              '專科', '大學以上'))

#重排婚姻,由多到少
dta$婚姻 <- factor(dta$婚姻, levels = c('有偶', '未婚', '離婚', '喪偶'))

#將年齡換成年次
dta$年次 <- (102-dta$年齡)

#將婚姻狀況分為未婚與結過婚
dta$結過婚 <- ifelse(dta$婚姻 == '未婚', 0, 1)

1.3 Sumaary the proportion of each variable

apply(dta, 2, function(x) table(x)/dim(dta)[1])
$教育
x
大學以上     高中 國小以下     國中     專科 
  0.1070   0.3297   0.2406   0.1360   0.1867 

$性別
x
    男     女 
0.4927 0.5073 

$婚姻
x
  離婚   喪偶   未婚   有偶 
0.1143 0.0341 0.1966 0.6550 

$年齡
x
  32.5   37.5   42.5   47.5   52.5   57.5   62.5 
0.1619 0.1516 0.1435 0.1434 0.1558 0.1306 0.1132 

$年次
x
  39.5   44.5   49.5   54.5   59.5   64.5   69.5 
0.1132 0.1306 0.1558 0.1434 0.1435 0.1516 0.1619 

$結過婚
x
     0      1 
0.1966 0.8034 

1.4 Descriptive statistics - Table

#單一變項與結過婚與否的列聯表
#程式報表11.4
ftable(dta, row.vars = '教育', col.vars = '結過婚')
         結過婚    0    1
教育                     
國小以下         726 1680
國中             273 1087
高中             584 2713
專科             311 1556
大學以上          72  998
ftable(dta, row.vars = '性別', col.vars = '結過婚')
     結過婚    0    1
性別                 
男          1072 3855
女           894 4179
ftable(dta, row.vars = '年次', col.vars = '結過婚')
     結過婚    0    1
年次                 
39.5          64 1068
44.5         105 1201
49.5         136 1422
54.5         201 1233
59.5         263 1172
64.5         422 1094
69.5         775  844
#兩個變項與結過婚與否的列聯表
#程式報表11.5
ftable(dta, row.vars = c('教育', '性別'), col.vars = '結過婚')
              結過婚    0    1
教育     性別                 
國小以下 男           327  906
         女           399  774
國中     男           118  563
         女           155  524
高中     男           349 1241
         女           235 1472
專科     男           226  810
         女            85  746
大學以上 男            52  335
         女            20  663
ftable(dta, row.vars = c('教育', '年次'), col.vars = '結過婚')
              結過婚   0   1
教育     年次               
國小以下 39.5          7 123
         44.5         16 150
         49.5         32 200
         54.5         51 250
         59.5         76 286
         64.5        154 350
         69.5        390 321
國中     39.5          5  85
         44.5         10 119
         49.5         17 166
         54.5         34 172
         59.5         50 204
         64.5         70 223
         69.5         87 118
高中     39.5         23 236
         44.5         37 368
         49.5         42 500
         54.5         65 497
         59.5         90 458
         64.5        132 366
         69.5        195 288
專科     39.5         14 146
         44.5         21 288
         49.5         37 425
         54.5         41 263
         59.5         43 204
         64.5         57 134
         69.5         98  96
大學以上 39.5         15 478
         44.5         21 276
         49.5          8 131
         54.5         10  51
         59.5          4  20
         64.5          9  21
         69.5          5  21
ftable(dta, row.vars = c('性別', '年次'), col.vars = '結過婚')
          結過婚   0   1
性別 年次               
男   39.5         21 492
     44.5         56 604
     49.5         74 714
     54.5        108 630
     59.5        140 567
     64.5        235 521
     69.5        438 327
女   39.5         43 576
     44.5         49 597
     49.5         62 708
     54.5         93 603
     59.5        123 605
     64.5        187 573
     69.5        337 517
#三個變項與結過婚與否的列聯表,也可以換成比率
ftable(dta, row.vars = c(1, 2, 5), col.vars = '結過婚')
                   結過婚   0   1
教育     性別 年次               
國小以下 男   39.5          1  76
              44.5          4  92
              49.5          9 114
              54.5         23 146
              59.5         23 162
              64.5         67 195
              69.5        200 121
         女   39.5          6  47
              44.5         12  58
              49.5         23  86
              54.5         28 104
              59.5         53 124
              64.5         87 155
              69.5        190 200
國中     男   39.5          0  53
              44.5          5  81
              49.5          7  98
              54.5         13 101
              59.5         21  97
              64.5         32  88
              69.5         40  45
         女   39.5          5  32
              44.5          5  38
              49.5         10  68
              54.5         21  71
              59.5         29 107
              64.5         38 135
              69.5         47  73
高中     男   39.5          8 119
              44.5         18 182
              49.5         25 243
              54.5         36 225
              59.5         60 203
              64.5         83 149
              69.5        119 120
         女   39.5         15 117
              44.5         19 186
              49.5         17 257
              54.5         29 272
              59.5         30 255
              64.5         49 217
              69.5         76 168
專科     男   39.5          3  68
              44.5         14 164
              49.5         27 225
              54.5         28 135
              59.5         34  99
              64.5         45  80
              69.5         75  39
         女   39.5         11  78
              44.5          7 124
              49.5         10 200
              54.5         13 128
              59.5          9 105
              64.5         12  54
              69.5         23  57
大學以上 男   39.5          9 176
              44.5         15  85
              49.5          6  34
              54.5          8  23
              59.5          2   6
              64.5          8   9
              69.5          4   2
         女   39.5          6 302
              44.5          6 191
              49.5          2  97
              54.5          2  28
              59.5          2  14
              64.5          1  12
              69.5          1  19
prop.table(ftable(dta, row.vars=c(1, 2, 5), col.vars = '結過婚'), 1)
                   結過婚        0        1
教育     性別 年次                         
國小以下 男   39.5        0.012987 0.987013
              44.5        0.041667 0.958333
              49.5        0.073171 0.926829
              54.5        0.136095 0.863905
              59.5        0.124324 0.875676
              64.5        0.255725 0.744275
              69.5        0.623053 0.376947
         女   39.5        0.113208 0.886792
              44.5        0.171429 0.828571
              49.5        0.211009 0.788991
              54.5        0.212121 0.787879
              59.5        0.299435 0.700565
              64.5        0.359504 0.640496
              69.5        0.487179 0.512821
國中     男   39.5        0.000000 1.000000
              44.5        0.058140 0.941860
              49.5        0.066667 0.933333
              54.5        0.114035 0.885965
              59.5        0.177966 0.822034
              64.5        0.266667 0.733333
              69.5        0.470588 0.529412
         女   39.5        0.135135 0.864865
              44.5        0.116279 0.883721
              49.5        0.128205 0.871795
              54.5        0.228261 0.771739
              59.5        0.213235 0.786765
              64.5        0.219653 0.780347
              69.5        0.391667 0.608333
高中     男   39.5        0.062992 0.937008
              44.5        0.090000 0.910000
              49.5        0.093284 0.906716
              54.5        0.137931 0.862069
              59.5        0.228137 0.771863
              64.5        0.357759 0.642241
              69.5        0.497908 0.502092
         女   39.5        0.113636 0.886364
              44.5        0.092683 0.907317
              49.5        0.062044 0.937956
              54.5        0.096346 0.903654
              59.5        0.105263 0.894737
              64.5        0.184211 0.815789
              69.5        0.311475 0.688525
專科     男   39.5        0.042254 0.957746
              44.5        0.078652 0.921348
              49.5        0.107143 0.892857
              54.5        0.171779 0.828221
              59.5        0.255639 0.744361
              64.5        0.360000 0.640000
              69.5        0.657895 0.342105
         女   39.5        0.123596 0.876404
              44.5        0.053435 0.946565
              49.5        0.047619 0.952381
              54.5        0.092199 0.907801
              59.5        0.078947 0.921053
              64.5        0.181818 0.818182
              69.5        0.287500 0.712500
大學以上 男   39.5        0.048649 0.951351
              44.5        0.150000 0.850000
              49.5        0.150000 0.850000
              54.5        0.258065 0.741935
              59.5        0.250000 0.750000
              64.5        0.470588 0.529412
              69.5        0.666667 0.333333
         女   39.5        0.019481 0.980519
              44.5        0.030457 0.969543
              49.5        0.020202 0.979798
              54.5        0.066667 0.933333
              59.5        0.125000 0.875000
              64.5        0.076923 0.923077
              69.5        0.050000 0.950000
#換種方式看
#程式報表11.6
ftable(dta, row.vars = c(1, 2), col.vars = c('年次','結過婚'))
              年次   39.5     44.5     49.5     54.5     59.5     64.5     69.5    
              結過婚    0   1    0   1    0   1    0   1    0   1    0   1    0   1
教育     性別                                                                      
國小以下 男             1  76    4  92    9 114   23 146   23 162   67 195  200 121
         女             6  47   12  58   23  86   28 104   53 124   87 155  190 200
國中     男             0  53    5  81    7  98   13 101   21  97   32  88   40  45
         女             5  32    5  38   10  68   21  71   29 107   38 135   47  73
高中     男             8 119   18 182   25 243   36 225   60 203   83 149  119 120
         女            15 117   19 186   17 257   29 272   30 255   49 217   76 168
專科     男             3  68   14 164   27 225   28 135   34  99   45  80   75  39
         女            11  78    7 124   10 200   13 128    9 105   12  54   23  57
大學以上 男             9 176   15  85    6  34    8  23    2   6    8   9    4   2
         女             6 302    6 191    2  97    2  28    2  14    1  12    1  19
ftable(xtabs(結過婚 ~ 性別 + 教育 + as.factor(年次), data = dta))
              as.factor(年次) 39.5 44.5 49.5 54.5 59.5 64.5 69.5
性別 教育                                                       
男   國小以下                   76   92  114  146  162  195  121
     國中                       53   81   98  101   97   88   45
     高中                      119  182  243  225  203  149  120
     專科                       68  164  225  135   99   80   39
     大學以上                  176   85   34   23    6    9    2
女   國小以下                   47   58   86  104  124  155  200
     國中                       32   38   68   71  107  135   73
     高中                      117  186  257  272  255  217  168
     專科                       78  124  200  128  105   54   57
     大學以上                  302  191   97   28   14   12   19

1.5 Data Visualization

#圖示不同年次、性別與教育程度的結過婚比率,加上信賴區間
#圖11.1
ggplot(data = dta, aes(x = 年次, y = 結過婚, shape = 性別, color = 性別)) +
  stat_summary(fun.data = mean_se, size = 1) +
  facet_grid(. ~ 教育 ) +
  labs(x = '年次', y = '結過婚百分比') +
  theme(legend.position = c (.95, .1), text=element_text(family = "BiauKai"))

## Model 1: Simple model - binomial

#簡單邏輯迴歸
#程式報表11.7
summary(m1 <- glm(結過婚 ~ 年次, data = dta, family = binomial) )

Call:
glm(formula = 結過婚 ~ 年次, family = binomial, data = dta)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.550   0.281   0.450   0.704   1.054  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept)  7.04825    0.19472    36.2   <2e-16
年次        -0.09713    0.00319   -30.4   <2e-16

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 9913.1  on 9999  degrees of freedom
Residual deviance: 8766.2  on 9998  degrees of freedom
AIC: 8770

Number of Fisher Scoring iterations: 5
#勝算比的信賴區間 
#程式報表11.8
exp(cbind(coef=coef(m1), confint(m1))) 
                  coef     2.5 %     97.5 %
(Intercept) 1150.84599 788.68296 1692.12851
年次           0.90743   0.90173    0.91308

1.6 Model 2: add gender and educaion effect

#邏輯迴歸,加入性別與教育程度
summary(m_full <- glm(結過婚 ~ 年次 * 性別 * 教育 , data = dta, family = binomial))

Call:
glm(formula = 結過婚 ~ 年次 * 性別 * 教育, family = binomial, 
    data = dta)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-3.083   0.233   0.461   0.658   1.392  

Coefficients:
                         Estimate Std. Error z value Pr(>|z|)
(Intercept)               11.2557     0.8070   13.95  < 2e-16
年次                      -0.1649     0.0125  -13.21  < 2e-16
性別女                    -6.4813     0.9560   -6.78  1.2e-11
教育國中                  -2.5651     1.2280   -2.09  0.03672
教育高中                  -4.0707     0.9437   -4.31  1.6e-05
教育專科                  -3.1143     1.0069   -3.09  0.00198
教育大學以上              -4.4320     1.1943   -3.71  0.00021
年次:性別女                0.0980     0.0149    6.58  4.7e-11
年次:教育國中              0.0436     0.0195    2.23  0.02565
年次:教育高中              0.0629     0.0149    4.24  2.3e-05
年次:教育專科              0.0434     0.0161    2.69  0.00708
年次:教育大學以上          0.0596     0.0216    2.75  0.00587
性別女:教育國中            2.0453     1.5184    1.35  0.17800
性別女:教育高中            4.6353     1.1876    3.90  9.5e-05
性別女:教育專科            3.7434     1.3544    2.76  0.00571
性別女:教育大學以上        5.5344     1.7651    3.14  0.00172
年次:性別女:教育國中      -0.0276     0.0243   -1.14  0.25609
年次:性別女:教育高中      -0.0569     0.0189   -3.00  0.00267
年次:性別女:教育專科      -0.0355     0.0223   -1.59  0.11076
年次:性別女:教育大學以上  -0.0440     0.0336   -1.31  0.19018

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 9913.1  on 9999  degrees of freedom
Residual deviance: 8489.7  on 9980  degrees of freedom
AIC: 8530

Number of Fisher Scoring iterations: 6
#看看那個效果可刪除
drop1(m_full, test = 'Chisq')
Single term deletions

Model:
結過婚 ~ 年次 * 性別 * 教育
               Df Deviance  AIC  LRT Pr(>Chi)
<none>                8490 8530              
年次:性別:教育  4     8499 8531 9.46    0.051
summary(m_drop1 <- update(m_full, ~ . - 年次:性別:教育) )

Call:
glm(formula = 結過婚 ~ 年次 + 性別 + 教育 + 年次:性別 + 
    年次:教育 + 性別:教育, family = binomial, data = dta)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.896   0.235   0.454   0.644   1.423  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)
(Intercept)          9.78209    0.52788   18.53  < 2e-16
年次                -0.14189    0.00818  -17.34  < 2e-16
性別女              -4.30373    0.46279   -9.30  < 2e-16
教育國中            -1.33640    0.72972   -1.83  0.06704
教育高中            -1.92286    0.57381   -3.35  0.00081
教育專科            -1.61166    0.66309   -2.43  0.01508
教育大學以上        -2.79477    0.86989   -3.21  0.00131
年次:性別女          0.06382    0.00712    8.96  < 2e-16
年次:教育國中        0.02460    0.01156    2.13  0.03331
年次:教育高中        0.02879    0.00903    3.19  0.00143
年次:教育專科        0.01991    0.01068    1.86  0.06223
年次:教育大學以上    0.03330    0.01616    2.06  0.03941
性別女:教育國中      0.27061    0.17276    1.57  0.11725
性別女:教育高中      1.10442    0.13947    7.92  2.4e-15
性別女:教育專科      1.49109    0.17829    8.36  < 2e-16
性別女:教育大學以上  2.88247    0.31489    9.15  < 2e-16

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 9913.1  on 9999  degrees of freedom
Residual deviance: 8499.1  on 9984  degrees of freedom
AIC: 8531

Number of Fisher Scoring iterations: 6

1.7 Data Visualization

#邏輯迴歸模型診斷,將預測值與實際觀測值放在一起
#圖11.2
ggplot(data = dta, aes(x = 年次, y = 結過婚, shape = 性別, color = 性別)) +
  stat_summary(fun.data = mean_se, size = 1) +
  stat_summary(aes(y = fitted(m_drop1)), fun.y = mean, geom = 'line', size = 1) +
  facet_grid(. ~ 教育) +
  scale_x_continuous(breaks = seq(30, 70, by = 5)) +
  labs(x= '年次', y = '結過婚百分比') +
  theme(legend.position = c (.9, .1), text=element_text(family = "BiauKai"))

1.8 Change year from catagorical to continous variable

#修改邏輯迴歸,加入分散參數。載入 dispmod 套件,預備處理此事
library(dispmod)

#將資料整理成 dispmod 要的形式
dta_f <- data.frame(ftable(dta, row.vars = c(1, 2, 5), col.vars = '結過婚'))
dta_f1 <- subset(dta_f , 結過婚 == 1)
dta_f1$Tot <- dta_f[1:70, 5] + dta_f[71:140, 5]
dta_f1 <- dta_f1[,-4]

#將年次設定為連續變項
dta_f1$年次 <- as.numeric(as.vector(dta_f1$年次))

#看看資料
#程式報表11.9
head(dta_f1)
       教育 性別 年次 Freq Tot
71 國小以下   男 39.5   76  77
72     國中   男 39.5   53  53
73     高中   男 39.5  119 127
74     專科   男 39.5   68  71
75 大學以上   男 39.5  176 185
76 國小以下   女 39.5   47  53

1.9 Reanalyze

#以整理過資料重新分析,確認與前面相同
summary(m_last <- glm(cbind(Freq, Tot-Freq) ~ 年次 * 性別 * 教育 - 年次:性別:教育, 
                    data = dta_f1, family = binomial))

Call:
glm(formula = cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 - 
    年次:性別:教育, family = binomial, data = dta_f1)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-3.726  -0.857   0.020   0.587   3.198  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)
(Intercept)          9.78209    0.52788   18.53  < 2e-16
年次                -0.14189    0.00818  -17.34  < 2e-16
性別女              -4.30373    0.46279   -9.30  < 2e-16
教育國中            -1.33640    0.72972   -1.83  0.06704
教育高中            -1.92286    0.57381   -3.35  0.00081
教育專科            -1.61166    0.66309   -2.43  0.01508
教育大學以上        -2.79477    0.86989   -3.21  0.00131
年次:性別女          0.06382    0.00712    8.96  < 2e-16
年次:教育國中        0.02460    0.01156    2.13  0.03331
年次:教育高中        0.02879    0.00903    3.19  0.00143
年次:教育專科        0.01991    0.01068    1.86  0.06223
年次:教育大學以上    0.03330    0.01617    2.06  0.03941
性別女:教育國中      0.27061    0.17276    1.57  0.11725
性別女:教育高中      1.10442    0.13947    7.92  2.4e-15
性別女:教育專科      1.49109    0.17829    8.36  < 2e-16
性別女:教育大學以上  2.88247    0.31489    9.15  < 2e-16

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1530.13  on 69  degrees of freedom
Residual deviance:  116.19  on 54  degrees of freedom
AIC: 446.3

Number of Fisher Scoring iterations: 4
#加入分散參數
#程式報表11.10, 11.11
summary(m_lastd <- glm.binomial.disp(m_last))

Binomial overdispersed logit model fitting...
Iter.  1  phi: 0.0091443 
Iter.  2  phi: 0.0068854 
Iter.  3  phi: 0.0072398 
Iter.  4  phi: 0.0071797 
Iter.  5  phi: 0.0071898 
Iter.  6  phi: 0.0071881 
Iter.  7  phi: 0.0071883 
Iter.  8  phi: 0.0071883 
Iter.  9  phi: 0.0071883 
Iter.  10  phi: 0.0071883 
Iter.  11  phi: 0.0071883 
Converged after 11 iterations. 
Estimated dispersion parameter: 0.0071883 

Call:
glm(formula = cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 - 
    年次:性別:教育, family = binomial, data = dta_f1, weights = disp.weights)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3421  -0.5824   0.0167   0.4920   1.9120  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)
(Intercept)           9.4963     0.7537   12.60  < 2e-16
年次                 -0.1369     0.0119  -11.52  < 2e-16
性別女               -4.4153     0.6469   -6.82  8.8e-12
教育國中             -1.0035     0.9652   -1.04    0.299
教育高中             -1.8341     0.8506   -2.16    0.031
教育專科             -1.3231     0.9283   -1.43    0.154
教育大學以上         -2.6254     1.1439   -2.30    0.022
年次:性別女           0.0650     0.0102    6.41  1.5e-10
年次:教育國中         0.0188     0.0153    1.23    0.220
年次:教育高中         0.0269     0.0136    1.98    0.047
年次:教育專科         0.0149     0.0150    0.99    0.320
年次:教育大學以上     0.0307     0.0204    1.51    0.132
性別女:教育國中       0.2893     0.2487    1.16    0.245
性別女:教育高中       1.1327     0.2277    4.98  6.5e-07
性別女:教育專科       1.4829     0.2546    5.82  5.7e-09
性別女:教育大學以上   2.8741     0.4143    6.94  4.0e-12

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 633.454  on 69  degrees of freedom
Residual deviance:  52.675  on 54  degrees of freedom
AIC: 236.8

Number of Fisher Scoring iterations: 4

Call:
glm(formula = cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 - 
    年次:性別:教育, family = binomial, data = dta_f1, weights = disp.weights)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3421  -0.5824   0.0167   0.4920   1.9120  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)
(Intercept)           9.4963     0.7537   12.60  < 2e-16
年次                 -0.1369     0.0119  -11.52  < 2e-16
性別女               -4.4153     0.6469   -6.82  8.8e-12
教育國中             -1.0035     0.9652   -1.04    0.299
教育高中             -1.8341     0.8506   -2.16    0.031
教育專科             -1.3231     0.9283   -1.43    0.154
教育大學以上         -2.6254     1.1439   -2.30    0.022
年次:性別女           0.0650     0.0102    6.41  1.5e-10
年次:教育國中         0.0188     0.0153    1.23    0.220
年次:教育高中         0.0269     0.0136    1.98    0.047
年次:教育專科         0.0149     0.0150    0.99    0.320
年次:教育大學以上     0.0307     0.0204    1.51    0.132
性別女:教育國中       0.2893     0.2487    1.16    0.245
性別女:教育高中       1.1327     0.2277    4.98  6.5e-07
性別女:教育專科       1.4829     0.2546    5.82  5.7e-09
性別女:教育大學以上   2.8741     0.4143    6.94  4.0e-12

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 633.454  on 69  degrees of freedom
Residual deviance:  52.675  on 54  degrees of freedom
AIC: 236.8

Number of Fisher Scoring iterations: 4
#看看分散參數估計值
m_lastd$dispersion
[1] 0.0071883
#看看模型是否改善
anova(m_last, m_lastd)
Analysis of Deviance Table

Model 1: cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 - 年次:性別:教育
Model 2: cbind(Freq, Tot - Freq) ~ 年次 * 性別 * 教育 - 年次:性別:教育
  Resid. Df Resid. Dev Df Deviance
1        54      116.2            
2        54       52.7  0     63.5
#準備再一次模型診斷,收集資料,畫圖
#圖11.3
dta_mlastd <- data.frame(dta_f1, phat = fitted(m_lastd))
ggplot(data = dta_mlastd, aes(x = 年次, y = Freq/Tot,  color = 性別, shape = 性別)) +
  geom_point(size = 3) +
  stat_smooth(aes(x = 年次, y = phat), method = 'loess', se = F, size = 1) +
  facet_grid(. ~ 教育) +
  scale_x_continuous(breaks = seq(30, 70, by = 5)) +
  labs(x= '年次', y = '結過婚百分比') +
  theme(legend.position = 'NONE' )

#準備繪圖。為了繪製方便,模型中去掉截距
summary(m_lastd0 <- update(m_lastd, . ~ . - 1))

Call:
glm(formula = cbind(Freq, Tot - Freq) ~ 年次 + 性別 + 教育 + 
    年次:性別 + 年次:教育 + 性別:教育 - 1, family = binomial, 
    data = dta_f1, weights = disp.weights)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3421  -0.5824   0.0167   0.4920   1.9120  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)
年次                 -0.1369     0.0119  -11.52  < 2e-16
性別男                9.4963     0.7537   12.60  < 2e-16
性別女                5.0810     0.6358    7.99  1.3e-15
教育國中             -1.0035     0.9652   -1.04    0.299
教育高中             -1.8341     0.8506   -2.16    0.031
教育專科             -1.3231     0.9283   -1.43    0.154
教育大學以上         -2.6254     1.1439   -2.30    0.022
年次:性別女           0.0650     0.0102    6.41  1.5e-10
年次:教育國中         0.0188     0.0153    1.23    0.220
年次:教育高中         0.0269     0.0136    1.98    0.047
年次:教育專科         0.0149     0.0150    0.99    0.320
年次:教育大學以上     0.0307     0.0204    1.51    0.132
性別女:教育國中       0.2893     0.2487    1.16    0.245
性別女:教育高中       1.1327     0.2277    4.98  6.5e-07
性別女:教育專科       1.4829     0.2546    5.82  5.7e-09
性別女:教育大學以上   2.8741     0.4143    6.94  4.0e-12

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2504.909  on 70  degrees of freedom
Residual deviance:   52.675  on 54  degrees of freedom
AIC: 236.8

Number of Fisher Scoring iterations: 4
library(coefplot)

#繪製估計值
#圖11.4
coefplot(m_lastd0) + labs(x = '估計值', y = '變項', title = '') 

##多分類邏輯迴歸
#先看看年代與婚姻列聯表
#程式報表11.12
with(dta, prop.table(table(年次, 婚姻), 1))
      婚姻
年次      有偶      未婚      離婚      喪偶
  39.5 0.7199647 0.0565371 0.1139576 0.1095406
  44.5 0.7281776 0.0803982 0.1294028 0.0620214
  49.5 0.7137356 0.0872914 0.1553273 0.0436457
  54.5 0.7168759 0.1401674 0.1206416 0.0223152
  59.5 0.6710801 0.1832753 0.1358885 0.0097561
  64.5 0.6220317 0.2783641 0.0930079 0.0065963
  69.5 0.4558369 0.4786905 0.0580605 0.0074120
#載入 mlogit 套件,準備執行多分類邏輯迴歸
library(mlogit)

#將資料轉成 mlogit 需要格式
dta_w <- mlogit.data(dta, shape = 'wide', choice = '婚姻')

#看看資料
#程式報表11.13
head(dta_w)
~~~~~~~
 first 10 observations out of 40000 
~~~~~~~
       教育 性別  婚姻 年齡 年次 結過婚 chid  alt    idx
1  國小以下   女 FALSE 32.5 69.5      0    1 離婚 1:離婚
2  國小以下   女 FALSE 32.5 69.5      0    1 喪偶 1:喪偶
3  國小以下   女  TRUE 32.5 69.5      0    1 未婚 1:未婚
4  國小以下   女 FALSE 32.5 69.5      0    1 有偶 1:有偶
5      專科   男 FALSE 57.5 44.5      1    2 離婚 2:離婚
6      專科   男 FALSE 57.5 44.5      1    2 喪偶 2:喪偶
7      專科   男 FALSE 57.5 44.5      1    2 未婚 2:未婚
8      專科   男  TRUE 57.5 44.5      1    2 有偶 2:有偶
9      國中   男 FALSE 42.5 59.5      1    3 離婚 3:離婚
10     國中   男 FALSE 42.5 59.5      1    3 喪偶 3:喪偶

~~~ indexes ~~~~
   chid  alt
1     1 離婚
2     1 喪偶
3     1 未婚
4     1 有偶
5     2 離婚
6     2 喪偶
7     2 未婚
8     2 有偶
9     3 離婚
10    3 喪偶
indexes:  1, 2 
#只展示年代效果
#程式報表11.14
summary(ml0 <- mlogit(婚姻 ~ 0 | 年次, data = dta_w ) )

Call:
mlogit(formula = 婚姻 ~ 0 | 年次, data = dta_w, method = "nr")

Frequencies of alternatives:choice
  離婚   喪偶   未婚   有偶 
0.1143 0.0341 0.1966 0.6550 

nr method
7 iterations, 0h:0m:2s 
g'(-H)^-1g = 0.000103 
successive function values within tolerance limits 

Coefficients :
                 Estimate Std. Error z-value Pr(>|z|)
(Intercept):喪偶  3.29682    0.38273    8.61  < 2e-16
(Intercept):未婚 -5.26867    0.25073  -21.01  < 2e-16
(Intercept):有偶  1.38675    0.18329    7.57  3.9e-14
年次:喪偶        -0.09035    0.00789  -11.45  < 2e-16
年次:未婚         0.10024    0.00431   23.25  < 2e-16
年次:有偶         0.00666    0.00336    1.98    0.048

Log-Likelihood: -8920
McFadden R^2:  0.071 
Likelihood ratio test : chisq = 1360 (p.value = <2e-16)
#勝算比信賴區間
cbind(b_hat = exp(coef(ml0)), b_ci = exp(confint(ml0)))
                      b_hat      2.5 %     97.5 %
(Intercept):喪偶 27.0266067 12.7646651 57.2233947
(Intercept):未婚  0.0051504  0.0031508  0.0084191
(Intercept):有偶  4.0018180  2.7940692  5.7316215
年次:喪偶         0.9136097  0.8995914  0.9278465
年次:未婚         1.1054378  1.0961378  1.1148166
年次:有偶         1.0066846  1.0000731  1.0133398
#準備畫圖,先得到預測值
newdta <- with(dta, data.frame(
               年次 = rep(seq(from = 39.5, to = 69.5, length = 7), 4),
               婚姻 = factor(rep(c('有偶', '未婚', '離婚', '喪偶'), each = 7) ) ) )
 
newdta_w <- mlogit.data(newdta, shape = 'wide', choice = '婚姻')
ml0_phat <- as.data.frame(predict(ml0, newdta_w)[1:7, ])

#需轉換資料形式,載入 reshape 套件
library(reshape2)

#製造年次婚姻列聯表、將預測值轉成需要格式、彙整在一起
obs_p <- data.frame(with(dta, prop.table(table(年次, 婚姻), 1)))
dta_op <- data.frame(obs_p, melt(ml0_phat))
#繪圖
#圖11.5
ggplot(data = dta_op, aes(x = 年次, y = Freq, shape = 婚姻)) +
 geom_point(size = 3) + 
 stat_smooth(aes(x = 年次, y = value, group = 婚姻), method = 'loess', se = F, size = 1) +
 labs(x = '年次', y = '分類百分比') +
 theme(legend.position = c(.92, .88), text = element_text(family = "BiauKai"))

2 Homework 2:

The data set days absent contains attendance data on 314 high school juniors from high schools. The response variable of interest is days absent from school. The standardized math score for each student was given as well as the program variable indicating the type of instructional program in which the student was enrolled.

Column 1: Student ID Column 2: Gender ID Column 3: Program ID Column 4: Standardized mathematics score Column 5: Days absent from school

The interest is to model the number of days of absence from school using student’s gender, the standardized test in math, and the type of program in which the student is enrolled.

2.1 Load data file

dta <- read.csv("data/days_absent.csv", header=T)

dta$Gender <- factor(dta$Gender, levels = c("female", "male"),
                     labels = c("Female", "Male"))

head(dta)
     ID Gender  Program Math Days
1 S1001   Male Academic   63    4
2 S1002   Male Academic   27    4
3 S1003 Female Academic   20    2
4 S1004 Female Academic   16    3
5 S1005 Female Academic    2    3
6 S1006 Female Academic   71   13

2.2 Data Visualization - Days absent

ggplot(dta, 
       aes(Days)) +
 geom_histogram(binwidth = 1, fill=8, col=1)+
 facet_grid(Gender ~ Program) +
 labs(x="Days absent", 
      y="Frequency") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

2.3 Mean and Variance

aggregate(cbind(Days, Math) ~ Gender+Program, dta, mean)
  Gender    Program    Days   Math
1 Female   Academic  7.7590 43.699
2   Male   Academic  6.1190 41.500
3 Female    General 12.3636 44.409
4   Male    General  8.5556 45.333
5 Female Vocational  2.7091 58.000
6   Male Vocational  2.6346 58.846
aggregate(cbind(Days, Math) ~ Gender+Program, dta, var)
  Gender    Program   Days   Math
1 Female   Academic 68.551 586.73
2   Male   Academic 41.817 689.82
3 Female    General 84.338 804.54
4   Male    General 41.673 923.53
5 Female Vocational 10.284 512.48
6   Male Vocational 18.080 364.92

2.4 Effects of Program, Gender, and Math

ggplot(dta, 
       aes(Math, Days, 
           color=Gender)) +
 stat_smooth(method="glm",
             formula=y ~ x,
             method.args=list(family=poisson),
             size=rel(.5)) +
 geom_point(pch=20, alpha=.5) +
 facet_grid(. ~ Program) +
 labs(y="Days absent", 
      x="Math score") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

## Poisson Model

sjPlot::tab_model(m0 <- glm(Days ~ Program*Gender*Math, data=dta, 
                            family=poisson(link=log)), show.se=T, show.r2=F, show.obs=F)
  Days
Predictors Incidence Rate Ratios std. Error CI p
(Intercept) 10.43 0.08 8.94 – 12.14 <0.001
Program [General] 1.47 0.13 1.13 – 1.91 0.004
Program [Vocational] 0.23 0.25 0.14 – 0.36 <0.001
Gender [Male] 0.85 0.11 0.69 – 1.06 0.146
Math 0.99 0.00 0.99 – 1.00 <0.001
Program [General] *
Gender [Male]
0.97 0.20 0.65 – 1.43 0.863
Program [Vocational] *
Gender [Male]
1.59 0.37 0.76 – 3.27 0.214
Program [General] * Math 1.00 0.00 1.00 – 1.01 0.479
Program [Vocational] *
Math
1.01 0.00 1.00 – 1.02 0.019
Gender [Male] * Math 1.00 0.00 0.99 – 1.00 0.274
(Program [General]
Gender [Male])
Math
1.00 0.00 0.99 – 1.01 0.692
(Program [Vocational]
Gender [Male])
Math
1.00 0.01 0.98 – 1.01 0.634
sjPlot::tab_model(m1 <- update(m0, . ~ . - Program:Gender:Math, family=poisson(link=log)),
                  show.se=T, show.r2=F, show.obs=F)
  Days
Predictors Incidence Rate Ratios std. Error CI p
(Intercept) 10.27 0.07 8.90 – 11.82 <0.001
Program [General] 1.52 0.11 1.22 – 1.89 <0.001
Program [Vocational] 0.24 0.20 0.16 – 0.35 <0.001
Gender [Male] 0.88 0.09 0.73 – 1.05 0.166
Math 0.99 0.00 0.99 – 1.00 <0.001
Program [General] *
Gender [Male]
0.91 0.12 0.72 – 1.14 0.396
Program [Vocational] *
Gender [Male]
1.36 0.14 1.03 – 1.78 0.027
Program [General] * Math 1.00 0.00 1.00 – 1.01 0.560
Program [Vocational] *
Math
1.01 0.00 1.00 – 1.01 0.008
Gender [Male] * Math 1.00 0.00 0.99 – 1.00 0.066
sjPlot::tab_model(m2 <- update(m1, . ~ . - Gender:Math, family=poisson(link=log)),
                  show.se=T, show.r2=F, show.obs=F)
  Days
Predictors Incidence Rate Ratios std. Error CI p
(Intercept) 10.98 0.06 9.71 – 12.39 <0.001
Program [General] 1.50 0.11 1.21 – 1.86 <0.001
Program [Vocational] 0.25 0.19 0.17 – 0.36 <0.001
Gender [Male] 0.77 0.06 0.69 – 0.87 <0.001
Math 0.99 0.00 0.99 – 0.99 <0.001
Program [General] *
Gender [Male]
0.90 0.12 0.71 – 1.13 0.369
Program [Vocational] *
Gender [Male]
1.26 0.13 0.97 – 1.63 0.080
Program [General] * Math 1.00 0.00 1.00 – 1.01 0.474
Program [Vocational] *
Math
1.01 0.00 1.00 – 1.01 0.006
sjPlot::tab_model(m3 <- update(m2, . ~ . - Program:Gender, family=poisson(link=log)),
                  show.se=T, show.r2=F, show.obs=F)
  Days
Predictors Incidence Rate Ratios std. Error CI p
(Intercept) 10.91 0.06 9.70 – 12.25 <0.001
Program [General] 1.45 0.10 1.19 – 1.76 <0.001
Program [Vocational] 0.27 0.18 0.19 – 0.39 <0.001
Gender [Male] 0.78 0.05 0.71 – 0.86 <0.001
Math 0.99 0.00 0.99 – 0.99 <0.001
Program [General] * Math 1.00 0.00 1.00 – 1.01 0.479
Program [Vocational] *
Math
1.01 0.00 1.00 – 1.01 0.005
anova(m3, m2, m1, m0, test="Chisq")
Analysis of Deviance Table

Model 1: Days ~ Program + Gender + Math + Program:Math
Model 2: Days ~ Program + Gender + Math + Program:Gender + Program:Math
Model 3: Days ~ Program + Gender + Math + Program:Gender + Program:Math + 
    Gender:Math
Model 4: Days ~ Program * Gender * Math
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1       307       1739                     
2       305       1734  2     4.82    0.090
3       304       1731  1     3.39    0.066
4       302       1730  2     0.31    0.855
anova(m3, m1, test="Chisq")
Analysis of Deviance Table

Model 1: Days ~ Program + Gender + Math + Program:Math
Model 2: Days ~ Program + Gender + Math + Program:Gender + Program:Math + 
    Gender:Math
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1       307       1739                     
2       304       1731  3     8.21    0.042
(m1$null.deviance - m1$deviance)/m1$null.deviance
[1] 0.2196

Results revealed by poisson model showed that Program and Math were significantly interacted with Days absent, especially in Vocational program. Program and Math were significantly associted with Days absent, such effect was not observed in Gender.

pchisq(deviance(m1), df=m1$df.residual, lower.tail=FALSE)
[1] 7.0591e-198

2.5 Prediction - lack-of-fit

dta_m1 <- data.frame(dta, predict(m1, se.fit=TRUE)) %>% 
          mutate(yhat=exp(fit),
                 lb=exp(fit - 1.96 * se.fit),
                 ub=exp(fit + 1.96 * se.fit))
ggplot(dta_m1, 
       aes(Math, yhat, color=Gender)) +
 geom_point(aes(Math, Days), pch=20) +
 geom_ribbon(aes(ymin=lb, ymax=ub), 
             alpha=0.2) +
 geom_line() +
 facet_grid(. ~ Program) +
 labs(x="Math Score", 
      y="Days Absent")+
 theme_minimal()+
 theme(legend.position = c(.9, .85))

3 Homework 3:

Use the General health questionnaire example as a template to guide your analysis of the American national election survey data.

This dataset is based on 2012 American National Election Survey. The response variable of interest is whether the respondent had voted in 2008. There are 2 covariates: age and the marital status of the respondent.

Source: nes{poliscidata}

Column 1: Age (in year) Column 2: Marital status, 1 = Married, 0 = Not married Column 3: Number of respondents who did not vote in 2008 Column 4: Number of respondents voted in 2008

3.1 Load data file

dta <- read.csv("data/nes_2008.csv", header = T)

head(dta)
  Age Married  No Yes
1  18       0 164  11
2  21       0 126 153
3  25       0  83 179
4  30       0  85 134
5  36       0  51 128
6  40       0  38 148
#
dta <-  dta %>% 
  mutate(Total = Yes + No)

dta$Married <- factor(dta$Married, levels = c('0', '1'),
                      labels = c('Married', 'Not married'))

#
str(dta)
'data.frame':   26 obs. of  5 variables:
 $ Age    : int  18 21 25 30 36 40 45 50 55 60 ...
 $ Married: Factor w/ 2 levels "Married","Not married": 1 1 1 1 1 1 1 1 1 1 ...
 $ No     : int  164 126 83 85 51 38 47 63 53 41 ...
 $ Yes    : int  11 153 179 134 128 148 138 220 228 198 ...
 $ Total  : int  175 279 262 219 179 186 185 283 281 239 ...

3.2 Summary

# numerical summaries
# Age effect
aggregate(cbind(Yes, No) ~ Age, data = dta, sum)
   Age Yes  No
1   18  11 172
2   21 173 155
3   25 276 146
4   30 315 139
5   36 298 103
6   40 393  84
7   45 378  86
8   50 529 110
9   55 562 105
10  60 510  74
11  65 469  49
12  70 293  38
13  75 332  29
# Married effect
aggregate(cbind(Yes, No) ~ Married, data = dta, sum)
      Married  Yes  No
1     Married 1861 814
2 Not married 2678 476

3.3 Data Visualization

# set black and white theme
ot <- theme_set(theme_bw())

#
ggplot(dta, aes(Age, Yes/Total, color = Married)) + 
 geom_jitter() + 
 stat_smooth(method = "glm", formula = y ~ x, 
             method.args = list(family = binomial),
             aes(weight = Total), se = FALSE) +
 labs(x = "Age (in years)", y = "Proportion of Vote") +
 theme(legend.position = c(.9, .15))

3.4 Model - GLM

#logistic regression models
# slopes and intercepts dependent on gender
summary(m0 <- glm(cbind(Yes, No) ~ Age*Married, data = dta,
                  family = "binomial"))

Call:
glm(formula = cbind(Yes, No) ~ Age * Married, family = "binomial", 
    data = dta)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-11.798   -1.256    0.081    1.105    4.756  

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)
(Intercept)            -0.91879    0.11649   -7.89  3.1e-15
Age                     0.04299    0.00281   15.28  < 2e-16
MarriedNot married      0.35860    0.20525    1.75    0.081
Age:MarriedNot married  0.00508    0.00460    1.11    0.269

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 882.94  on 25  degrees of freedom
Residual deviance: 229.51  on 22  degrees of freedom
AIC: 368.3

Number of Fisher Scoring iterations: 4
summary(m1 <- glm(cbind(Yes, No) ~ Age+Married, data = dta,
                  family = "binomial"))

Call:
glm(formula = cbind(Yes, No) ~ Age + Married, family = "binomial", 
    data = dta)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-11.566   -1.154   -0.051    1.142    4.959  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)
(Intercept)        -0.99247    0.09587  -10.35   <2e-16
Age                 0.04492    0.00223   20.15   <2e-16
MarriedNot married  0.57248    0.06929    8.26   <2e-16

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 882.94  on 25  degrees of freedom
Residual deviance: 230.73  on 23  degrees of freedom
AIC: 367.6

Number of Fisher Scoring iterations: 4
summary(m2 <- glm(cbind(Yes, No) ~ Age, data = dta,
                  family = "binomial"))

Call:
glm(formula = cbind(Yes, No) ~ Age, family = "binomial", data = dta)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-12.53   -2.15    0.34    1.59    4.34  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.91168    0.09490   -9.61   <2e-16
Age          0.04942    0.00218   22.64   <2e-16

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 882.94  on 25  degrees of freedom
Residual deviance: 299.48  on 24  degrees of freedom
AIC: 434.3

Number of Fisher Scoring iterations: 4

3.5 Model Comparison

anova(m0, m1, m2, test = "Chi")
Analysis of Deviance Table

Model 1: cbind(Yes, No) ~ Age * Married
Model 2: cbind(Yes, No) ~ Age + Married
Model 3: cbind(Yes, No) ~ Age
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1        22        230                     
2        23        231 -1     -1.2     0.27
3        24        300 -1    -68.7   <2e-16
summary(m3 <- glm(cbind(Yes, No) ~ Married, data = dta,
                  family = "binomial"))

Call:
glm(formula = cbind(Yes, No) ~ Married, family = "binomial", 
    data = dta)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-17.778   -3.044    0.959    3.209    5.367  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)
(Intercept)          0.8269     0.0420    19.7   <2e-16
MarriedNot married   0.9005     0.0651    13.8   <2e-16

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 882.94  on 25  degrees of freedom
Residual deviance: 684.88  on 24  degrees of freedom
AIC: 819.7

Number of Fisher Scoring iterations: 4
anova(m1, m3, test = "Chi")
Analysis of Deviance Table

Model 1: cbind(Yes, No) ~ Age + Married
Model 2: cbind(Yes, No) ~ Married
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1        23        231                     
2        24        685 -1     -454   <2e-16
sjPlot::tab_model(m1, show.se=T, show.r2=F, show.obs=F)
  cbind(Yes, No)
Predictors Odds Ratios std. Error CI p
(Intercept) 0.37 0.10 0.31 – 0.45 <0.001
Age 1.05 0.00 1.04 – 1.05 <0.001
Married [Not married] 1.77 0.07 1.55 – 2.03 <0.001

Results revealed by GLM showed that Age and Marrige were postively associted with the proportion of Vote. Conditional on Age, one year increases in Age is associated with the increase of 4% to 5% in the odds for having a vote. In addition to Marrige, getting Married is associated with the increase of 55% to 103% in the odds for voting.

# fortify data with proportions and predicted probabilities
dta_m1 <- dta %>%
          mutate(Prop = Yes/(Total),
                 phat = predict(m1, newdata = dta[, c(1,2)], 
                                type = "response"))
# plot
ggplot(dta_m1, aes(Age, Prop, color = Married)) +
 geom_jitter() +
 geom_line(aes(Age, phat, color = Married)) +
 labs(x = "Age (in years)", y = "Proportion of Vote") +
 theme(legend.position = c(.9, .15))

3.6 Model Diagnostics

dta_m <- broom::augment(m1)
# residual plot
ggplot(dta_m, 
       aes(Age, .std.resid,
           color=Married)) + 
  geom_point(alpha=.5) +
  scale_y_continuous(breaks=seq(-3, 3, by=1), 
                     limits=c(-3.3, 3.3))+
  labs(x = "Age (in years)", y = "Proportion of Vote") +
  theme_minimal()+
  theme(legend.position = c(.9, .9))