Exercise: GLM

p = c("tidyverse", "magrittr", "MASS", "HH", "nnet")
lapply(p, library, character.only = TRUE)

Q1

#Q1
dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/domesticViolence.txt")
str(dat)

## 'data.frame':    313 obs. of  3 variables:
##  $ V1: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ V2: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V3: int  0 0 0 0 0 0 0 0 0 0 ...

colnames(dat) = c("fv", "ad", "ar")
dat %<>% mutate(treatment = ifelse(ar==1, "arrest", 
                                   ifelse(ad==1, "advise", "separate")))
dat$treatment %<>% as.factor
dat$treatment = factor(dat$treatment, levels = c("separate", "advise", "arrest"))

#Plot
ggplot(dat, aes(x = treatment, y = fv))+
  stat_summary(fun.data = mean_se)+
  labs(x = "Treatment", y = "Proportion of future violence")+
  theme_bw()

summary(m0 <- glm(factor(fv)~treatment, data = dat, family = "binomial", contrasts = "contr.sum"))

## 
## Call:
## glm(formula = factor(fv) ~ treatment, family = "binomial", data = dat, 
##     contrasts = "contr.sum")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7232  -0.7232  -0.6576  -0.4797   2.1067  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.2078     0.2235  -5.404 6.52e-08 ***
## treatmentadvise  -0.2136     0.3303  -0.647    0.518    
## treatmentarrest  -0.8963     0.4027  -2.226    0.026 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 297.08  on 312  degrees of freedom
## Residual deviance: 291.56  on 310  degrees of freedom
## AIC: 297.56
## 
## Number of Fisher Scoring iterations: 4

coef(m0)

##     (Intercept) treatmentadvise treatmentarrest 
##      -1.2078116      -0.2135741      -0.8963226

exp(-1.2078-0.8963)/(1+exp(-1.2078-0.8963))

## [1] 0.108699

Q3

dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/mentalSES.txt", h = T)
str(dat)

## 'data.frame':    1660 obs. of  8 variables:
##  $ Id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ses   : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ A     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ B     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ C     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ D     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ E     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mental: int  1 1 1 1 1 1 1 1 1 1 ...

sum(sapply(dat, is.na))

## [1] 0

dat$mental %<>% as.factor
dat$ses %<>% as.factor
levels(dat$ses) = c("F", "E", "D", "C", "B", "A")
dat$ses = factor(dat$ses, levels = c("A", "B", "C", "D", "E", "F"))


#Plot
ggplot(dat, aes(x = mental, fill = mental))+
  geom_bar(position = "dodge")+
  facet_wrap(~ses)+
  labs(x = "Mental impairement", y = "Count")+
  scale_fill_discrete(name = "", lab = c("well", "mild", "moderate", "impaired"))+
  coord_flip()+
  theme_bw()

dat$ses = relevel(dat$ses, ref = 6)
summary(m0 <- polr(mental~ses, method = "logistic", data = dat))

## 
## Re-fitting to get Hessian

## Call:
## polr(formula = mental ~ ses, data = dat, method = "logistic")
## 
## Coefficients:
##        Value Std. Error t value
## sesA -0.8238     0.1662  -4.956
## sesB -0.8408     0.1681  -5.001
## sesC -0.6157     0.1620  -3.802
## sesD -0.5248     0.1532  -3.425
## sesE -0.2571     0.1647  -1.561
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -2.0278   0.1334   -15.2044
## 2|3  -0.3285   0.1236    -2.6587
## 3|4   0.6802   0.1249     5.4455
## 
## Residual Deviance: 4449.382 
## AIC: 4465.382

The coefficients in my model are in the opposite direction comparing to teacher’s.
I’m wondering whether the difference results from the order of mental impairment.
According to the plot, I think the coefficients in my model would be more appropriate.
That is, mental impairment among people having higher ses is less severe than that among people having lower ses.

Q4

dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/victims.txt", h = T)
dat$race %<>% as.factor

#Plot
ggplot(dat, aes(x = race, y = nvic))+
  stat_summary(fun.data = mean_se)+
  labs(x = "Race", y = "Mean victims personally known")+
  theme_bw()

summary(m0 <- glm.nb(nvic~race, data = dat, contrasts = "contr.sum"))

## 
## Call:
## glm.nb(formula = nvic ~ race, data = dat, contrasts = "contr.sum", 
##     init.theta = 0.2023119205, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7184  -0.3899  -0.3899  -0.3899   3.5072  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.3832     0.1172 -20.335  < 2e-16 ***
## race1         1.7331     0.2385   7.268 3.66e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2023) family taken to be 1)
## 
##     Null deviance: 471.57  on 1307  degrees of freedom
## Residual deviance: 412.60  on 1306  degrees of freedom
## AIC: 1001.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2023 
##           Std. Err.:  0.0409 
## 
##  2 x log-likelihood:  -995.7980

m = aggregate(nvic~race, FUN = mean, data = dat)
m

##   race       nvic
## 1    0 0.09225413
## 2    1 0.52201258

exp(coef(m0)["(Intercept)"]) #equals mean race0

## (Intercept) 
##  0.09225413

exp(coef(m0)["(Intercept)"]+coef(m0)["race1"]) #equals mean race1

## (Intercept) 
##   0.5220126

The estimate of dispersion parameter is different from teacher’s, and I cannot figure out why…

Q5

dat = read.table("http://data.princeton.edu/wws509/datasets/ceb.dat", h = T)
dat %<>% mutate(os = log(n))
dat %<>% mutate(kids = round(y))
dat$educ = factor(dat$educ, levels = c("none", "lower", "upper", "sec+"))
dat$dur = factor(dat$dur, levels = c("0-4", "5-9", "10-14", "15-19", "20-24", "25-29"))
dat$res = factor(dat$res, levels = c("rural", "urban", "Suva"))

#Plot
ggplot(dat, aes(x = res, y = mean, color = educ, group = educ))+
  stat_summary(fun.y = mean, geom = "point")+
  stat_summary(fun.y = mean, geom = "line")+
  stat_summary(fun.data = mean_se, geom = "errorbar", width = .1, position = "dodge")+
  labs(x = "Residence", y = "Average children")+
  scale_color_discrete(name = "Education")+
  theme_bw()

summary(m0 <- glm(kids~offset(os), data = dat, family = "poisson"))

## 
## Call:
## glm(formula = kids ~ offset(os), family = "poisson", data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -18.6368   -4.4774   -0.8548    2.5292   21.9744  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 1.376346   0.009712   141.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3731.9  on 69  degrees of freedom
## Residual deviance: 3731.9  on 69  degrees of freedom
## AIC: 4163.3
## 
## Number of Fisher Scoring iterations: 5

summary(m1 <- glm(kids~dur+res+educ, data = dat, family = "poisson"))

## 
## Call:
## glm(formula = kids ~ dur + res + educ, family = "poisson", data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -18.2023   -4.1368   -0.3074    3.7858   16.2193  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.93649    0.04614  107.00  < 2e-16 ***
## dur5-9       0.89541    0.05240   17.09  < 2e-16 ***
## dur10-14     1.22136    0.05024   24.31  < 2e-16 ***
## dur15-19     1.31091    0.04975   26.35  < 2e-16 ***
## dur20-24     1.32965    0.04965   26.78  < 2e-16 ***
## dur25-29     1.89210    0.04756   39.78  < 2e-16 ***
## resurban    -1.04901    0.02405  -43.62  < 2e-16 ***
## resSuva     -1.44312    0.02785  -51.83  < 2e-16 ***
## educlower   -0.16257    0.02168   -7.50  6.4e-14 ***
## educupper   -1.04995    0.02886  -36.38  < 2e-16 ***
## educsec+    -2.10594    0.05171  -40.73  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 13735.6  on 69  degrees of freedom
## Residual deviance:  2504.1  on 59  degrees of freedom
## AIC: 2955.6
## 
## Number of Fisher Scoring iterations: 5

exp(coef(m1))

## (Intercept)      dur5-9    dur10-14    dur15-19    dur20-24    dur25-29 
## 139.2799255   2.4483431   3.3918129   3.7095517   3.7797271   6.6332554 
##    resurban     resSuva   educlower   educupper    educsec+ 
##   0.3502850   0.2361892   0.8499568   0.3499568   0.1217314

The expected number of children born among people having above secondary education is 0.12 times that among people lacking education.

Q6

dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/grayScale.txt", h = T)
dat$resp %<>% as.factor
str(dat)

## 'data.frame':    60 obs. of  4 variables:
##  $ oid     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ count   : int  19 14 3 2 0 0 0 0 0 0 ...
##  $ resp    : Factor w/ 12 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ contrast: int  -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 ...

#plot
dat_g = as.data.frame(matrix(dat$count, 12, 5))
names(dat_g) = -2:2
rownames(dat_g) <- paste(rep(c("Dark","Bright"), c(6,6)), c(6:1, 1:6), sep="")
likert(t(dat_g), as.percent = T, main = "", ylab = "Level of contrast")

summary(m0 <- polr(resp~contrast, data = dat, weights = count, Hess = T))

## Call:
## polr(formula = resp ~ contrast, data = dat, weights = count, 
##     Hess = T)
## 
## Coefficients:
##          Value Std. Error t value
## contrast  3.09     0.2354   13.13
## 
## Intercepts:
##       Value    Std. Error t value 
## 1|2    -6.1214   0.4958   -12.3470
## 2|3    -4.5455   0.4275   -10.6325
## 3|4    -3.2964   0.3617    -9.1136
## 4|5    -2.4018   0.3300    -7.2793
## 5|6     0.0089   0.2517     0.0353
## 6|7     1.0705   0.2700     3.9648
## 7|8     2.2188   0.3121     7.1088
## 8|9     3.1776   0.3457     9.1920
## 9|10    4.3815   0.4068    10.7721
## 10|11   6.1632   0.4967    12.4088
## 11|12   6.9703   0.5326    13.0875
## 
## Residual Deviance: 594.6589 
## AIC: 618.6589

Q8

dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/youth_employment.txt", h = T)
str(dat)

## 'data.frame':    978 obs. of  7 variables:
##  $ Employment: Factor w/ 3 levels "Idle","School",..: 3 2 3 3 3 2 3 2 2 3 ...
##  $ Race      : Factor w/ 2 levels "Black","Others": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Family    : Factor w/ 2 levels "Non-Intact","Otherwise": 1 1 2 1 1 1 2 2 1 2 ...
##  $ DadEdu    : Factor w/ 2 levels "College","Otherwise": 2 2 2 1 2 2 2 1 1 1 ...
##  $ Income    : num  0.491 0.694 1.1 1.5 0.254 ...
##  $ Local     : num  6.9 4.8 6.5 3.8 6.9 5.4 4.2 6.2 6.2 6.9 ...
##  $ ASAB      : num  -0.11 0.452 0.967 1.667 0 ...

dat$Race = relevel(dat$Race, ref = "Others")

mm = dat %>% group_by(Race, Family, DadEdu) %>% summarise(Inactive = mean(Employment =="Idle"),
                                                          School = mean(Employment == "School"), 
                                                          Work = mean(Employment =="Work"))

mm %<>% gather(key = Employment, value = Prop, c(Inactive, School, Work))

ggplot(mm, aes(x = DadEdu, y = Prop, color = Employment))+
  geom_point()+
  geom_line(aes(group = Employment))+
  facet_grid(Family~Race, labeller = label_both)+
  theme_bw()+
  theme(legend.position = "top")+
  labs(x = "Father's education", y = "Mean Proportion")

Three-way interaction between Race, Family and Father’s education exists.

dat %<>% mutate(Income_f = ifelse(Income<0.3677, "Low", 
                                 ifelse(Income>1.05, "High", "Middle")))
dat$Income_f %<>% as.factor
dat$Income_f = factor(dat$Income_f, levels = c("Low", "Middle", "High"))

ggplot(dat, aes(x = Income_f, y = ASAB, color = Employment))+
  stat_summary(fun.data = mean_se, position = position_dodge(.5))+
  labs(x = "Income", y = "ASAB")+
  theme_bw()

Two-way interaction between Income and ASAB exists.

p1 = ggplot(dat[dat$Employment=="Idle",], aes(x = Income, y = ASAB, color = DadEdu))+
  geom_point(alpha = .5)+
  xlim(0,5)+
  ylim(-2,2)+
  facet_grid(Family~Race, labeller = label_both)+
  theme_bw()+
  labs(title = "Employment:Inactive")

p2 = ggplot(dat[dat$Employment=="School",], aes(x = Income, y = ASAB, color = DadEdu))+
  geom_point(alpha = .5)+
  xlim(0,5)+
  ylim(-2,2)+
  facet_grid(Family~Race, labeller = label_both)+
  theme_bw()+
  labs(title = "Employment:School")

p3 = ggplot(dat[dat$Employment=="Work",], aes(x = Income, y = ASAB, color = DadEdu))+
  geom_point(alpha = .5)+
  xlim(0,5)+
  ylim(-2,2)+
  facet_grid(Family~Race, labeller = label_both)+
  theme_bw()+
  labs(title = "Employment:Work")

p1

p2

## Warning: Removed 2 rows containing missing values (geom_point).

p3

Overview.
The distributions of each type of Employment differ.
For example, most of the observations in School fall in the cell of Race: Others and Family:Otherwise.
But the observations in Inactive do not show such pattern.
Thus, model using multinomial distribution might be more appropriate.

summary(m0 <- multinom(Employment~., data = dat))

## # weights:  30 (18 variable)
## initial  value 1074.442818 
## iter  10 value 1023.001178
## iter  20 value 1015.508959
## final  value 1015.356575 
## converged

## Call:
## multinom(formula = Employment ~ ., data = dat)
## 
## Coefficients:
##        (Intercept)  RaceBlack FamilyOtherwise DadEduOtherwise    Income
## School  0.05786532  0.2241142      0.55420767      -0.2446818 0.3067009
## Work    0.64217399 -0.4040686      0.09280388      -0.1983911 0.1812409
##              Local      ASAB Income_fMiddle Income_fHigh
## School  0.01318595 0.1812641    -0.02298188   -0.1270792
## Work   -0.06837179 0.2929785     0.36858655    0.5155179
## 
## Std. Errors:
##        (Intercept) RaceBlack FamilyOtherwise DadEduOtherwise    Income
## School   0.4209707 0.1975070       0.1882329        0.236736 0.3442137
## Work     0.4391742 0.2205727       0.1946329        0.241760 0.3457114
##             Local      ASAB Income_fMiddle Income_fHigh
## School 0.03472030 0.1066964      0.2468094    0.4823819
## Work   0.03752286 0.1107403      0.2651683    0.4901349
## 
## Residual Deviance: 2030.713 
## AIC: 2066.713

Exercise: GLM

Yu-Han, Luo

Q1

Q3

Q4

Q5

Q6

Q8