p = c("tidyverse", "magrittr", "MASS", "HH", "nnet")
lapply(p, library, character.only = TRUE)
#Q1
dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/domesticViolence.txt")
str(dat)
## 'data.frame': 313 obs. of 3 variables:
## $ V1: int 1 1 1 1 1 1 1 1 1 1 ...
## $ V2: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V3: int 0 0 0 0 0 0 0 0 0 0 ...
colnames(dat) = c("fv", "ad", "ar")
dat %<>% mutate(treatment = ifelse(ar==1, "arrest",
ifelse(ad==1, "advise", "separate")))
dat$treatment %<>% as.factor
dat$treatment = factor(dat$treatment, levels = c("separate", "advise", "arrest"))
#Plot
ggplot(dat, aes(x = treatment, y = fv))+
stat_summary(fun.data = mean_se)+
labs(x = "Treatment", y = "Proportion of future violence")+
theme_bw()
summary(m0 <- glm(factor(fv)~treatment, data = dat, family = "binomial", contrasts = "contr.sum"))
##
## Call:
## glm(formula = factor(fv) ~ treatment, family = "binomial", data = dat,
## contrasts = "contr.sum")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7232 -0.7232 -0.6576 -0.4797 2.1067
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2078 0.2235 -5.404 6.52e-08 ***
## treatmentadvise -0.2136 0.3303 -0.647 0.518
## treatmentarrest -0.8963 0.4027 -2.226 0.026 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 297.08 on 312 degrees of freedom
## Residual deviance: 291.56 on 310 degrees of freedom
## AIC: 297.56
##
## Number of Fisher Scoring iterations: 4
coef(m0)
## (Intercept) treatmentadvise treatmentarrest
## -1.2078116 -0.2135741 -0.8963226
exp(-1.2078-0.8963)/(1+exp(-1.2078-0.8963))
## [1] 0.108699
dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/mentalSES.txt", h = T)
str(dat)
## 'data.frame': 1660 obs. of 8 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ses : int 6 6 6 6 6 6 6 6 6 6 ...
## $ A : int 1 1 1 1 1 1 1 1 1 1 ...
## $ B : int 0 0 0 0 0 0 0 0 0 0 ...
## $ C : int 0 0 0 0 0 0 0 0 0 0 ...
## $ D : int 0 0 0 0 0 0 0 0 0 0 ...
## $ E : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mental: int 1 1 1 1 1 1 1 1 1 1 ...
sum(sapply(dat, is.na))
## [1] 0
dat$mental %<>% as.factor
dat$ses %<>% as.factor
levels(dat$ses) = c("F", "E", "D", "C", "B", "A")
dat$ses = factor(dat$ses, levels = c("A", "B", "C", "D", "E", "F"))
#Plot
ggplot(dat, aes(x = mental, fill = mental))+
geom_bar(position = "dodge")+
facet_wrap(~ses)+
labs(x = "Mental impairement", y = "Count")+
scale_fill_discrete(name = "", lab = c("well", "mild", "moderate", "impaired"))+
coord_flip()+
theme_bw()
dat$ses = relevel(dat$ses, ref = 6)
summary(m0 <- polr(mental~ses, method = "logistic", data = dat))
##
## Re-fitting to get Hessian
## Call:
## polr(formula = mental ~ ses, data = dat, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## sesA -0.8238 0.1662 -4.956
## sesB -0.8408 0.1681 -5.001
## sesC -0.6157 0.1620 -3.802
## sesD -0.5248 0.1532 -3.425
## sesE -0.2571 0.1647 -1.561
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.0278 0.1334 -15.2044
## 2|3 -0.3285 0.1236 -2.6587
## 3|4 0.6802 0.1249 5.4455
##
## Residual Deviance: 4449.382
## AIC: 4465.382
The coefficients in my model are in the opposite direction comparing to teacher’s.
I’m wondering whether the difference results from the order of mental impairment.
According to the plot, I think the coefficients in my model would be more appropriate.
That is, mental impairment among people having higher ses is less severe than that among people having lower ses.
dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/victims.txt", h = T)
dat$race %<>% as.factor
#Plot
ggplot(dat, aes(x = race, y = nvic))+
stat_summary(fun.data = mean_se)+
labs(x = "Race", y = "Mean victims personally known")+
theme_bw()
summary(m0 <- glm.nb(nvic~race, data = dat, contrasts = "contr.sum"))
##
## Call:
## glm.nb(formula = nvic ~ race, data = dat, contrasts = "contr.sum",
## init.theta = 0.2023119205, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7184 -0.3899 -0.3899 -0.3899 3.5072
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3832 0.1172 -20.335 < 2e-16 ***
## race1 1.7331 0.2385 7.268 3.66e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.2023) family taken to be 1)
##
## Null deviance: 471.57 on 1307 degrees of freedom
## Residual deviance: 412.60 on 1306 degrees of freedom
## AIC: 1001.8
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.2023
## Std. Err.: 0.0409
##
## 2 x log-likelihood: -995.7980
m = aggregate(nvic~race, FUN = mean, data = dat)
m
## race nvic
## 1 0 0.09225413
## 2 1 0.52201258
exp(coef(m0)["(Intercept)"]) #equals mean race0
## (Intercept)
## 0.09225413
exp(coef(m0)["(Intercept)"]+coef(m0)["race1"]) #equals mean race1
## (Intercept)
## 0.5220126
The estimate of dispersion parameter is different from teacher’s, and I cannot figure out why…
dat = read.table("http://data.princeton.edu/wws509/datasets/ceb.dat", h = T)
dat %<>% mutate(os = log(n))
dat %<>% mutate(kids = round(y))
dat$educ = factor(dat$educ, levels = c("none", "lower", "upper", "sec+"))
dat$dur = factor(dat$dur, levels = c("0-4", "5-9", "10-14", "15-19", "20-24", "25-29"))
dat$res = factor(dat$res, levels = c("rural", "urban", "Suva"))
#Plot
ggplot(dat, aes(x = res, y = mean, color = educ, group = educ))+
stat_summary(fun.y = mean, geom = "point")+
stat_summary(fun.y = mean, geom = "line")+
stat_summary(fun.data = mean_se, geom = "errorbar", width = .1, position = "dodge")+
labs(x = "Residence", y = "Average children")+
scale_color_discrete(name = "Education")+
theme_bw()
summary(m0 <- glm(kids~offset(os), data = dat, family = "poisson"))
##
## Call:
## glm(formula = kids ~ offset(os), family = "poisson", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -18.6368 -4.4774 -0.8548 2.5292 21.9744
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.376346 0.009712 141.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 3731.9 on 69 degrees of freedom
## Residual deviance: 3731.9 on 69 degrees of freedom
## AIC: 4163.3
##
## Number of Fisher Scoring iterations: 5
summary(m1 <- glm(kids~dur+res+educ, data = dat, family = "poisson"))
##
## Call:
## glm(formula = kids ~ dur + res + educ, family = "poisson", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -18.2023 -4.1368 -0.3074 3.7858 16.2193
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.93649 0.04614 107.00 < 2e-16 ***
## dur5-9 0.89541 0.05240 17.09 < 2e-16 ***
## dur10-14 1.22136 0.05024 24.31 < 2e-16 ***
## dur15-19 1.31091 0.04975 26.35 < 2e-16 ***
## dur20-24 1.32965 0.04965 26.78 < 2e-16 ***
## dur25-29 1.89210 0.04756 39.78 < 2e-16 ***
## resurban -1.04901 0.02405 -43.62 < 2e-16 ***
## resSuva -1.44312 0.02785 -51.83 < 2e-16 ***
## educlower -0.16257 0.02168 -7.50 6.4e-14 ***
## educupper -1.04995 0.02886 -36.38 < 2e-16 ***
## educsec+ -2.10594 0.05171 -40.73 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 13735.6 on 69 degrees of freedom
## Residual deviance: 2504.1 on 59 degrees of freedom
## AIC: 2955.6
##
## Number of Fisher Scoring iterations: 5
exp(coef(m1))
## (Intercept) dur5-9 dur10-14 dur15-19 dur20-24 dur25-29
## 139.2799255 2.4483431 3.3918129 3.7095517 3.7797271 6.6332554
## resurban resSuva educlower educupper educsec+
## 0.3502850 0.2361892 0.8499568 0.3499568 0.1217314
The expected number of children born among people having above secondary education is 0.12 times that among people lacking education.
dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/grayScale.txt", h = T)
dat$resp %<>% as.factor
str(dat)
## 'data.frame': 60 obs. of 4 variables:
## $ oid : int 1 2 3 4 5 6 7 8 9 10 ...
## $ count : int 19 14 3 2 0 0 0 0 0 0 ...
## $ resp : Factor w/ 12 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ contrast: int -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 ...
#plot
dat_g = as.data.frame(matrix(dat$count, 12, 5))
names(dat_g) = -2:2
rownames(dat_g) <- paste(rep(c("Dark","Bright"), c(6,6)), c(6:1, 1:6), sep="")
likert(t(dat_g), as.percent = T, main = "", ylab = "Level of contrast")
summary(m0 <- polr(resp~contrast, data = dat, weights = count, Hess = T))
## Call:
## polr(formula = resp ~ contrast, data = dat, weights = count,
## Hess = T)
##
## Coefficients:
## Value Std. Error t value
## contrast 3.09 0.2354 13.13
##
## Intercepts:
## Value Std. Error t value
## 1|2 -6.1214 0.4958 -12.3470
## 2|3 -4.5455 0.4275 -10.6325
## 3|4 -3.2964 0.3617 -9.1136
## 4|5 -2.4018 0.3300 -7.2793
## 5|6 0.0089 0.2517 0.0353
## 6|7 1.0705 0.2700 3.9648
## 7|8 2.2188 0.3121 7.1088
## 8|9 3.1776 0.3457 9.1920
## 9|10 4.3815 0.4068 10.7721
## 10|11 6.1632 0.4967 12.4088
## 11|12 6.9703 0.5326 13.0875
##
## Residual Deviance: 594.6589
## AIC: 618.6589
dat = read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/lmm/Data/youth_employment.txt", h = T)
str(dat)
## 'data.frame': 978 obs. of 7 variables:
## $ Employment: Factor w/ 3 levels "Idle","School",..: 3 2 3 3 3 2 3 2 2 3 ...
## $ Race : Factor w/ 2 levels "Black","Others": 2 2 2 2 2 2 2 2 2 2 ...
## $ Family : Factor w/ 2 levels "Non-Intact","Otherwise": 1 1 2 1 1 1 2 2 1 2 ...
## $ DadEdu : Factor w/ 2 levels "College","Otherwise": 2 2 2 1 2 2 2 1 1 1 ...
## $ Income : num 0.491 0.694 1.1 1.5 0.254 ...
## $ Local : num 6.9 4.8 6.5 3.8 6.9 5.4 4.2 6.2 6.2 6.9 ...
## $ ASAB : num -0.11 0.452 0.967 1.667 0 ...
dat$Race = relevel(dat$Race, ref = "Others")
mm = dat %>% group_by(Race, Family, DadEdu) %>% summarise(Inactive = mean(Employment =="Idle"),
School = mean(Employment == "School"),
Work = mean(Employment =="Work"))
mm %<>% gather(key = Employment, value = Prop, c(Inactive, School, Work))
ggplot(mm, aes(x = DadEdu, y = Prop, color = Employment))+
geom_point()+
geom_line(aes(group = Employment))+
facet_grid(Family~Race, labeller = label_both)+
theme_bw()+
theme(legend.position = "top")+
labs(x = "Father's education", y = "Mean Proportion")
Three-way interaction between Race, Family and Father’s education exists.
dat %<>% mutate(Income_f = ifelse(Income<0.3677, "Low",
ifelse(Income>1.05, "High", "Middle")))
dat$Income_f %<>% as.factor
dat$Income_f = factor(dat$Income_f, levels = c("Low", "Middle", "High"))
ggplot(dat, aes(x = Income_f, y = ASAB, color = Employment))+
stat_summary(fun.data = mean_se, position = position_dodge(.5))+
labs(x = "Income", y = "ASAB")+
theme_bw()
Two-way interaction between Income and ASAB exists.
p1 = ggplot(dat[dat$Employment=="Idle",], aes(x = Income, y = ASAB, color = DadEdu))+
geom_point(alpha = .5)+
xlim(0,5)+
ylim(-2,2)+
facet_grid(Family~Race, labeller = label_both)+
theme_bw()+
labs(title = "Employment:Inactive")
p2 = ggplot(dat[dat$Employment=="School",], aes(x = Income, y = ASAB, color = DadEdu))+
geom_point(alpha = .5)+
xlim(0,5)+
ylim(-2,2)+
facet_grid(Family~Race, labeller = label_both)+
theme_bw()+
labs(title = "Employment:School")
p3 = ggplot(dat[dat$Employment=="Work",], aes(x = Income, y = ASAB, color = DadEdu))+
geom_point(alpha = .5)+
xlim(0,5)+
ylim(-2,2)+
facet_grid(Family~Race, labeller = label_both)+
theme_bw()+
labs(title = "Employment:Work")
p1
p2
## Warning: Removed 2 rows containing missing values (geom_point).
p3
Overview.
The distributions of each type of Employment differ.
For example, most of the observations in School fall in the cell of Race: Others and Family:Otherwise.
But the observations in Inactive do not show such pattern.
Thus, model using multinomial distribution might be more appropriate.
summary(m0 <- multinom(Employment~., data = dat))
## # weights: 30 (18 variable)
## initial value 1074.442818
## iter 10 value 1023.001178
## iter 20 value 1015.508959
## final value 1015.356575
## converged
## Call:
## multinom(formula = Employment ~ ., data = dat)
##
## Coefficients:
## (Intercept) RaceBlack FamilyOtherwise DadEduOtherwise Income
## School 0.05786532 0.2241142 0.55420767 -0.2446818 0.3067009
## Work 0.64217399 -0.4040686 0.09280388 -0.1983911 0.1812409
## Local ASAB Income_fMiddle Income_fHigh
## School 0.01318595 0.1812641 -0.02298188 -0.1270792
## Work -0.06837179 0.2929785 0.36858655 0.5155179
##
## Std. Errors:
## (Intercept) RaceBlack FamilyOtherwise DadEduOtherwise Income
## School 0.4209707 0.1975070 0.1882329 0.236736 0.3442137
## Work 0.4391742 0.2205727 0.1946329 0.241760 0.3457114
## Local ASAB Income_fMiddle Income_fHigh
## School 0.03472030 0.1066964 0.2468094 0.4823819
## Work 0.03752286 0.1107403 0.2651683 0.4901349
##
## Residual Deviance: 2030.713
## AIC: 2066.713