# load packages
pacman::p_load(tidyverse, MASS, pscl)
# input data
dta <- read.table("bullying.txt", h=T)
# first 6 lines
head(dta)## Score Nomination
## 1 1.56 0
## 2 1.56 0
## 3 1.11 0
## 4 1.56 0
## 5 1.22 4
## 6 1.33 0
## 'data.frame': 291 obs. of 2 variables:
## $ Score : num 1.56 1.56 1.11 1.56 1.22 1.33 1.11 1 1.11 1.22 ...
## $ Nomination: int 0 0 0 0 4 0 0 0 0 19 ...
## Score Nomination
## 1.658110 2.487973
## Score Nomination
## 0.4842037 41.2645100
#
ot <- theme_set(theme_bw())
# histogram of nomination
ggplot(dta, aes(Nomination)) +
stat_bin(binwidth = bd) +
labs(x = "Number of nominations", y = "Count")ggplot(dta, aes(Score, Nomination)) +
geom_jitter(alpha = 0.2) +
stat_smooth(method = "glm", method.args = list(family = poisson)) +
labs(y = "Number of peer nomination", x = "Bully score")## `geom_smooth()` using formula 'y ~ x'
本圖顯示,本資料中多數的人非霸凌者。 然而,“0”包含了真0與假0,亦即“確實非霸凌者”,以及“雖然沒有表現出來,但實際可能是霸凌者”兩類。
##
## Call:
## glm(formula = Nomination ~ Score, family = poisson, data = dta)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.4995 -1.8246 -1.5952 -0.1552 11.6806
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.66308 0.08871 -7.475 7.75e-14 ***
## Score 0.81434 0.03504 23.239 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2205.2 on 290 degrees of freedom
## Residual deviance: 1774.6 on 289 degrees of freedom
## AIC: 2160.3
##
## Number of Fisher Scoring iterations: 6
##
## Call:
## zeroinfl(formula = Nomination ~ Score | Score, data = dta)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.37068 -0.68873 -0.59685 -0.05357 9.87491
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.49604 0.09987 4.967 6.81e-07 ***
## Score 0.59609 0.03942 15.120 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4967 0.3463 4.322 1.54e-05 ***
## Score -0.7716 0.1967 -3.924 8.72e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 8
## Log-likelihood: -780.6 on 4 Df
| Nomination | ||||
|---|---|---|---|---|
| Predictors | Incidence Rate Ratios | CI | p | |
| Count Model | ||||
| (Intercept) | 1.64 | 1.35 – 2.00 | <0.001 | |
| Score | 1.82 | 1.68 – 1.96 | <0.001 | |
| Zero-Inflated Model | ||||
| (Intercept) | 4.47 | 2.27 – 8.81 | <0.001 | |
| Score | 0.46 | 0.31 – 0.68 | <0.001 | |
In m1 poisson model,分數增加一單位,被同儕認為是霸凌者的次數增加exp(0.596)=1.82。在沒有被同儕認為是霸凌者中,分數增加一單位,被同儕認為是霸凌者的次數減少exp(0.7716)=0.46。
##
## Call:
## zeroinfl(formula = Nomination ~ Score | Score, data = dta, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.58338 -0.48698 -0.38705 0.02297 6.96982
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6737 0.4354 -1.547 0.122
## Score 0.8819 0.2052 4.297 1.73e-05 ***
## Log(theta) -1.0658 0.1792 -5.947 2.73e-09 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.954 2.484 1.189 0.234
## Score -3.419 2.260 -1.513 0.130
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.3445
## Number of iterations in BFGS optimization: 21
## Log-likelihood: -494.7 on 5 Df
| Nomination | ||||
|---|---|---|---|---|
| Predictors | Incidence Rate Ratios | CI | p | |
| Count Model | ||||
| (Intercept) | 0.51 | 0.22 – 1.20 | 0.122 | |
| Score | 2.42 | 1.62 – 3.61 | <0.001 | |
| Zero-Inflated Model | ||||
| (Intercept) | 19.18 | 0.15 – 2494.37 | 0.234 | |
| Score | 0.03 | 0.00 – 2.75 | 0.130 | |
In m2 negbin model,分數增加一單位,被同儕認為是霸凌者的次數增加exp(0.8819)=2.42。在沒有被同儕認為是霸凌者中,分數增加一單位,被同儕認為是霸凌者的次數減少exp(3.419)=0.03。
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -4.891816 model2 > model1 4.9955e-07
## AIC-corrected -4.858930 model2 > model1 5.9011e-07
## BIC-corrected -4.798531 model2 > model1 7.9917e-07
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -4.242754 model2 > model1 1.104e-05
## AIC-corrected -4.242754 model2 > model1 1.104e-05
## BIC-corrected -4.242754 model2 > model1 1.104e-05
model2 > model1
ggplot(dta_m, aes(fit1, r1)) +
geom_point(pch=20) +
geom_point(aes(fit2, r2), pch=1) +
geom_hline(yintercept = 0) +
labs(x = "Fitted values", y = "Residuals",
title = "Poisson vs. Negative Binomial")“●”為m1 -> poisson “o”為m2 -> negbin 圖顯示m1、m2 fitted values and residuals的關係
ggplot(dta_m2, aes(Score, Nomination)) +
geom_point(pch = 20, alpha = .2) +
geom_line(aes(Score, yhat), col = "magenta", lwd=rel(1)) +
stat_smooth(method = "glm", method.args = list(family = poisson)) +
labs(x = "Bully Score", y = "Predicted number of nomination")## `geom_smooth()` using formula 'y ~ x'
此圖表示在霸凌得分高者較容易被同儕提名視為霸凌者。 加入紅線代表negbin的模型,與先前的藍線poisson模型相互比較。
# load packages
pacman::p_load(tidyverse, MASS, pscl)
dta2 <- read.table("D:/sheu/youth_employment.txt", h=T)
head(dta2)## Employment Race Family DadEdu Income Local ASAB
## 1 Work Others Non-Intact Otherwise 0.4909 6.9 -0.110
## 2 School Others Non-Intact Otherwise 0.6940 4.8 0.452
## 3 Work Others Otherwise Otherwise 1.1000 6.5 0.967
## 4 Work Others Non-Intact College 1.5000 3.8 1.667
## 5 Work Others Non-Intact Otherwise 0.2544 6.9 0.000
## 6 School Others Non-Intact Otherwise 0.9391 5.4 0.000
## 'data.frame': 978 obs. of 7 variables:
## $ Employment: Factor w/ 3 levels "Idle","School",..: 3 2 3 3 3 2 3 2 2 3 ...
## $ Race : Factor w/ 2 levels "Black","Others": 2 2 2 2 2 2 2 2 2 2 ...
## $ Family : Factor w/ 2 levels "Non-Intact","Otherwise": 1 1 2 1 1 1 2 2 1 2 ...
## $ DadEdu : Factor w/ 2 levels "College","Otherwise": 2 2 2 1 2 2 2 1 1 1 ...
## $ Income : num 0.491 0.694 1.1 1.5 0.254 ...
## $ Local : num 6.9 4.8 6.5 3.8 6.9 5.4 4.2 6.2 6.2 6.9 ...
## $ ASAB : num -0.11 0.452 0.967 1.667 0 ...
dta2$Race <- relevel(factor(dta2$Race), ref="Others")
dta2$Employment <- relevel(factor(dta2$Employment), ref="Idle")
dta2$Family <- relevel(factor(dta2$Family), ref="Otherwise")
dta2$DadEdu <- relevel(factor(dta2$DadEdu), ref="Otherwise")
#cut datadta2$Incf<- factor(cut(dta2$Income,breaks=c(quantile(dta2$Income, probs=seq(0, 1, by=.33))),labels=c("Low", "Middle","High"), ordered=T, lowest.inclued=T))
dta2$Localf<- factor(cut(dta2$Local,breaks=c(quantile(dta2$Local, probs=seq(0, 1, by=.5))),labels=c("Low","High"), ordered=T, lowest.inclued=T))
dta2$ASABf<- factor(cut(dta2$ASAB,breaks=c(quantile(dta2$ASAB, probs=seq(0, 1, by=.33))),labels=c("Low", "Middle","High"), ordered=T, lowest.inclued=T))
str(dta2)## 'data.frame': 978 obs. of 10 variables:
## $ Employment: Factor w/ 3 levels "Idle","School",..: 3 2 3 3 3 2 3 2 2 3 ...
## $ Race : Factor w/ 2 levels "Others","Black": 1 1 1 1 1 1 1 1 1 1 ...
## $ Family : Factor w/ 2 levels "Otherwise","Non-Intact": 2 2 1 2 2 2 1 1 2 1 ...
## $ DadEdu : Factor w/ 2 levels "Otherwise","College": 1 1 1 2 1 1 1 2 2 2 ...
## $ Income : num 0.491 0.694 1.1 1.5 0.254 ...
## $ Local : num 6.9 4.8 6.5 3.8 6.9 5.4 4.2 6.2 6.2 6.9 ...
## $ ASAB : num -0.11 0.452 0.967 1.667 0 ...
## $ Incf : Ord.factor w/ 3 levels "Low"<"Middle"<..: 2 2 3 3 1 3 3 3 3 3 ...
## $ Localf : Ord.factor w/ 2 levels "Low"<"High": 1 1 1 1 1 1 1 1 1 1 ...
## $ ASABf : Ord.factor w/ 3 levels "Low"<"Middle"<..: 2 3 3 NA 2 2 2 3 3 3 ...
## Employment Idle School Work
## Race Family DadEdu
## Others Otherwise Otherwise 84 161 141
## College 23 66 72
## Non-Intact Otherwise 47 43 54
## College 5 12 18
## Black Otherwise Otherwise 28 60 26
## College 6 8 3
## Non-Intact Otherwise 39 40 27
## College 3 10 2
a<-as.data.frame(with(dta2,ftable(Race,Family, DadEdu,Employment)))
b<-as.data.frame(with(dta2, ftable(Incf,ASABf,Localf, Employment)))
with(dta2, ftable(Incf,ASABf,Localf, Employment))## Employment Idle School Work
## Incf ASABf Localf
## Low Low Low 26 39 19
## High 42 34 19
## Middle Low 21 22 15
## High 11 26 10
## High Low 3 13 8
## High 4 5 3
## Middle Low Low 13 20 24
## High 11 24 10
## Middle Low 22 28 36
## High 6 16 13
## High Low 9 22 29
## High 9 19 12
## High Low Low 6 9 10
## High 4 6 5
## Middle Low 18 23 16
## High 4 16 15
## High Low 15 40 59
## High 11 26 27
ggplot(subset(a, Employment!="Idle"), aes(DadEdu, Freq/sum(Freq), color=Employment))+
geom_line()+
stat_smooth(method = "glm", method.args = list(family=binomial))+
geom_point()+
facet_grid(Family~Race)+
scale_y_continuous(seq(-1.5, 1,by=.5))+
labs(x="Father's education", y="Log-odds(Inactive)")## `geom_smooth()` using formula 'y ~ x'
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
ggplot(subset(b, Employment!="Idle"), aes(ASABf, Freq/cumsum(Freq), color=Employment))+
geom_line()+
stat_smooth(method = "glm", method.args = list(family=binomial))+
geom_point()+
facet_wrap(Incf~Localf, nrow=2)+
scale_y_continuous(seq(-1.5, 1,by=.5))+
labs(x="Father's education", y="Log-odds(Inactive)")## `geom_smooth()` using formula 'y ~ x'
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## # weights: 39 (24 variable)
## initial value 1046.977511
## iter 10 value 995.801470
## iter 20 value 989.576609
## final value 989.321874
## converged
## Call:
## nnet::multinom(formula = Employment ~ ., data = dta2)
##
## Coefficients:
## (Intercept) RaceBlack FamilyNon-Intact DadEduCollege Income
## School 0.7018017 0.2391504 -0.54708859 0.2263025 -0.07437054
## Work 0.7184842 -0.3885990 -0.04838905 0.1326759 0.22198338
## Local ASAB Incf.L Incf.Q Localf.L ASABf.L
## School -0.004238422 -0.01547256 0.1184440 -0.09025778 0.11135278 0.4116813
## Work -0.067554926 -0.08513663 0.3412993 -0.29255812 -0.02332271 0.5843150
## ASABf.Q
## School 0.04306545
## Work 0.12077936
##
## Std. Errors:
## (Intercept) RaceBlack FamilyNon-Intact DadEduCollege Income
## School 0.5384768 0.1997812 0.1888134 0.2396764 0.4137664
## Work 0.5635782 0.2230273 0.1957335 0.2457077 0.4089509
## Local ASAB Incf.L Incf.Q Localf.L ASABf.L ASABf.Q
## School 0.05398749 0.2637922 0.3230540 0.1537179 0.1882321 0.4201313 0.1469578
## Work 0.05915372 0.2756148 0.3261682 0.1578611 0.2015597 0.4372910 0.1540811
##
## Residual Deviance: 1978.644
## AIC: 2026.644
## 'data.frame': 1660 obs. of 8 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ses : int 6 6 6 6 6 6 6 6 6 6 ...
## $ A : int 1 1 1 1 1 1 1 1 1 1 ...
## $ B : int 0 0 0 0 0 0 0 0 0 0 ...
## $ C : int 0 0 0 0 0 0 0 0 0 0 ...
## $ D : int 0 0 0 0 0 0 0 0 0 0 ...
## $ E : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mental: int 1 1 1 1 1 1 1 1 1 1 ...
## Id ses A B C D E mental
## 1 1 6 1 0 0 0 0 1
## 2 2 6 1 0 0 0 0 1
## 3 3 6 1 0 0 0 0 1
## 4 4 6 1 0 0 0 0 1
## 5 5 6 1 0 0 0 0 1
## 6 6 6 1 0 0 0 0 1
## 'data.frame': 1660 obs. of 8 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ses : int 6 6 6 6 6 6 6 6 6 6 ...
## $ A : int 1 1 1 1 1 1 1 1 1 1 ...
## $ B : int 0 0 0 0 0 0 0 0 0 0 ...
## $ C : int 0 0 0 0 0 0 0 0 0 0 ...
## $ D : int 0 0 0 0 0 0 0 0 0 0 ...
## $ E : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mental: int 1 1 1 1 1 1 1 1 1 1 ...
## Call:
## polr(formula = ordered(mental) ~ ses, data = dta3, Hess = TRUE,
## method = "probit")
##
## Coefficients:
## Value Std. Error t value
## ses -0.1021 0.01644 -6.213
##
## Intercepts:
## Value Std. Error t value
## 1|2 -1.2679 0.0698 -18.1603
## 2|3 -0.2387 0.0654 -3.6490
## 3|4 0.3741 0.0659 5.6761
##
## Residual Deviance: 4450.272
## AIC: 4458.272
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## -0.1343804 -0.0699364
## [1] 0.1024168
## [1] 0.3032523
## [1] 0.2401659
## [1] 0.354165
## [1] 0.1218477
## [1] 0.3238258
## [1] 0.2373606
## [1] 0.3169659
ndta3 <- dta3 %>%
mutate(mental=dplyr::recode(mental, 'Well', 'Mild', 'Moderate', 'Impaired'))%>%
mutate(mental=factor(mental, levels=c('Well','Mild','Moderate','Impaired')),
ses=factor(ses, levels=c('1', '2', '3', '4', '5', '6'), labels = c('F', 'E', 'D', 'C', 'B', 'A')))
likert(t(with(ndta3, table(mental, ses))), main="", ylab="SES")