1 Exercise 1

1.1 load and see

# load packages
pacman::p_load(tidyverse, MASS, pscl)

# input data
dta <- read.table("bullying.txt", h=T)

# first 6 lines
head(dta)

##   Score Nomination
## 1  1.56          0
## 2  1.56          0
## 3  1.11          0
## 4  1.56          0
## 5  1.22          4
## 6  1.33          0

# check data structure
str(dta)

## 'data.frame':    291 obs. of  2 variables:
##  $ Score     : num  1.56 1.56 1.11 1.56 1.22 1.33 1.11 1 1.11 1.22 ...
##  $ Nomination: int  0 0 0 0 4 0 0 0 0 19 ...

1.2 proportion of zeros

sum(dta$Nomination < 1)/length(dta$Nomination)

## [1] 0.5635739

1.3 mean and variance

#mean
colMeans(dta)

##      Score Nomination 
##   1.658110   2.487973

#variance
apply(dta, 2, var)

##      Score Nomination 
##  0.4842037 41.2645100

# use the Freedman-Diaconis rule for bin width
bd <- 2*IQR(dta$Nomination)/(dim(dta)[1]^(1/3))

1.4 plot

#
ot <- theme_set(theme_bw())

# histogram of nomination
ggplot(dta, aes(Nomination)) + 
 stat_bin(binwidth = bd) + 
 labs(x = "Number of nominations", y = "Count")

1.5 poisson fit

ggplot(dta, aes(Score, Nomination)) +
 geom_jitter(alpha = 0.2) +
 stat_smooth(method = "glm", method.args = list(family = poisson)) +
 labs(y = "Number of peer nomination", x = "Bully score")

## `geom_smooth()` using formula 'y ~ x'

本圖顯示，本資料中多數的人非霸凌者。然而，“0”包含了真0與假0，亦即“確實非霸凌者”，以及“雖然沒有表現出來，但實際可能是霸凌者”兩類。

1.6 model

1.6.1 poisson model

summary(m0 <- glm(Nomination ~ Score, family = poisson, data = dta))

## 
## Call:
## glm(formula = Nomination ~ Score, family = poisson, data = dta)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.4995  -1.8246  -1.5952  -0.1552  11.6806  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.66308    0.08871  -7.475 7.75e-14 ***
## Score        0.81434    0.03504  23.239  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2205.2  on 290  degrees of freedom
## Residual deviance: 1774.6  on 289  degrees of freedom
## AIC: 2160.3
## 
## Number of Fisher Scoring iterations: 6

1.6.2 zero-inflated poisson

summary(m1 <- zeroinfl(Nomination ~ Score | Score, data = dta))

## 
## Call:
## zeroinfl(formula = Nomination ~ Score | Score, data = dta)
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -2.37068 -0.68873 -0.59685 -0.05357  9.87491 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.49604    0.09987   4.967 6.81e-07 ***
## Score        0.59609    0.03942  15.120  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.4967     0.3463   4.322 1.54e-05 ***
## Score        -0.7716     0.1967  -3.924 8.72e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 8 
## Log-likelihood: -780.6 on 4 Df

sjPlot::tab_model(m1, show.obs=F, show.r2=F, show.se=F)

	Nomination
Predictors	Incidence Rate Ratios	CI	p
Count Model
(Intercept)	1.64	1.35 – 2.00	<0.001
Score	1.82	1.68 – 1.96	<0.001
Zero-Inflated Model
(Intercept)	4.47	2.27 – 8.81	<0.001
Score	0.46	0.31 – 0.68	<0.001

In m1 poisson model,分數增加一單位，被同儕認為是霸凌者的次數增加exp(0.596)=1.82。在沒有被同儕認為是霸凌者中，分數增加一單位，被同儕認為是霸凌者的次數減少exp(0.7716)=0.46。

1.6.3 zero-inflated negative binomial

summary(m2 <- zeroinfl(Nomination ~ Score | Score, data = dta, 
                              dist = "negbin"))

## 
## Call:
## zeroinfl(formula = Nomination ~ Score | Score, data = dta, dist = "negbin")
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58338 -0.48698 -0.38705  0.02297  6.96982 
## 
## Count model coefficients (negbin with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.6737     0.4354  -1.547    0.122    
## Score         0.8819     0.2052   4.297 1.73e-05 ***
## Log(theta)   -1.0658     0.1792  -5.947 2.73e-09 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    2.954      2.484   1.189    0.234
## Score         -3.419      2.260  -1.513    0.130
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.3445 
## Number of iterations in BFGS optimization: 21 
## Log-likelihood: -494.7 on 5 Df

sjPlot::tab_model(m2, show.obs=F, show.r2=F, show.se=F)

	Nomination
Predictors	Incidence Rate Ratios	CI	p
Count Model
(Intercept)	0.51	0.22 – 1.20	0.122
Score	2.42	1.62 – 3.61	<0.001
Zero-Inflated Model
(Intercept)	19.18	0.15 – 2494.37	0.234
Score	0.03	0.00 – 2.75	0.130

In m2 negbin model,分數增加一單位，被同儕認為是霸凌者的次數增加exp(0.8819)=2.42。在沒有被同儕認為是霸凌者中，分數增加一單位，被同儕認為是霸凌者的次數減少exp(3.419)=0.03。

1.6.4 comparing non-nested models

vuong(m0, m1)

## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -4.891816 model2 > model1 4.9955e-07
## AIC-corrected         -4.858930 model2 > model1 5.9011e-07
## BIC-corrected         -4.798531 model2 > model1 7.9917e-07

vuong(m1, m2)

## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A   p-value
## Raw                   -4.242754 model2 > model1 1.104e-05
## AIC-corrected         -4.242754 model2 > model1 1.104e-05
## BIC-corrected         -4.242754 model2 > model1 1.104e-05

model2 > model1

#fortify data with fitted values and residuals
dta_m <- data.frame(dta, fit1 = predict(m1, type = "response"), 
                      fit2 = predict(m2, type = "response"), 
                      r1 = resid(m1, type="pearson"),
                      r2 = resid(m2, type="pearson"))

1.7 residual plots

ggplot(dta_m, aes(fit1, r1)) +
 geom_point(pch=20) +
 geom_point(aes(fit2, r2), pch=1)  +
 geom_hline(yintercept = 0) +
 labs(x = "Fitted values", y = "Residuals", 
      title = "Poisson vs. Negative Binomial")

“●”為m1 -> poisson “o”為m2 -> negbin 圖顯示m1、m2 fitted values and residuals的關係

1.8 model fits and CIs

dta_m2 <- data.frame(dta, yhat = predict(m2))

1.9 model fit over observations

ggplot(dta_m2, aes(Score, Nomination)) +
 geom_point(pch = 20, alpha = .2) +
 geom_line(aes(Score, yhat), col = "magenta", lwd=rel(1)) +
 stat_smooth(method = "glm", method.args = list(family = poisson)) +
 labs(x = "Bully Score", y = "Predicted number of nomination")

## `geom_smooth()` using formula 'y ~ x'

此圖表示在霸凌得分高者較容易被同儕提名視為霸凌者。加入紅線代表negbin的模型，與先前的藍線poisson模型相互比較。

2 Exercise 2

2.1 load and see

# load packages
pacman::p_load(tidyverse, MASS, pscl)

dta2 <- read.table("D:/sheu/youth_employment.txt", h=T)


head(dta2)

##   Employment   Race     Family    DadEdu Income Local   ASAB
## 1       Work Others Non-Intact Otherwise 0.4909   6.9 -0.110
## 2     School Others Non-Intact Otherwise 0.6940   4.8  0.452
## 3       Work Others  Otherwise Otherwise 1.1000   6.5  0.967
## 4       Work Others Non-Intact   College 1.5000   3.8  1.667
## 5       Work Others Non-Intact Otherwise 0.2544   6.9  0.000
## 6     School Others Non-Intact Otherwise 0.9391   5.4  0.000

str(dta2)

## 'data.frame':    978 obs. of  7 variables:
##  $ Employment: Factor w/ 3 levels "Idle","School",..: 3 2 3 3 3 2 3 2 2 3 ...
##  $ Race      : Factor w/ 2 levels "Black","Others": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Family    : Factor w/ 2 levels "Non-Intact","Otherwise": 1 1 2 1 1 1 2 2 1 2 ...
##  $ DadEdu    : Factor w/ 2 levels "College","Otherwise": 2 2 2 1 2 2 2 1 1 1 ...
##  $ Income    : num  0.491 0.694 1.1 1.5 0.254 ...
##  $ Local     : num  6.9 4.8 6.5 3.8 6.9 5.4 4.2 6.2 6.2 6.9 ...
##  $ ASAB      : num  -0.11 0.452 0.967 1.667 0 ...

2.2 data management

dta2$Race <- relevel(factor(dta2$Race), ref="Others")
dta2$Employment <- relevel(factor(dta2$Employment), ref="Idle")
dta2$Family <- relevel(factor(dta2$Family), ref="Otherwise")
dta2$DadEdu <- relevel(factor(dta2$DadEdu), ref="Otherwise")


#cut data

dta2$Incf<- factor(cut(dta2$Income,breaks=c(quantile(dta2$Income, probs=seq(0, 1, by=.33))),labels=c("Low", "Middle","High"), ordered=T, lowest.inclued=T)) 

dta2$Localf<- factor(cut(dta2$Local,breaks=c(quantile(dta2$Local, probs=seq(0, 1, by=.5))),labels=c("Low","High"), ordered=T, lowest.inclued=T)) 

dta2$ASABf<- factor(cut(dta2$ASAB,breaks=c(quantile(dta2$ASAB, probs=seq(0, 1, by=.33))),labels=c("Low", "Middle","High"), ordered=T, lowest.inclued=T)) 


str(dta2)

## 'data.frame':    978 obs. of  10 variables:
##  $ Employment: Factor w/ 3 levels "Idle","School",..: 3 2 3 3 3 2 3 2 2 3 ...
##  $ Race      : Factor w/ 2 levels "Others","Black": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Family    : Factor w/ 2 levels "Otherwise","Non-Intact": 2 2 1 2 2 2 1 1 2 1 ...
##  $ DadEdu    : Factor w/ 2 levels "Otherwise","College": 1 1 1 2 1 1 1 2 2 2 ...
##  $ Income    : num  0.491 0.694 1.1 1.5 0.254 ...
##  $ Local     : num  6.9 4.8 6.5 3.8 6.9 5.4 4.2 6.2 6.2 6.9 ...
##  $ ASAB      : num  -0.11 0.452 0.967 1.667 0 ...
##  $ Incf      : Ord.factor w/ 3 levels "Low"<"Middle"<..: 2 2 3 3 1 3 3 3 3 3 ...
##  $ Localf    : Ord.factor w/ 2 levels "Low"<"High": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASABf     : Ord.factor w/ 3 levels "Low"<"Middle"<..: 2 3 3 NA 2 2 2 3 3 3 ...

with(dta2,ftable(Race,Family, DadEdu,Employment))

##                             Employment Idle School Work
## Race   Family     DadEdu                               
## Others Otherwise  Otherwise              84    161  141
##                   College                23     66   72
##        Non-Intact Otherwise              47     43   54
##                   College                 5     12   18
## Black  Otherwise  Otherwise              28     60   26
##                   College                 6      8    3
##        Non-Intact Otherwise              39     40   27
##                   College                 3     10    2

a<-as.data.frame(with(dta2,ftable(Race,Family, DadEdu,Employment)))
b<-as.data.frame(with(dta2, ftable(Incf,ASABf,Localf, Employment)))

with(dta2, ftable(Incf,ASABf,Localf, Employment))

##                      Employment Idle School Work
## Incf   ASABf  Localf                            
## Low    Low    Low                 26     39   19
##               High                42     34   19
##        Middle Low                 21     22   15
##               High                11     26   10
##        High   Low                  3     13    8
##               High                 4      5    3
## Middle Low    Low                 13     20   24
##               High                11     24   10
##        Middle Low                 22     28   36
##               High                 6     16   13
##        High   Low                  9     22   29
##               High                 9     19   12
## High   Low    Low                  6      9   10
##               High                 4      6    5
##        Middle Low                 18     23   16
##               High                 4     16   15
##        High   Low                 15     40   59
##               High                11     26   27

2.3 plot

ggplot(subset(a, Employment!="Idle"), aes(DadEdu, Freq/sum(Freq), color=Employment))+
  geom_line()+
  stat_smooth(method = "glm", method.args = list(family=binomial))+
  geom_point()+
  facet_grid(Family~Race)+
  scale_y_continuous(seq(-1.5, 1,by=.5))+
  labs(x="Father's education", y="Log-odds(Inactive)")

## `geom_smooth()` using formula 'y ~ x'

## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

ggplot(subset(b, Employment!="Idle"), aes(ASABf, Freq/cumsum(Freq), color=Employment))+
  geom_line()+
  stat_smooth(method = "glm", method.args = list(family=binomial))+
  geom_point()+
  facet_wrap(Incf~Localf, nrow=2)+
  scale_y_continuous(seq(-1.5, 1,by=.5))+
  labs(x="Father's education", y="Log-odds(Inactive)")

## `geom_smooth()` using formula 'y ~ x'
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

2.4 prameter

summary(m0 <- nnet::multinom(formula= Employment ~ ., data = dta2))

## # weights:  39 (24 variable)
## initial  value 1046.977511 
## iter  10 value 995.801470
## iter  20 value 989.576609
## final  value 989.321874 
## converged

## Call:
## nnet::multinom(formula = Employment ~ ., data = dta2)
## 
## Coefficients:
##        (Intercept)  RaceBlack FamilyNon-Intact DadEduCollege      Income
## School   0.7018017  0.2391504      -0.54708859     0.2263025 -0.07437054
## Work     0.7184842 -0.3885990      -0.04838905     0.1326759  0.22198338
##               Local        ASAB    Incf.L      Incf.Q    Localf.L   ASABf.L
## School -0.004238422 -0.01547256 0.1184440 -0.09025778  0.11135278 0.4116813
## Work   -0.067554926 -0.08513663 0.3412993 -0.29255812 -0.02332271 0.5843150
##           ASABf.Q
## School 0.04306545
## Work   0.12077936
## 
## Std. Errors:
##        (Intercept) RaceBlack FamilyNon-Intact DadEduCollege    Income
## School   0.5384768 0.1997812        0.1888134     0.2396764 0.4137664
## Work     0.5635782 0.2230273        0.1957335     0.2457077 0.4089509
##             Local      ASAB    Incf.L    Incf.Q  Localf.L   ASABf.L   ASABf.Q
## School 0.05398749 0.2637922 0.3230540 0.1537179 0.1882321 0.4201313 0.1469578
## Work   0.05915372 0.2756148 0.3261682 0.1578611 0.2015597 0.4372910 0.1540811
## 
## Residual Deviance: 1978.644 
## AIC: 2026.644

3 Exercise 3

3.1 load and see

pacman::p_load(tidyverse, MASS, HH)
dta3<- read.table("D:/sheu/mentalSES.txt", h=T)
str(dta3)

## 'data.frame':    1660 obs. of  8 variables:
##  $ Id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ses   : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ A     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ B     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ C     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ D     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ E     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mental: int  1 1 1 1 1 1 1 1 1 1 ...

head(dta3)

##   Id ses A B C D E mental
## 1  1   6 1 0 0 0 0      1
## 2  2   6 1 0 0 0 0      1
## 3  3   6 1 0 0 0 0      1
## 4  4   6 1 0 0 0 0      1
## 5  5   6 1 0 0 0 0      1
## 6  6   6 1 0 0 0 0      1

str(dta3)

## 'data.frame':    1660 obs. of  8 variables:
##  $ Id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ses   : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ A     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ B     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ C     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ D     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ E     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mental: int  1 1 1 1 1 1 1 1 1 1 ...

3.2 model

summary(m0 <- polr(ordered(mental) ~ ses, data = dta3, 
                           method = "probit", Hess = TRUE))

## Call:
## polr(formula = ordered(mental) ~ ses, data = dta3, Hess = TRUE, 
##     method = "probit")
## 
## Coefficients:
##       Value Std. Error t value
## ses -0.1021    0.01644  -6.213
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -1.2679   0.0698   -18.1603
## 2|3  -0.2387   0.0654    -3.6490
## 3|4   0.3741   0.0659     5.6761
## 
## Residual Deviance: 4450.272 
## AIC: 4458.272

confint(m0)

## Waiting for profiling to be done...

##      2.5 %     97.5 % 
## -0.1343804 -0.0699364

3.3 probabilities for each category

pnorm(-1.2679)

## [1] 0.1024168

pnorm(-0.2387)- pnorm(-1.2679)

## [1] 0.3032523

pnorm(0.3741)- pnorm(-0.2387)

## [1] 0.2401659

1-pnorm(0.3741)

## [1] 0.354165

pnorm(-1.2679-(-0.1021))

## [1] 0.1218477

pnorm(-0.2387-(-0.1021))-pnorm(-1.2679-(-0.1021))

## [1] 0.3238258

pnorm(0.3741-(-0.1021))-pnorm(-0.2387-(-0.1021))

## [1] 0.2373606

 1 - pnorm(0.3741-(-0.1021))

## [1] 0.3169659

3.4 plot

ndta3 <- dta3 %>% 
  mutate(mental=dplyr::recode(mental, 'Well', 'Mild', 'Moderate', 'Impaired'))%>% 
  mutate(mental=factor(mental, levels=c('Well','Mild','Moderate','Impaired')),
        ses=factor(ses, levels=c('1', '2', '3', '4', '5', '6'), labels = c('F', 'E', 'D', 'C', 'B', 'A')))


likert(t(with(ndta3, table(mental, ses))), main="", ylab="SES")

Exercise

tjlee

2020-11-09