glm

Q1

# package management
require(pacman)

## Loading required package: pacman

# load packages
pacman::p_load(tidyverse, MASS, HH)

dat = read.table("http://140.116.183.121/~sheu/lmm/Data/mentalSES.txt", h = T)
str(dat)

## 'data.frame':    1660 obs. of  8 variables:
##  $ Id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ses   : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ A     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ B     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ C     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ D     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ E     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mental: int  1 1 1 1 1 1 1 1 1 1 ...

t<-with(dat, addmargins(table(ses, mental)))

row.names(t)<-c("F","E","D","C","B","A","total")
colnames(t)<-c("well","mild","moderate","impaired","total")
t

##        mental
## ses     well mild moderate impaired total
##   F       21   71       54       71   217
##   E       36   97       54       78   265
##   D       72  141       77       94   384
##   C       57  105       65       60   287
##   B       57   94       54       40   245
##   A       64   94       58       46   262
##   total  307  602      362      389  1660

likert(t(t), as.percent = TRUE, main = "", ylab = "mental impairment")

dat$mental<-dat$mental %>% as.factor
summary(m0 <- polr(mental ~ ses, data = dat,
                   method = "probit", Hess = TRUE))

## Call:
## polr(formula = mental ~ ses, data = dat, Hess = TRUE, method = "probit")
## 
## Coefficients:
##       Value Std. Error t value
## ses -0.1021    0.01644  -6.213
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -1.2679   0.0698   -18.1603
## 2|3  -0.2387   0.0654    -3.6490
## 3|4   0.3741   0.0659     5.6761
## 
## Residual Deviance: 4450.272 
## AIC: 4458.272

Q2

##Within the past 12 months, how many people have you known personally that were victims of homicide?
 #Let Yij be the response for subject j of race i. 
 #It is assumed that Yij follows a Negative Binomial random variable with a parameter μij (expected value).
require(pacman)

pacman::p_load(tidyverse, MASS, magrittr)

dat <- read.table("http://140.116.183.121/~sheu/lmm/Data/victims.txt", h = T)
dat$race %<>% as.factor

head(dat)

##   nvic race
## 1    0    0
## 2    0    0
## 3    0    0
## 4    0    0
## 5    0    0
## 6    0    0

str(dat)

## 'data.frame':    1308 obs. of  2 variables:
##  $ nvic: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ race: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

aggregate(nvic ~ race, data = dat, mean)

##   race       nvic
## 1    0 0.09225413
## 2    1 0.52201258

aggregate(nvic ~ race, data = dat, sd)

##   race      nvic
## 1    0 0.3940112
## 2    1 1.0723007

#Plot
ggplot(dat, aes(x = race, y = nvic))+
  stat_summary(fun.data = mean_se)+
  labs(x = "Race", y = "Mean victims personally known")+
  theme_bw()

require(qcc)

## Loading required package: qcc

## Package 'qcc' version 2.7

## Type 'citation("qcc")' for citing this R package in publications.

qcc.overdispersion.test(dat$nvic,type='poisson')

##                    
## Overdispersion test Obs.Var/Theor.Var Statistic p-value
##        poisson data          2.042251  2669.222       0

#Negative Binomial
summary(m <- glm.nb(nvic~race, data = dat))

## 
## Call:
## glm.nb(formula = nvic ~ race, data = dat, init.theta = 0.2023119205, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7184  -0.3899  -0.3899  -0.3899   3.5072  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.3832     0.1172 -20.335  < 2e-16 ***
## race1         1.7331     0.2385   7.268 3.66e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2023) family taken to be 1)
## 
##     Null deviance: 471.57  on 1307  degrees of freedom
## Residual deviance: 412.60  on 1306  degrees of freedom
## AIC: 1001.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2023 
##           Std. Err.:  0.0409 
## 
##  2 x log-likelihood:  -995.7980

Q3

##Interpret the effect of having above secondary education compared to none, 
 #holding other terms in the model constant, on the expected number of children born.


 #log(E[Yjkl]) = log(njkl) + log(E[Yijkl]),
 #where Yijkl follows a Poisson distribution with mean μijkl (for each woman i);
 #log(μijkl) = β0 + β1×durationij+ β2×residenceik+ β3×educationil. 
 #The offset is the number of women in a cell njkl on the logarithmic unit.

dat = read.table("http://data.princeton.edu/wws509/datasets/ceb.dat", h = T)
str(dat)

## 'data.frame':    70 obs. of  7 variables:
##  $ dur : Factor w/ 6 levels "0-4","10-14",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ res : Factor w/ 3 levels "rural","Suva",..: 2 2 2 2 3 3 3 3 1 1 ...
##  $ educ: Factor w/ 4 levels "lower","none",..: 2 1 4 3 2 1 4 3 2 1 ...
##  $ mean: num  0.5 1.14 0.9 0.73 1.17 0.85 1.05 0.69 0.97 0.96 ...
##  $ var : num  1.14 0.73 0.67 0.48 1.06 1.59 0.73 0.54 0.88 0.81 ...
##  $ n   : int  8 21 42 51 12 27 39 51 62 102 ...
##  $ y   : num  4 23.9 37.8 37.2 14 ...

dat %<>% mutate(ln = log(n))
dat %<>% mutate(ry = round(y))

# conditional histogram
ggplot(dat, aes(mean,n, fill=educ)) +
  geom_bar(position="dodge", stat="identity")+
  labs(x = "Mean number of children ever born", y = "Number of women in the cell")+
  theme_bw()

summary(m0 <- glm(ry~offset(ln), data = dat, family = "poisson"))

## 
## Call:
## glm(formula = ry ~ offset(ln), family = "poisson", data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -18.6368   -4.4774   -0.8548    2.5292   21.9744  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 1.376346   0.009712   141.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3731.9  on 69  degrees of freedom
## Residual deviance: 3731.9  on 69  degrees of freedom
## AIC: 4163.3
## 
## Number of Fisher Scoring iterations: 5

summary(m1 <- glm(ry~dur+res+educ, data = dat, family = "poisson"))

## 
## Call:
## glm(formula = ry ~ dur + res + educ, family = "poisson", data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -18.2023   -4.1368   -0.3074    3.7858   16.2193  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.77392    0.04655  102.56  < 2e-16 ***
## dur10-14     1.22136    0.05024   24.31  < 2e-16 ***
## dur15-19     1.31091    0.04975   26.35  < 2e-16 ***
## dur20-24     1.32965    0.04965   26.78  < 2e-16 ***
## dur25-29     1.89210    0.04756   39.78  < 2e-16 ***
## dur5-9       0.89541    0.05240   17.09  < 2e-16 ***
## resSuva     -1.44312    0.02785  -51.83  < 2e-16 ***
## resurban    -1.04901    0.02405  -43.62  < 2e-16 ***
## educnone     0.16257    0.02168    7.50  6.4e-14 ***
## educsec+    -1.94337    0.05208  -37.32  < 2e-16 ***
## educupper   -0.88738    0.02951  -30.07  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 13735.6  on 69  degrees of freedom
## Residual deviance:  2504.1  on 59  degrees of freedom
## AIC: 2955.6
## 
## Number of Fisher Scoring iterations: 5

show(m1_est <- cbind(Estimate = coef(m1), confint(m1)))

## Waiting for profiling to be done...

##               Estimate      2.5 %     97.5 %
## (Intercept)  4.7739160  4.6815191  4.8640221
## dur10-14     1.2213645  1.1237563  1.3207362
## dur15-19     1.3109110  1.2142997  1.4093489
## dur20-24     1.3296518  1.2332392  1.4279035
## dur25-29     1.8920957  1.7999103  1.9863830
## dur5-9       0.8954115  0.7934300  0.9988723
## resSuva     -1.4431223 -1.4980066 -1.3888461
## resurban    -1.0490081 -1.0963210 -1.0020520
## educnone     0.1625697  0.1201065  0.2050818
## educsec+    -1.9433683 -2.0468417 -1.8426479
## educupper   -0.8873758 -0.9454534 -0.8297636

exp(m1_est)

##                Estimate       2.5 %      97.5 %
## (Intercept) 118.3819229 107.9339086 129.5441994
## dur10-14      3.3918129   3.0763883   3.7461784
## dur15-19      3.7095517   3.3679346   4.0932894
## dur20-24      3.7797271   3.4323295   4.1699476
## dur25-29      6.6332554   6.0491049   7.2891210
## dur5-9        2.4483431   2.2109671   2.7152182
## resSuva       0.2361892   0.2235754   0.2493629
## resurban      0.3502850   0.3340980   0.3671253
## educnone      1.1765304   1.1276170   1.2276255
## educsec+      0.1432207   0.1291421   0.1583975
## educupper     0.4117348   0.3885034   0.4361524

#控制其他變項下，above secondary education者多0.14倍的出生小孩數量

Q4

#
# cumulative probit model : ordinal regression
#
dat = read.table("http://140.116.183.121/~sheu/lmm/Data/grayScale.txt", h = T)

#
require(pacman)

pacman::p_load(tidyverse, MASS, HH)

head(dat)

##   oid count resp contrast
## 1   1    19    1       -2
## 2   2    14    2       -2
## 3   3     3    3       -2
## 4   4     2    4       -2
## 5   5     0    5       -2
## 6   6     0    6       -2

str(dat)

## 'data.frame':    60 obs. of  4 variables:
##  $ oid     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ count   : int  19 14 3 2 0 0 0 0 0 0 ...
##  $ resp    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ contrast: int  -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 ...

## likert scale plotting
dat_g = as.data.frame(matrix(dat$count, 12, 5))  #12Response category*5Stimulus level
names(dat_g) = -2:2
rownames(dat_g) <- c('6d', '5d', '4d', '3d','2d', '1d','1b', '2b', '3b', '4b','5b', '6b')
likert(t(dat_g), as.percent = T, main = "", ylab = "Stimulus level")

#
dat$resp %<>% as.factor
summary(m0 <- polr(resp ~ contrast, data = dat, weight = count, 
                   method = "probit", Hess = TRUE))

## Call:
## polr(formula = resp ~ contrast, data = dat, weights = count, 
##     Hess = TRUE, method = "probit")
## 
## Coefficients:
##          Value Std. Error t value
## contrast 1.707     0.1118   15.27
## 
## Intercepts:
##       Value    Std. Error t value 
## 1|2    -3.3680   0.2448   -13.7585
## 2|3    -2.4946   0.2157   -11.5656
## 3|4    -1.7943   0.1886    -9.5129
## 4|5    -1.2721   0.1689    -7.5337
## 5|6     0.0259   0.1433     0.1804
## 6|7     0.5944   0.1485     4.0014
## 7|8     1.2060   0.1654     7.2927
## 8|9     1.7557   0.1836     9.5635
## 9|10    2.4285   0.2066    11.7558
## 10|11   3.4400   0.2525    13.6242
## 11|12   3.9295   0.2738    14.3505
## 
## Residual Deviance: 597.6344 
## AIC: 621.6344

confint(m0)

## Waiting for profiling to be done...

##    2.5 %   97.5 % 
## 1.492628 1.930716

Q6

##log(p/(1-p)) = β0 + β1 Advise + β2 Arrest,
 #p = probability of future violence.
dat = read.table("http://140.116.183.121/~sheu/lmm/Data/domesticViolence.txt",header = T)
str(dat)

## 'data.frame':    312 obs. of  3 variables:
##  $ X1  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ X0  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X0.1: int  0 0 0 0 0 0 0 0 0 0 ...

dat %<>% mutate(treatment = ifelse(X0.1==1, "arrest", 
                                  ifelse(X0==1, "advise", "separate")))
names(dat) <- c("v", "adv", "ar","treatment") 

#Plot
ggplot(dat, aes(x = treatment, y = v))+
  stat_summary(fun.data = mean_se)+
  labs(x = "Treatment", y = "Proportion of future violence")+
  theme_bw()

summary(m <- glm(v~treatment, data = dat, family = "binomial", contrasts = "contr.sum"))

## 
## Call:
## glm(formula = v ~ treatment, family = "binomial", data = dat, 
##     contrasts = "contr.sum")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7108  -0.7108  -0.6576  -0.4797   2.1067  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.4214     0.2431  -5.846 5.03e-09 ***
## treatmentarrest    -0.6827     0.4139  -1.650    0.099 .  
## treatmentseparate   0.1744     0.3326   0.524    0.600    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 293.66  on 311  degrees of freedom
## Residual deviance: 288.59  on 309  degrees of freedom
## AIC: 294.59
## 
## Number of Fisher Scoring iterations: 4

glm

Li

2017年11月27日

Q1

Q2

Q3

Q4

Q6