1 Data Management

Load data

dta<-load("C:/Users/ASUS/Desktop/data/ghq.rda")

str(ghq)
## 'data.frame':    22 obs. of  4 variables:
##  $ sex: Factor w/ 2 levels "men","women": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ghq: int  0 1 2 3 4 5 6 7 8 9 ...
##  $ c  : int  0 0 1 0 1 3 0 2 0 0 ...
##  $ nc : int  18 8 1 0 0 0 0 0 0 0 ...

2 Analysis (Model Comparison)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
dta <- ghq %>% mutate(Total = c + nc) %>%
  dplyr::rename(Gender=sex, Score=ghq, Case=c, Noncase=nc)

2.1 m2

m2 is a saturated model comprises Score, Gender, and their interaction term as IVs

m2 <- glm(cbind(Case, Noncase) ~ Score + Gender + Score:Gender, data=dta, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m2)
## 
## Call:
## glm(formula = cbind(Case, Noncase) ~ Score + Gender + Score:Gender, 
##     family = binomial, data = dta)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.94972   0.00000   0.00000   0.06319   0.48718  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -43.38   21986.72  -0.002    0.998
## Score                21.69   10993.36   0.002    0.998
## Genderwomen          40.38   21986.72   0.002    0.999
## Score:Genderwomen   -20.45   10993.36  -0.002    0.999
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.1763  on 16  degrees of freedom
## Residual deviance:  1.5422  on 13  degrees of freedom
## AIC: 22.018
## 
## Number of Fisher Scoring iterations: 23

2.2 m3

m3 is the model comprises Score & Gender as IVs

m3 <- glm(cbind(Case, Noncase) ~ Score + Gender, data=dta, family=binomial)
summary(m3)
## 
## Call:
## glm(formula = cbind(Case, Noncase) ~ Score + Gender, family = binomial, 
##     data = dta)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.25768   0.00000   0.02623   0.23648   0.82922  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.0721     0.9758  -4.173 3.01e-05 ***
## Score         1.4328     0.2907   4.929 8.26e-07 ***
## Genderwomen   0.7938     0.9289   0.855    0.393    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.1763  on 16  degrees of freedom
## Residual deviance:  4.9419  on 14  degrees of freedom
## AIC: 23.418
## 
## Number of Fisher Scoring iterations: 6

2.3 m4

m4 is a model comprising Score as IV

m4 <- glm(cbind(Case, Noncase) ~ Score, data=dta, family=binomial)
summary(m4)
## 
## Call:
## glm(formula = cbind(Case, Noncase) ~ Score, family = binomial, 
##     data = dta)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.41618   0.00000   0.04739   0.33149   0.53195  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.4536     0.5633  -6.131 8.73e-10 ***
## Score         1.4402     0.2979   4.834 1.34e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.176  on 16  degrees of freedom
## Residual deviance:  5.744  on 15  degrees of freedom
## AIC: 22.22
## 
## Number of Fisher Scoring iterations: 6

2.4 m5

m5 is a model comprising Gender as IV

m5 <- glm(cbind(Case, Noncase) ~ Gender, data=dta, family=binomial(link=logit))
summary(m5)
## 
## Call:
## glm(formula = cbind(Case, Noncase) ~ Gender, family = binomial(link = logit), 
##     data = dta)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.781   0.000   1.340   1.718   2.976  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -1.2164     0.4025  -3.022  0.00251 **
## Genderwomen   0.1643     0.4726   0.348  0.72811   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.176  on 16  degrees of freedom
## Residual deviance: 83.054  on 15  degrees of freedom
## AIC: 99.53
## 
## Number of Fisher Scoring iterations: 5

2.5 Model Comparison

knitr::kable(anova(m4, m3, m2, test="Chisq"))
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
15 5.743961 NA NA NA
14 4.941883 1 0.8020783 0.3704727
13 1.542230 1 3.3996529 0.0652101
knitr::kable(anova(m5, m2, test="Chisq"))
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
15 83.05382 NA NA NA
13 1.54223 2 81.51159 0
knitr::kable(anova(m3, m2, test="Chisq"))
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
14 4.941883 NA NA NA
13 1.542230 1 3.399653 0.0652101

2.6 Goodness of fit

The output indicated that the deviance value did not approximate the chisquare value, thus m3(the model of both Score + Gender) is Not a good fit for the data

(m3$null.deviance - m3$deviance)/m3$null.deviance
## [1] 0.9405854
1 - pchisq(m3$deviance, m3$df.residual)
## [1] 0.9866052

2.7 m3 coefficients

The output showed score is a significant predictor for cbind(c,nc), while gender is not.
Furthermore, conditional on gender, one point increases in GHQ is associated with the
increase of 157% to 720% in the odds for being a case.

sjPlot::tab_model(m3, show.r2=FALSE, show.obs=FALSE, show.se=TRUE)
  cbind(Case, Noncase)
Predictors Odds Ratios std. Error CI p
(Intercept) 0.02 0.98 0.00 – 0.08 <0.001
Score 4.19 0.29 2.57 – 8.20 <0.001
Gender [women] 2.21 0.93 0.42 – 17.73 0.393

3 Visdualization

3.1 GHQ against Empirical/fitted log-odd

dta_p <- dta %>%
       mutate(Prop=Case/Total,
              phat=predict(m3, 
                              newdata=dta[, c(1,2)], 
                              type="response"),
              elgit=log((Case+1/22)/(Noncase+(1/22))))
library(ggplot2)

ggplot(dta, 
       aes(Score, fitted(m3), 
           color=Gender)) + 
  geom_line(alpha=.5) +
  geom_point(data=dta_p,
             aes(Score, elgit))+
  scale_color_manual(values=c("black", "red3"))+
  labs(x="Score on GHQ",
       y="Empirical and fitted log-odds")+
  theme_minimal()+
  theme(legend.position = c(.1, .9))

3.2 Probablity chart

library(faraway)
## 
## Attaching package: 'faraway'
## The following object is masked from 'package:plyr':
## 
##     ozone
beta <- coef(m3)

with(dta, plot(Score, Case/Total, 
               col=Gender, 
               bty='n', 
               xlab="GHQ score"))
grid()
curve(ilogit(beta[1]+beta[2]*x), add=T, col=1)
curve(ilogit(beta[1]+beta[2]*x+beta[3]), add=T, col=2)