1 Data Management

Load data

dta<-load("C:/Users/ASUS/Desktop/data/ghq.rda")

str(ghq)

## 'data.frame':    22 obs. of  4 variables:
##  $ sex: Factor w/ 2 levels "men","women": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ghq: int  0 1 2 3 4 5 6 7 8 9 ...
##  $ c  : int  0 0 1 0 1 3 0 2 0 0 ...
##  $ nc : int  18 8 1 0 0 0 0 0 0 0 ...

2 Analysis (Model Comparison)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

dta <- ghq %>% mutate(Total = c + nc) %>%
  dplyr::rename(Gender=sex, Score=ghq, Case=c, Noncase=nc)

2.1 m2

m2 is a saturated model comprises Score, Gender, and their interaction term as IVs

m2 <- glm(cbind(Case, Noncase) ~ Score + Gender + Score:Gender, data=dta, family=binomial)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(m2)

## 
## Call:
## glm(formula = cbind(Case, Noncase) ~ Score + Gender + Score:Gender, 
##     family = binomial, data = dta)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.94972   0.00000   0.00000   0.06319   0.48718  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -43.38   21986.72  -0.002    0.998
## Score                21.69   10993.36   0.002    0.998
## Genderwomen          40.38   21986.72   0.002    0.999
## Score:Genderwomen   -20.45   10993.36  -0.002    0.999
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.1763  on 16  degrees of freedom
## Residual deviance:  1.5422  on 13  degrees of freedom
## AIC: 22.018
## 
## Number of Fisher Scoring iterations: 23

2.2 m3

m3 is the model comprises Score & Gender as IVs

m3 <- glm(cbind(Case, Noncase) ~ Score + Gender, data=dta, family=binomial)
summary(m3)

## 
## Call:
## glm(formula = cbind(Case, Noncase) ~ Score + Gender, family = binomial, 
##     data = dta)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.25768   0.00000   0.02623   0.23648   0.82922  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.0721     0.9758  -4.173 3.01e-05 ***
## Score         1.4328     0.2907   4.929 8.26e-07 ***
## Genderwomen   0.7938     0.9289   0.855    0.393    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.1763  on 16  degrees of freedom
## Residual deviance:  4.9419  on 14  degrees of freedom
## AIC: 23.418
## 
## Number of Fisher Scoring iterations: 6

2.3 m4

m4 is a model comprising Score as IV

m4 <- glm(cbind(Case, Noncase) ~ Score, data=dta, family=binomial)
summary(m4)

## 
## Call:
## glm(formula = cbind(Case, Noncase) ~ Score, family = binomial, 
##     data = dta)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.41618   0.00000   0.04739   0.33149   0.53195  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.4536     0.5633  -6.131 8.73e-10 ***
## Score         1.4402     0.2979   4.834 1.34e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.176  on 16  degrees of freedom
## Residual deviance:  5.744  on 15  degrees of freedom
## AIC: 22.22
## 
## Number of Fisher Scoring iterations: 6

2.4 m5

m5 is a model comprising Gender as IV

m5 <- glm(cbind(Case, Noncase) ~ Gender, data=dta, family=binomial(link=logit))
summary(m5)

## 
## Call:
## glm(formula = cbind(Case, Noncase) ~ Gender, family = binomial(link = logit), 
##     data = dta)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.781   0.000   1.340   1.718   2.976  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -1.2164     0.4025  -3.022  0.00251 **
## Genderwomen   0.1643     0.4726   0.348  0.72811   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.176  on 16  degrees of freedom
## Residual deviance: 83.054  on 15  degrees of freedom
## AIC: 99.53
## 
## Number of Fisher Scoring iterations: 5

2.5 Model Comparison

knitr::kable(anova(m4, m3, m2, test="Chisq"))

Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
15	5.743961	NA	NA	NA
14	4.941883	1	0.8020783	0.3704727
13	1.542230	1	3.3996529	0.0652101

knitr::kable(anova(m5, m2, test="Chisq"))

Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
15	83.05382	NA	NA	NA
13	1.54223	2	81.51159	0

knitr::kable(anova(m3, m2, test="Chisq"))

Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
14	4.941883	NA	NA	NA
13	1.542230	1	3.399653	0.0652101

2.6 Goodness of fit

The output indicated that the deviance value did not approximate the chisquare value, thus m3(the model of both Score + Gender) is Not a good fit for the data

(m3$null.deviance - m3$deviance)/m3$null.deviance

## [1] 0.9405854

1 - pchisq(m3$deviance, m3$df.residual)

## [1] 0.9866052

2.7 m3 coefficients

The output showed score is a significant predictor for cbind(c,nc), while gender is not.
Furthermore, conditional on gender, one point increases in GHQ is associated with the
increase of 157% to 720% in the odds for being a case.

sjPlot::tab_model(m3, show.r2=FALSE, show.obs=FALSE, show.se=TRUE)

	cbind(Case, Noncase)
Predictors	Odds Ratios	std. Error	CI	p
(Intercept)	0.02	0.98	0.00 – 0.08	<0.001
Score	4.19	0.29	2.57 – 8.20	<0.001
Gender [women]	2.21	0.93	0.42 – 17.73	0.393

3 Visdualization

3.1 GHQ against Empirical/fitted log-odd

dta_p <- dta %>%
       mutate(Prop=Case/Total,
              phat=predict(m3, 
                              newdata=dta[, c(1,2)], 
                              type="response"),
              elgit=log((Case+1/22)/(Noncase+(1/22))))
library(ggplot2)

ggplot(dta, 
       aes(Score, fitted(m3), 
           color=Gender)) + 
  geom_line(alpha=.5) +
  geom_point(data=dta_p,
             aes(Score, elgit))+
  scale_color_manual(values=c("black", "red3"))+
  labs(x="Score on GHQ",
       y="Empirical and fitted log-odds")+
  theme_minimal()+
  theme(legend.position = c(.1, .9))

3.2 Probablity chart

library(faraway)

## 
## Attaching package: 'faraway'

## The following object is masked from 'package:plyr':
## 
##     ozone

beta <- coef(m3)

with(dta, plot(Score, Case/Total, 
               col=Gender, 
               bty='n', 
               xlab="GHQ score"))
grid()
curve(ilogit(beta[1]+beta[2]*x), add=T, col=1)
curve(ilogit(beta[1]+beta[2]*x+beta[3]), add=T, col=2)

GHQ Data example

Szu-Yu Chen

29 Oct 2020