In class Exercise 1

The data are obtained from two surveys of children’s mental health. The data set consists of 1,428 children who had both parent and teacher responses, and an additional 1,073 children with only a parent response. In the psychiatric literature, these sources of respondents are often referred to as “informants”.

A standardized measure of childhood psychopathology was used both by parents (Child Behavior Checklist, CBC) and teachers (Teacher Report Forms, TRF) to assess children in the study. The response variable is derived from the externalizing scale, which assesses delinquent and aggressive behavior. The scale has been dichotomized at the cutpoint for borderline/clinical psychopathology.

Source: Zahner, G.E.P., Jacobs, J.H., Freeman, D.H. & Trainor, K.F. (1993). Rural-urban child psychopathology in a northeastern U.S. State: 1986-1989. Journal of the American Academy of Child and Adolescent Psychiatry, 32, 378-387. Reported in Fitzmaurice, G.M., Laird, N.M., & Ware, J.H. (2004). Applied Longitudinal Analysis. p. 434-439.

  • Column 1: Child ID
  • Column 2: Child’s physical health problems: Fair = Fair to bad , Good
  • Column 3: Single parent status: Single, Otherwise
  • Column 4: Externalizing behavior (Parent report)
  • Column 5: Externalizing behavior (Teachert report)

Data Preparation

dta1<-read.table("ccs_data.txt", na.string = ".")
head(dta1)
##   V1 V2 V3 V4 V5
## 1  1  1  1  0 NA
## 2  2  0  0  1 NA
## 3  3  1  0  0 NA
## 4  4  1  1  1 NA
## 5  5  1  0  0 NA
## 6  6  1  0  0 NA
#require packages
pacman::p_load(MASS, gee, tidyverse, geepack)
# input data
fL <- "https://www.hsph.harvard.edu/fitzmaur/ala2e/ccs-data.txt"
dta <- read.table(fL, na.string = ".")
#rename variables
names(dta) <- c("ID", "Problem", "Single", "Parent", "Teacher")
#recode and factor them
dta <- dta %>%
       mutate(ID = factor(ID), Problem = factor(ifelse(Problem == 1, "Y", "N")),
                               Single = factor(ifelse(Single == 1, "Y", "N")))
#take a look
head(dta)
##   ID Problem Single Parent Teacher
## 1  1       Y      Y      0      NA
## 2  2       N      N      1      NA
## 3  3       Y      N      0      NA
## 4  4       Y      Y      1      NA
## 5  5       Y      N      0      NA
## 6  6       Y      N      0      NA
#check data dimensions
str(dta)
## 'data.frame':    2501 obs. of  5 variables:
##  $ ID     : Factor w/ 2501 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Problem: Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 1 1 ...
##  $ Single : Factor w/ 2 levels "N","Y": 2 1 1 2 1 1 1 1 2 1 ...
##  $ Parent : int  0 1 0 1 0 0 0 0 0 0 ...
##  $ Teacher: int  NA NA NA NA NA NA NA 0 NA 0 ...

Explore data

##
with(dta, ftable(Teacher, Parent))
##         Parent    0    1
## Teacher                 
## 0              1030  129
## 1               165  104
# correlation
cor(dta[,4:5], use="pair")
##            Parent   Teacher
## Parent  1.0000000 0.2913296
## Teacher 0.2913296 1.0000000
# proportion of missing responses (Parent or(|) Teacher missing)
sum(is.na(dta[,4]) | is.na(dta[,5]))/dim(dta)[1]
## [1] 0.4290284
# stack the parents and teachers report; call them informants
dtaL <- dta %>%
        gather(Informant, Report, 4:5)

# sort according to child id
dtaL <- dtaL[order(dtaL$ID), ]

# get rid of annoying row names
rownames(dtaL) <- NULL

# have a look
head(dtaL)
##   ID Problem Single Informant Report
## 1  1       Y      Y    Parent      0
## 2  1       Y      Y   Teacher     NA
## 3  2       N      N    Parent      1
## 4  2       N      N   Teacher     NA
## 5  3       Y      N    Parent      0
## 6  3       Y      N   Teacher     NA

Models

# gee function
summary(m1 <- gee(Report ~ Informant + Problem + Single + Informant:Problem,
        data = dtaL, id = ID, corstr = "exchangeable", family = binomial))
##               (Intercept)          InformantTeacher                  ProblemY 
##                -2.1595398                 0.4712270                 0.5996091 
##                   SingleY InformantTeacher:ProblemY 
##                 0.6332404                -0.4247098
## 
##  GEE:  GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
##  gee S-function, version 4.13 modified 98/01/27 (1998) 
## 
## Model:
##  Link:                      Logit 
##  Variance to Mean Relation: Binomial 
##  Correlation Structure:     Exchangeable 
## 
## Call:
## gee(formula = Report ~ Informant + Problem + Single + Informant:Problem, 
##     id = ID, data = dtaL, family = binomial, corstr = "exchangeable")
## 
## Summary of Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.2841247 -0.1765046 -0.1570826 -0.1039640  0.8960360 
## 
## 
## Coefficients:
##                             Estimate Naive S.E.    Naive z Robust S.E.
## (Intercept)               -2.1539359 0.09016552 -23.888688  0.08888885
## InformantTeacher           0.4738391 0.11498088   4.121025  0.11798848
## ProblemY                   0.5995711 0.11284664   5.313149  0.11278090
## SingleY                    0.6137249 0.10596325   5.791865  0.10759321
## InformantTeacher:ProblemY -0.4572920 0.15701454  -2.912418  0.15711252
##                             Robust z
## (Intercept)               -24.231790
## InformantTeacher            4.015977
## ProblemY                    5.316247
## SingleY                     5.704123
## InformantTeacher:ProblemY  -2.910602
## 
## Estimated Scale Parameter:  1.001019
## Number of Iterations:  2
## 
## Working Correlation
##           [,1]      [,2]
## [1,] 1.0000000 0.2681987
## [2,] 0.2681987 1.0000000
#'glmgee' function, add Single:Problem interaction
summary(m1a <- geeglm(Report ~ Informant + Problem + Single +
                      Informant:Problem + Single:Problem,
        data = dtaL, id = ID, corstr = "exchangeable", family = binomial))
## 
## Call:
## geeglm(formula = Report ~ Informant + Problem + Single + Informant:Problem + 
##     Single:Problem, family = binomial, data = dtaL, id = ID, 
##     corstr = "exchangeable")
## 
##  Coefficients:
##                           Estimate  Std.err    Wald Pr(>|W|)    
## (Intercept)               -2.16615  0.09410 529.882  < 2e-16 ***
## InformantTeacher           0.47515  0.11819  16.162 5.81e-05 ***
## ProblemY                   0.62177  0.12654  24.144 8.94e-07 ***
## SingleY                    0.65789  0.15949  17.014 3.71e-05 ***
## InformantTeacher:ProblemY -0.45906  0.15722   8.526   0.0035 ** 
## ProblemY:SingleY          -0.07945  0.21572   0.136   0.7127    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = exchangeable 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)        1 0.06415
##   Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha   0.2668 0.03029
## Number of clusters:   2501  Maximum cluster size: 2
# exp(SingleY's coefficiet)
exp(0.6579)
## [1] 1.931
  • For kids of single parent status, both informants were more likely to report behavior problems.
#Teacher found out Problem
exp((-2.1662+0.4751)+(-0.4591))
## [1] 0.1165
  • The effect of children health problems on report of behavior problems is more apparent on parents than on teachers.

Residual Plots

# residual plot - not informative
plot(m1a, "pearson", which = 1)
grid() #加格線

Fitted Probabilities

# set theme
ot <- theme_set(theme_bw())

# fortify data with model fits
dta_m1 <- data.frame(na.omit(dtaL), phat = fitted(m1a))

# compare observed with fitted
ggplot(dta_m1, aes(Problem, Report, color = Informant)) +
 geom_point(aes(Problem, phat),position = position_dodge(width=.2),
            pch = 1, size = rel(2)) +
 geom_hline(yintercept = c(mean(dta$Parent, na.rm = T),
                           mean(dta$Teacher, na.rm = T)), linetype = "dotted")+
 stat_summary(fun.data = "mean_cl_boot", position = position_dodge(width=.2)) +
 facet_wrap( ~ Single)+
 labs(x = "Physical Problem", y = "Probability of report") +
 theme(legend.position = c(.1, .85))

In Class Exercise 2

The data come from a randomized, double-blind clinical trial comparing active hypnotic drug with a placebo in patients who have insomnia problems. The response is the patient’s reported time (in minutes) to fall asleep after going to bed. Patients responded before and after a two-week treatment period. The patients receiving the two treatments were independent samples.

Source: Agresti, A. (2002). Categorical Data Analysis. 2nd Ed. p. 462

  • Column 1: Subject ID
  • Column 2: Treatment, Active = 1, Placebo = 0
  • Column 3: Time, Initial = 0, Follow-up = 1
  • Column 4: Time to sleep, < 20 min. = 1, 20-30 = 2, 30-60 = 3, > 60 min. = 4

Data preparation

dta2<- read.table("insomnia.txt", h=T)
head(dta2)
##   case treat occasion resp
## 1    1     1        0    1
## 2    1     1        1    1
## 3    2     1        0    1
## 4    2     1        1    1
## 5    3     1        0    1
## 6    3     1        1    1
#
library(repolr)

#
dta2_repolr <- repolr(resp ~ treat + occasion + treat:occasion, 
                     subjects="case", data=dta2, times=c(1,2), categories=4)
#
summary(dta2_repolr)
## 
## repolr: 2016-02-26 version 3.4 
## 
## Call:
## repolr(formula = resp ~ treat + occasion + treat:occasion, subjects = "case", 
##     data = dta2, times = c(1, 2), categories = 4)
## 
## Coefficients: 
##                 coeff     se.robust  z.robust  p.value 
## cuts1|2          -2.2671    0.2091   -10.8422    0.0000
## cuts2|3          -0.9515    0.1769    -5.3787    0.0000
## cuts3|4           0.3517    0.1751     2.0086    0.0446
## treat             0.0336    0.2369     0.1418    0.8872
## occasion          1.0381    0.1564     6.6375    0.0000
## treat:occasion    0.7077    0.2458     2.8792    0.0040
## 
## Correlation Structure:  independence 
## Fixed Correlation:  0
  • Initial effect = -1.7458 for Treatment,
  • -1.7458 + 0.7077 = -1.04 for Placebo.

  • Odds ration: exp(-1.7458)=0.175, exp(-1.04)=0.353

  • Treatment(Placebo) effect exp(-0.7414) = 1.03 initially,
  • exp(-0.7414+0.7077) = 0.967 at follow-up.

# hand-made cumulative prop. against time to sleep

#
dta2$treat <- factor(ifelse(dta2$treat==1, "Active", "Placebo"))

#
dta2$occasion <- factor(ifelse(dta2$occasion==1, "Follow-up", "Initial"))

#
dta2.tbl <- ftable(dta2$treat, dta2$occasion , dta2$resp)

#
dta2.tbl
##                     1  2  3  4
##                               
## Active  Follow-up  40 49 19 11
##         Initial    12 20 40 47
## Placebo Follow-up  31 29 35 25
##         Initial    14 20 35 51

Explore data

#
obsp <- as.numeric(t(prop.table(dta2.tbl, m=1)))

#
clp <- c(cumsum(obsp[1:4]), cumsum(obsp[5:8]), cumsum(obsp[9:12]), 
         cumsum(obsp[13:16]))
#
yt <-rep(c(20,25,45,60),4)

#
y <- data.frame(time2sleep=yt, clprop=clp)

#
plot(y[,1], y[,2], type="n",
     xlab="Time to Sleep (min.)", ylab="Cumulative proportion")

#
lines(y[1:4,  1],  y[1:4,2], lty=1, col="steelblue")
lines(y[5:8,  1],  y[5:8,2], lty=5, col="steelblue")
lines(y[9:12, 1],  y[9:12,2], lty=2, col="indianred")
lines(y[13:16,1],  y[13:16,2], lty=4, col="indianred")

#
text(22, .12, "T,I")
text(22, .47, "T,F")
text(22, .20, "P,I")
text(22, .33, "P,F")
#
grid()

Exercise 1

The data are from the Muscatine Coronary Risk Factor (MCRF) study, a longitudinal survey of school-age children in Muscatine, Iowa. The MCRF study had the goal of examining the development and persistence of risk factors for coronary disease in children. In the MCRF study, weight and height measurements of five cohorts of children, initially aged 5-7, 7-9, 9-11, 11-13, and 13-15 years, were obtained biennially from 1977 to 1981. Data were collected on 4856 boys and girls. On the basis of a comparison of their weight to age-gender specific norms, children were classified as obese or not obese. The goal of the analysis is to determine whether the risk of obesity increases with age and whether patterns of change in obesity are the same for boys and girls.

Source: Woolson, R.F., & Clarke, W.R. (1984). Analysis of categorical incompletel longitudinal data. Journal of the Royal Statistical Society, Series A, 147, 87-99. Reported in Fitzmaurice, G.M., Laired, N.M., & Ware, J.H. (2004). Applied Longitudinal Data Analysis. pp. 306-312.

  • Column 1: ID
  • Column 2: Gender, 0 = Male, 1 = Female
  • Column 3: Baseline Age
  • Column 4: Current Age, Age denotes mid-point of age-cohort
  • Column 5: Occasions of measurements
  • Column 6: Obesity Status, 1 = Obese, 0 = Non-Obese, . = Missing
library(knitr)
library(rmdformats)
fL <- "https://content.sph.harvard.edu/fitzmaur/ala2e/muscatine-data.txt"
dta_1 <- read.table(fL, h=F, na.strings = '.')
names(dta_1) <- c("ID","Gender","Age0","Age","Occasion","Obese")
dta_1$Gender <- ifelse(dta_1$Gender == 0, "M", "F")
head(dta_1)
##   ID Gender Age0 Age Occasion Obese
## 1  1      M    6   6        1     1
## 2  1      M    6   8        2     1
## 3  1      M    6  10        3     1
## 4  2      M    6   6        1     1
## 5  2      M    6   8        2     1
## 6  2      M    6  10        3     1
dta_1$Age_c<-factor(dta_1$Age-12)
dta_1$Gender<-factor(dta_1$Gender)
dta_1$Occasion<-factor(dta_1$Occasion)
dta_1$Obese<-as.integer(ifelse(dta_1$Obese==1, 1, 0))
#m0_1<-repolr(Obese~Age_c*Gender, data=dta_1,  subjects = "ID",            times=c(1,2,3), categories=2, corr.mod="ar1")
#summary(m0_1)

Exercise 2

The data are from a survey of primary care patients in Cantabrian, Spain, to estimate the prevalence of psychiatric illness. The prevalence for each of the sexes is estimated from data provided by the general practitioner and from completing the general health questionnaire.

Source: Everitt, B.S., & Dunn, G. (2001). Applied Multivariate Data Analysis, 2nd Ed. p. 237

  • Column 1: Subject ID
  • Column 2: Illness = 1, Otherweise = 0
  • Column 3: Gender ID
  • Column 4: General practitioner = GP, General health questionnaire = GPQ
dta_2<-read.table("cantabrian.txt", h=T)

names(dta_2)<-c("ID", "Illness", "Gender", "Type")
dta_2<-dta_2 %>% mutate(ID=factor(ID),
                       Illness=factor(ifelse(Illness==1, 1, 0),
                                      labels = c("Illness","Non-illness")),
                       Gender=factor(Gender), 
                       Type=factor(Type))
str(dta_2)  
## 'data.frame':    1646 obs. of  4 variables:
##  $ ID     : Factor w/ 823 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ Illness: Factor w/ 2 levels "Illness","Non-illness": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Type   : Factor w/ 2 levels "GHQ","GP": 2 1 2 1 2 1 2 1 2 1 ...
with(dta_2, ftable(ftable(Illness, Gender, Type)))
##                    Type GHQ  GP
## Illness     Gender             
## Illness     F           306 420
##             M           244 287
## Non-illness F           193  79
##             M            80  37
dta_2p<-as.data.frame(prop.table(with(dta_2,ftable(Illness,Type)), m=2))
dta_2p
##       Illness Type   Freq
## 1     Illness  GHQ 0.6683
## 2 Non-illness  GHQ 0.3317
## 3     Illness   GP 0.8591
## 4 Non-illness   GP 0.1409
#summary(m0_2 <- geeglm(Illness ~ Gender + Type + Gender*Type, data=dta_2, id = ID, corstr="exchangeable", family = binomial, na.action=na.omit))