1 Exercise 1

gee model for multiple sources data

#
require(pacman)

## Loading required package: pacman

#
pacman::p_load(MASS, gee, tidyverse, geepack)

1.1 input data

fL <- "https://www.hsph.harvard.edu/fitzmaur/ala2e/ccs-data.txt"
dta <- read.table(fL, na.string = ".")

1.2 rename

names(dta) <- c("ID", "Problem", "Single", "Parent", "Teacher")

1.3 change data form

dta <- dta %>%
       mutate(ID = factor(ID), Problem = factor(ifelse(Problem == 1, "Y", "N")),
                               Single = factor(ifelse(Single == 1, "Y", "N")))

1.4 see it

head(dta)

##   ID Problem Single Parent Teacher
## 1  1       Y      Y      0      NA
## 2  2       N      N      1      NA
## 3  3       Y      N      0      NA
## 4  4       Y      Y      1      NA
## 5  5       Y      N      0      NA
## 6  6       Y      N      0      NA

#
str(dta)

## 'data.frame':    2501 obs. of  5 variables:
##  $ ID     : Factor w/ 2501 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Problem: Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 1 1 ...
##  $ Single : Factor w/ 2 levels "N","Y": 2 1 1 2 1 1 1 1 2 1 ...
##  $ Parent : int  0 1 0 1 0 0 0 0 0 0 ...
##  $ Teacher: int  NA NA NA NA NA NA NA 0 NA 0 ...

##
with(dta, ftable(Teacher, Parent))

##         Parent    0    1
## Teacher                 
## 0              1030  129
## 1               165  104

165>129 老師報告學生違規和攻擊行為較多。

1.5 correlation

cor(dta[,4:5], use="pair")

##            Parent   Teacher
## Parent  1.0000000 0.2913296
## Teacher 0.2913296 1.0000000

老師和家長的的相關為0.2913

1.6 missing data

sum(is.na(dta[,4]) | is.na(dta[,5]))/dim(dta)[1]

## [1] 0.4290284

42.9%的missing data

1.7 stack the parents&teachers report

call them informants

dtaL <- dta %>%
        gather(Informant, Report, 4:5)

# sort according to child id
dtaL <- dtaL[order(dtaL$ID), ]

# get rid of annoying row names
rownames(dtaL) <- NULL

# have a look
head(dtaL)

##   ID Problem Single Informant Report
## 1  1       Y      Y    Parent      0
## 2  1       Y      Y   Teacher     NA
## 3  2       N      N    Parent      1
## 4  2       N      N   Teacher     NA
## 5  3       Y      N    Parent      0
## 6  3       Y      N   Teacher     NA

1.8 prameter estimate

#
summary(m1 <- gee(Report ~ Informant + Problem + Single + Informant:Problem,
        data = dtaL, id = ID, corstr = "exchangeable", family = binomial))

## Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27

## running glm to get initial regression estimate

##               (Intercept)          InformantTeacher                  ProblemY 
##                -2.1595398                 0.4712270                 0.5996091 
##                   SingleY InformantTeacher:ProblemY 
##                 0.6332404                -0.4247098

## 
##  GEE:  GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
##  gee S-function, version 4.13 modified 98/01/27 (1998) 
## 
## Model:
##  Link:                      Logit 
##  Variance to Mean Relation: Binomial 
##  Correlation Structure:     Exchangeable 
## 
## Call:
## gee(formula = Report ~ Informant + Problem + Single + Informant:Problem, 
##     id = ID, data = dtaL, family = binomial, corstr = "exchangeable")
## 
## Summary of Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.2841247 -0.1765046 -0.1570826 -0.1039640  0.8960360 
## 
## 
## Coefficients:
##                             Estimate Naive S.E.    Naive z Robust S.E.
## (Intercept)               -2.1539359 0.09016552 -23.888688  0.08888885
## InformantTeacher           0.4738391 0.11498088   4.121025  0.11798848
## ProblemY                   0.5995711 0.11284664   5.313149  0.11278090
## SingleY                    0.6137249 0.10596325   5.791865  0.10759321
## InformantTeacher:ProblemY -0.4572920 0.15701454  -2.912418  0.15711252
##                             Robust z
## (Intercept)               -24.231790
## InformantTeacher            4.015977
## ProblemY                    5.316247
## SingleY                     5.704123
## InformantTeacher:ProblemY  -2.910602
## 
## Estimated Scale Parameter:  1.001019
## Number of Iterations:  2
## 
## Working Correlation
##           [,1]      [,2]
## [1,] 1.0000000 0.2681987
## [2,] 0.2681987 1.0000000

summary(m1a <- geeglm(Report ~ Informant + Problem + Single +
                      Informant:Problem + Single:Problem,
        data = dtaL, id = ID, corstr = "exchangeable", family = binomial))

## 
## Call:
## geeglm(formula = Report ~ Informant + Problem + Single + Informant:Problem + 
##     Single:Problem, family = binomial, data = dtaL, id = ID, 
##     corstr = "exchangeable")
## 
##  Coefficients:
##                           Estimate  Std.err    Wald Pr(>|W|)    
## (Intercept)               -2.16615  0.09410 529.882  < 2e-16 ***
## InformantTeacher           0.47515  0.11819  16.162 5.81e-05 ***
## ProblemY                   0.62177  0.12654  24.144 8.94e-07 ***
## SingleY                    0.65789  0.15949  17.014 3.71e-05 ***
## InformantTeacher:ProblemY -0.45906  0.15722   8.526   0.0035 ** 
## ProblemY:SingleY          -0.07945  0.21572   0.136   0.7127    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = exchangeable 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)        1 0.06415
##   Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha   0.2668 0.03029
## Number of clusters:   2501  Maximum cluster size: 2

theoretical Correlation =0.268 ,比0.291稍微低一點

sjPlot::tab_model(m1 <- geeglm(Report ~ Informant + Problem + Single + Informant:Problem, data=dtaL, 
                               id=ID, corstr="exchangeable", family=binomial), show.obs=F, show.ngroups = F)

	Report
Predictors	Odds Ratios	CI	p
(Intercept)	0.12	0.10 – 0.14	<0.001
Informant [Teacher]	1.61	1.27 – 2.02	<0.001
Problem [Y]	1.82	1.46 – 2.27	<0.001
Single [Y]	1.85	1.50 – 2.28	<0.001
Informant [Teacher] * Problem [Y]	0.63	0.47 – 0.86	0.004

sjPlot::tab_model(m1a <- geeglm(Report ~ Informant + Problem + Single +
                      Informant:Problem + Single:Problem, data=dtaL,
                               id=ID, corstr="exchangeable", family=binomial), show.obs=F, show.ngroups = F)

	Report
Predictors	Odds Ratios	CI	p
(Intercept)	0.11	0.10 – 0.14	<0.001
Informant [Teacher]	1.61	1.28 – 2.03	<0.001
Problem [Y]	1.86	1.45 – 2.39	<0.001
Single [Y]	1.93	1.41 – 2.64	<0.001
Informant [Teacher] * Problem [Y]	0.63	0.46 – 0.86	0.004
Problem [Y] * Single [Y]	0.92	0.61 – 1.41	0.713

geeglm可經由ln進行換算即與Gee顯示會相等

老師的report較高

# residual plot - not informative
plot(m1a, "pearson", which = 1)
grid()

1.9 fitted probaility

# set theme
ot <- theme_set(theme_bw())

# fortify data with model fits
dta_m1 <- data.frame(na.omit(dtaL), phat = fitted(m1a))

# compare observed with fitted
ggplot(dta_m1, aes(Problem, Report, color = Informant)) +
 geom_point(aes(Problem, phat),position = position_dodge(width=.2),
            pch = 1, size = rel(2)) +
 geom_hline(yintercept = c(mean(dta$Parent, na.rm = T),
                           mean(dta$Teacher, na.rm = T)), linetype = "dotted")+
 stat_summary(fun.data = "mean_cl_boot", position = position_dodge(width=.2)) +
 facet_wrap( ~ Single)+
 labs(x = "Physical Problem", y = "Probability of report") +
 theme(legend.position = c(.1, .9))

###

由圖可知老師report的level比較高，空心=model estimated，實心=從data裡算出來的。 model prediction 與raw data沒有差太多。

2 Exercise 2

2.1 load and see

dta2 <- read.table("D:/sheu/insomnia.txt", h=T)

#
head(dta2)

##   case treat occasion resp
## 1    1     1        0    1
## 2    1     1        1    1
## 3    2     1        0    1
## 4    2     1        1    1
## 5    3     1        0    1
## 6    3     1        1    1

2.2 model

logit(P{Yt <= j}) = αk + β1 time + β2 trt + β3 time × trt,

j = 1, 2, 3, 4; k = j - 1 for j > 1.

#
library(repolr)

## 
## Attaching package: 'repolr'

## The following objects are masked from 'package:geepack':
## 
##     ordgee, QIC

#
dta_repolr <- repolr(resp ~ treat + occasion + treat:occasion, 
                     subjects="case", data=dta2, times=c(1,2), categories=4, 
                     corr.mod="independence")
#
summary(dta_repolr$gee)

## Length  Class   Mode 
##      0   NULL   NULL

summary(dta_repolr)

## 
## repolr: 2016-02-26 version 3.4 
## 
## Call:
## repolr(formula = resp ~ treat + occasion + treat:occasion, subjects = "case", 
##     data = dta2, times = c(1, 2), categories = 4, corr.mod = "independence")
## 
## Coefficients: 
##                 coeff     se.robust  z.robust  p.value 
## cuts1|2          -2.2671    0.2091   -10.8422    0.0000
## cuts2|3          -0.9515    0.1769    -5.3787    0.0000
## cuts3|4           0.3517    0.1751     2.0086    0.0446
## treat             0.0336    0.2369     0.1418    0.8872
## occasion          1.0381    0.1564     6.6375    0.0000
## treat:occasion    0.7077    0.2458     2.8792    0.0040
## 
## Correlation Structure:  independence 
## Fixed Correlation:  0

Occasion effect = 1.04 for placebo, 1.04 + 0.71 = 1.75 for treatment.

Odds ratios: exp(1.04) = 2.8, exp(1.75) = 5.7

Treatment effect exp(0.03) = 1.03 initially, exp(0.03+0.71) = 2.1 at follow-up.

2.3 hand-made cumulative prop. against time to sleep

#
dta2$treat <- factor(ifelse(dta2$treat==1, "Active", "Placebo"))

#
dta2$occasion <- factor(ifelse(dta2$occasion==1, "Follow-up", "Initial"))

dta.tbl <- ftable(dta2$treat, dta2$occasion , dta2$resp)

dta.tbl

##                     1  2  3  4
##                               
## Active  Follow-up  40 49 19 11
##         Initial    12 20 40 47
## Placebo Follow-up  31 29 35 25
##         Initial    14 20 35 51

#
obsp <- as.numeric(t(prop.table(dta.tbl, m=1)))

#
clp <- c(cumsum(obsp[1:4]), cumsum(obsp[5:8]), cumsum(obsp[9:12]), 
         cumsum(obsp[13:16]))
#
yt <-rep(c(20,25,45,60),4)

#
y <- data.frame(time2sleep=yt, clprop=clp)

2.4 Cumulative proportion plot

plot(y[,1], y[,2], type="n",
     xlab="Time to Sleep (min.)", ylab="Cumulative proportion")

#
lines(y[1:4,  1],  y[1:4,2], lty=1, col="steelblue")
lines(y[5:8,  1],  y[5:8,2], lty=5, col="steelblue")
lines(y[9:12, 1],  y[9:12,2], lty=2, col="indianred")
lines(y[13:16,1],  y[13:16,2], lty=4, col="indianred")

#
text(22, .12, "T,I")
text(22, .47, "T,F")
text(22, .20, "P,I")
text(22, .33, "P,F")
#
grid()

由累積折線圖可知採用催眠藥物組和安慰劑組間起初皆以相似的時間入睡。但在兩週後，接受催眠藥物治療的人比起安慰劑組，能更迅速入眠。(TF>PF)

In class Exercise

tjlee

2020-11-23