This page is a supplementary material for the module Categorical Data Analysis (part I) in the course Biostatistics II in the master Metodologia Epidemilogica e Bostatistica per la Ricerca Clinica.
It containts the R code and the results presented in the slides as well as additional (hopefully) useful material. Refer to the lecture for interpretation of the results.
To reproduce the results you need to load the following packages
pacman::p_load(tidyverse, broom, waffle, Epi, gmodels, gridExtra, epiDisplay,
data.table, scales, aod, DescTools, ResourceSelection)
# theme for ggplot
theme_set(theme_classic())
# simulate date
set.seed(1234) # seed for reproducibility
Y <- rbinom(10, 1, .4)
Y
[1] 0 1 1 1 1 1 0 0 1 0
p <- seq(.1, .9, .01)
# define functions for likelihood and log likelihhod
lik <- function(p) prod((p^Y)*(1-p)^(1-Y))
loglik <- function(p) sum(Y*log(p) + (1-Y)*log(1-p))
max_lik <- data.frame(
p = p,
lik = sapply(p, lik),
loglik = sapply(p, loglik)
)
# showing results for selected values of p
filter(max_lik, p %in% seq(.1, .9, .1))
p lik loglik
1 0.1 0.0000006561 -14.236953
2 0.2 0.0000262144 -10.549202
3 0.3 0.0001750329 -8.650537
4 0.4 0.0005308416 -7.541047
5 0.5 0.0009765625 -6.931472
6 0.6 0.0011943936 -6.730117
7 0.8 0.0004194304 -7.776613
8 0.9 0.0000531441 -9.842503
ggplot(max_lik, aes(x = p, y = loglik)) + geom_line() +
geom_hline(yintercept = max(max_lik$loglik), linetype = "dotted") +
geom_vline(xintercept = .6, linetype = "dotted") +
scale_x_continuous(breaks = seq(0, 1, .1)) +
labs(y = "Log likelihood")
# Manually
a = 50
n = 500
p = a/n
se = sqrt(p*(1-p)/n)
data.frame(p = p, low = p - qnorm(.975)*se, upp = p + qnorm(.975)*se)
p low upp
1 0.1 0.07370432 0.1262957
# Using dedicated functions
prop.test(x = 50, n = 500)
1-sample proportions test with continuity correction
data: 50 out of 500, null probability 0.5
X-squared = 318.4, df = 1, p-value < 2.2e-16
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
0.07580034 0.13052865
sample estimates:
p
0.1
binom.test(x = 50, n = 500) # exact binomial distribuion
Exact binomial test
data: 50 and 500
number of successes = 50, number of trials = 500, p-value <
2.2e-16
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.07513582 0.12970909
sample estimates:
probability of success
0.1
load(url("http://alecri.github.io/downloads/data/marathon.Rdata"))
glimpse(marathon)
Observations: 488
Variables: 18
$ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ na <dbl> 138, 142, 151, 139, 145, 140, 142, 140, 141, 138, 141,…
$ nas135 <fct> na > 135, na > 135, na > 135, na > 135, na > 135, na >…
$ female <fct> female, male, male, male, female, female, male, male, …
$ age <dbl> 24.20534, 44.28200, 41.96304, 28.19713, 30.18207, 28.2…
$ urinat3p <fct> >=3, <3, <3, >=3, <3, <3, <3, <3, <3, <3, <3, <3, <3, …
$ prewt <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ postwt <dbl> NA, NA, NA, NA, 50.68182, 55.68182, 59.31818, 59.77273…
$ wtdiff <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ height <dbl> 1.72720, NA, NA, 1.72720, NA, 1.60655, NA, NA, NA, NA,…
$ bmi <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ runtime <dbl> NA, 161, 156, 346, 185, 233, 183, 162, 182, 190, 169, …
$ trainpace <dbl> 480, 430, 360, 630, NA, NA, 435, 450, 435, 440, 410, 4…
$ prevmara <dbl> 3, 40, 40, 1, 3, 25, 19, 2, 80, 10, 16, 3, 2, 8, 4, 3,…
$ fluidint <fct> Every mile, Every mile, Every other mile, Every mile, …
$ waterload <fct> Yes, Yes, NA, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, N…
$ nsaid <fct> Yes, Yes, NA, No, Yes, No, Yes, No, Yes, Yes, No, Yes,…
$ wtdiffc <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# Descriptives
summary(marathon$nas135)
na > 135 na <= 135
426 62
tab_na <- table(marathon$nas135)
prop.table(tab_na)
na > 135 na <= 135
0.8729508 0.1270492
# absolute frequency plot
qplot(nas135, data = marathon) +
theme_classic() +
labs(x = "", y = "Frequency")
# percentage frequency plot
ggplot(marathon, aes(nas135)) +
geom_bar(aes(y = ..count../sum(..count..))) +
labs(x = "", y = "Frequency (%)")
# Risk of hyponatrimia with 95% CIs
binom.test(tab_na[2:1])
Exact binomial test
data: tab_na[2:1]
number of successes = 62, number of trials = 488, p-value <
2.2e-16
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.09881215 0.15989560
sample estimates:
probability of success
0.1270492
# probability can be taught as a mean of 0 and 1
p <- mean(marathon$nas135 == "na <= 135")
odds <- p/(1-p)
data.frame(p = p, odds = odds)
p odds
1 0.1270492 0.1455399
# waffle plot for the probability
n_p <- round(c(100*(1-p), p*100))
names(n_p) <- levels(marathon$nas135)
waffle(n_p, rows = 7, colors = c("blue", "red"),
use_glyph = "running", glyph_size = 5)
# waffle plot for the odds
n_odds <- round(c(100, odds*100))
names(n_odds) <- levels(marathon$nas135)
waffle(n_odds, rows = 7, colors = c("blue", "red"),
use_glyph = "running", glyph_size = 5)
stat.table(list(nas135, female), list(count(), percent(nas135)),
data = marathon, margin = T)
------------------------------------
---------female----------
nas135 male female Total
------------------------------------
na > 135 297 129 426
92.2 77.7 87.3
na <= 135 25 37 62
7.8 22.3 12.7
Total 322 166 488
100.0 100.0 100.0
------------------------------------
group_by(marathon, female) %>%
count(nas135) %>%
mutate(perc = n/sum(n)) %>%
ggplot(aes(x = nas135, y = perc, fill = female)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(x = "", y = "Frequency")
tabf <- with(marathon, table(nas135, female))
CrossTable(tabf, prop.r = FALSE, prop.c = FALSE,
prop.t = FALSE, prop.chisq = FALSE, expected = TRUE, chisq = TRUE)
Cell Contents
|-------------------------|
| N |
| Expected N |
|-------------------------|
Total Observations in Table: 488
| female
nas135 | male | female | Row Total |
-------------|-----------|-----------|-----------|
na > 135 | 297 | 129 | 426 |
| 281.090 | 144.910 | |
-------------|-----------|-----------|-----------|
na <= 135 | 25 | 37 | 62 |
| 40.910 | 21.090 | |
-------------|-----------|-----------|-----------|
Column Total | 322 | 166 | 488 |
-------------|-----------|-----------|-----------|
Statistics for All Table Factors
Pearson's Chi-squared test
------------------------------------------------------------
Chi^2 = 20.83654 d.f. = 1 p = 5.001948e-06
Pearson's Chi-squared test with Yates' continuity correction
------------------------------------------------------------
Chi^2 = 19.54746 d.f. = 1 p = 9.81313e-06
# test for independence
chisq.test(tabf)
Pearson's Chi-squared test with Yates' continuity correction
data: tabf
X-squared = 19.547, df = 1, p-value = 9.813e-06
with(marathon,
twoby2(exposure = relevel(female, 2), outcome = relevel(nas135, 2)))
2 by 2 table analysis:
------------------------------------------------------
Outcome : na <= 135
Comparing : female vs. male
na <= 135 na > 135 P(na <= 135) 95% conf. interval
female 37 129 0.2229 0.166 0.2925
male 25 297 0.0776 0.053 0.1124
95% conf. interval
Relative Risk: 2.8708 1.7914 4.6007
Sample Odds Ratio: 3.4074 1.9701 5.8936
Conditional MLE Odds Ratio: 3.3982 1.9037 6.1528
Probability difference: 0.1453 0.0790 0.2186
Exact P-value: 0
Asymptotic P-value: 0
------------------------------------------------------
logistic <- function(x){
exp(x)/(1 + exp(x))
}
ggplot(data.frame(x = c(-5, 5)), aes(x = x)) +
stat_function(fun = logistic, geom = "line") +
labs(x = "x", y = "logistic(x)")
mod0 <- glm(nas135 ~ 1, data = marathon, family = "binomial")
summary(mod0)
Call:
glm(formula = nas135 ~ 1, family = "binomial", data = marathon)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.5213 -0.5213 -0.5213 -0.5213 2.0313
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.9273 0.1359 -14.18 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 371.6 on 487 degrees of freedom
Residual deviance: 371.6 on 487 degrees of freedom
AIC: 373.6
Number of Fisher Scoring iterations: 4
ci.lin(mod0) # 95% CIs for coefficients the log odds scale
Estimate StdErr z P 2.5% 97.5%
(Intercept) -1.927305 0.1359277 -14.1789 1.237783e-45 -2.193718 -1.660892
ci.exp(mod0) # 95% CIs for coefficients the odds scale
exp(Est.) 2.5% 97.5%
(Intercept) 0.1455399 0.1115014 0.1899695
# from p to log odds and viceversa
logit <- function(x) log(x/(1-x))
invlogit <- function(x) exp(x)/(1 + exp(x))
# predicted log odds of hyponatremia
coef(mod0)
(Intercept)
-1.927305
predict(mod0, newdata = data.frame(id = 1), se.fit = T, type = "link")
$fit
1
-1.927305
$se.fit
[1] 0.1359277
$residual.scale
[1] 1
# predicted odds of hyponatremia
exp(coef(mod0))
(Intercept)
0.1455399
ci.exp(mod0)
exp(Est.) 2.5% 97.5%
(Intercept) 0.1455399 0.1115014 0.1899695
# predicted risk of hyponatremia
invlogit(coef(mod0))
(Intercept)
0.1270492
predict(mod0, newdata = data.frame(id = 1), se.fit = T)
$fit
1
-1.927305
$se.fit
[1] 0.1359277
$residual.scale
[1] 1
ci.pred(mod0, newdata = data.frame(id = 1))
Estimate 2.5% 97.5%
1 0.1270492 0.100316 0.1596424
mod <-glm(nas135 ~ female, data = marathon, family = "binomial")
summary(mod)
Call:
glm(formula = nas135 ~ female, family = "binomial", data = marathon)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.7102 -0.7102 -0.4020 -0.4020 2.2608
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.4749 0.2082 -11.884 < 2e-16 ***
femalefemale 1.2260 0.2795 4.386 1.16e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 371.60 on 487 degrees of freedom
Residual deviance: 351.93 on 486 degrees of freedom
AIC: 355.93
Number of Fisher Scoring iterations: 5
# display coefficients with CI on the log odds scale
ci.lin(mod)
Estimate StdErr z P 2.5%
(Intercept) -2.474856 0.2082474 -11.884215 1.429722e-32 -2.8830136
femalefemale 1.225962 0.2795459 4.385547 1.156946e-05 0.6780619
97.5%
(Intercept) -2.066699
femalefemale 1.773862
# display coefficients with CI on the odds scale
ci.exp(mod)
exp(Est.) 2.5% 97.5%
(Intercept) 0.08417508 0.05596585 0.126603
femalefemale 3.40744186 1.97005581 5.893569
# predict log odds
predict(mod, type = "link") %>% head()
1 2 3 4 5 6
-1.248894 -2.474856 -2.474856 -2.474856 -1.248894 -1.248894
logodds <- ci.lin(mod, rbind(c(1, 0), c(1, 1)))
logodds
Estimate StdErr z P 2.5% 97.5%
[1,] -2.474856 0.2082474 -11.884215 1.429722e-32 -2.883014 -2.0666990
[2,] -1.248894 0.1864912 -6.696801 2.130307e-11 -1.614411 -0.8833785
# calculate odds in the two groups with 95% CI
ci.exp(mod, rbind(c(1, 0), c(1, 1)))
exp(Est.) 2.5% 97.5%
[1,] 0.08417508 0.05596585 0.1266030
[2,] 0.28682171 0.19900795 0.4133839
# predicted probabilities
predict(mod, type = "response") %>% head()
1 2 3 4 5 6
0.22289157 0.07763975 0.07763975 0.07763975 0.22289157 0.22289157
ci.pred(mod, data.frame(female = levels(marathon$female)))
Estimate 2.5% 97.5%
1 0.07763975 0.05299968 0.1123759
2 0.22289157 0.16597717 0.2924782
invlogit(logodds[, c(1, 5:6)])
Estimate 2.5% 97.5%
[1,] 0.07763975 0.05299968 0.1123759
[2,] 0.22289157 0.16597717 0.2924782
# plot observed and predicted probabilities
p <- ci.pred(mod, newdata = data.frame(female = c("male", "female")))
ggplot(data.frame(p), aes(levels(marathon$female) , Estimate)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = X2.5., ymax = X97.5.), width = 0.25) +
labs(x = "", y = "Probability")
logistic <- function(x, a, b){exp(a + b*x)/(1 + exp(a + b*x))}
grid.arrange(
ggplot(data.frame(x = c(-6, 6)), aes(x = x)) +
stat_function(fun = function(x, a, b) a + b*x, geom = "line", args = list(a = -2, b = 1)) +
labs(x = "x", y = "log(odds)", title = "log(odds) = -2 + 1x"),
ggplot(data.frame(x = c(-6, 6)), aes(x = x)) +
stat_function(fun = function(x, a, b) a, geom = "line", args = list(a = -2, b = 0)) +
labs(x = "x", y = "log(odds)", title = "log(odds) = -2"),
ggplot(data.frame(x = c(-6, 6)), aes(x = x), title = "log(odds) = -2 - 1x") +
stat_function(fun = function(x, a, b) a + b*x, geom = "line", args = list(a = -2, b = -1)) +
labs(x = "x", y = "log(odds)", title = "log(odds) = -2 - 1x"),
ggplot(data.frame(x = c(-6, 6)), aes(x = x)) +
stat_function(fun = function(x, a, b) exp(a + b*x), geom = "line", args = list(a = -2, b = 1)) +
labs(x = "x", y = "odds", title = "odds = exp(-2 + 1x)"),
ggplot(data.frame(x = c(-6, 6)), aes(x = x)) +
stat_function(fun = function(x, a, b) exp(a), geom = "line", args = list(a = -2, b = 0)) +
labs(x = "x", y = "odds", title = "odds = exp(-2)"),
ggplot(data.frame(x = c(-6, 6)), aes(x = x), title = "odds = exp(-2 - 1x)") +
stat_function(fun = function(x, a, b) exp(a + b*x), geom = "line", args = list(a = -2, b = -1)) +
labs(x = "x", y = "odds", title = "log(odds) = -2 - 1x"),
ggplot(data.frame(x = c(-6, 6)), aes(x = x), title = "log(odds) = -2 + 1x") +
stat_function(fun = logistic, geom = "line", args = list(a = -2, b = 1)) +
labs(x = "x", y = "probability", title = "p = invlogit(-2 + 1x)"),
ggplot(data.frame(x = c(-6, 6)), aes(x = x), title = "log(odds) = -2") +
stat_function(fun = logistic, geom = "line", args = list(a = -2, b = 0)) +
labs(x = "x", y = "probability", title = "p = invlogit(-2)"),
ggplot(data.frame(x = c(-6, 6)), aes(x = x)) +
stat_function(fun = logistic, geom = "line", args = list(a = -2, b = -1)) +
labs(x = "x", y = "probability", title = "p = invlogit(-2 - 1x)"),
nrow = 3
)
mod <- glm(nas135 ~ wtdiff, data = marathon, family = "binomial")
summary(mod)
Call:
glm(formula = nas135 ~ wtdiff, family = "binomial", data = marathon)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6634 -0.5317 -0.3584 -0.2332 2.7314
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.8850 0.1587 -11.880 < 2e-16 ***
wtdiff 0.7284 0.1103 6.603 4.02e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 343.35 on 454 degrees of freedom
Residual deviance: 288.94 on 453 degrees of freedom
(33 observations deleted due to missingness)
AIC: 292.94
Number of Fisher Scoring iterations: 5
# display coefficients with CI on the log odds scale
ci.lin(mod)
Estimate StdErr z P 2.5%
(Intercept) -1.8850337 0.1586757 -11.879786 1.507524e-32 -2.1960324
wtdiff 0.7284087 0.1103104 6.603262 4.022068e-11 0.5122043
97.5%
(Intercept) -1.5740350
wtdiff 0.9446132
# display coefficients with CI on the odds scale
ci.exp(mod)
exp(Est.) 2.5% 97.5%
(Intercept) 0.1518239 0.1112437 0.2072074
wtdiff 2.0717812 1.6689660 2.5718185
# OR with 95% CI
ci.exp(mod)[2, ]
exp(Est.) 2.5% 97.5%
2.071781 1.668966 2.571818
marathon$logodds <- predict(mod, marathon, type = "link")
grid.arrange(
ggplot(marathon, aes(wtdiff, logodds)) +
geom_line() +
labs(y = "Log odds", x = "Weight change (kg)"),
ggplot(marathon, aes(wtdiff, exp(logodds))) +
geom_line() +
labs(y = "Odds", x = "Weight change (kg)"),
nrow = 1)
tibble(
wtdiff = -4:3,
p = predict(mod, data.frame(wtdiff), type = "response"),
odds = exp(predict(mod, data.frame(wtdiff))),
rr = c(NA, p[-1]/p[-length(wtdiff)]),
or = c(NA, odds[-1]/odds[-length(wtdiff)])
)
# A tibble: 8 x 5
wtdiff p odds rr or
<int> <dbl> <dbl> <dbl> <dbl>
1 -4 0.00817 0.00824 NA NA
2 -3 0.0168 0.0171 2.05 2.07
3 -2 0.0342 0.0354 2.04 2.07
4 -1 0.0683 0.0733 2.00 2.07
5 0 0.132 0.152 1.93 2.07
6 1 0.239 0.315 1.82 2.07
7 2 0.395 0.652 1.65 2.07
8 3 0.574 1.35 1.46 2.07
ci.exp(mod, cbind(0, 3))
exp(Est.) 2.5% 97.5%
[1,] 8.89266 4.648817 17.01065
odds <- data.frame(ci.exp(mod, ctr.mat = cbind(1, seq(-4, 2.5, .1))))
ggplot(odds, aes(x = seq(-4, 2.5, .1), y = exp.Est..)) +
geom_line() +
geom_ribbon(aes(ymin = X2.5., ymax = X97.5.), alpha = .1) +
labs(x = "Weight change (kg)", y = "odds") +
scale_y_continuous(trans = "log", breaks = c(.005, .01, .025, .05, .15, .5, 1, 2))
or <- data.frame(ci.exp(mod, ctr.mat = cbind(0, seq(-4, 2.5, .1))))
ggplot(or, aes(x = seq(-4, 2.5, .1), y = exp.Est..)) +
geom_line() +
geom_ribbon(aes(ymin = X2.5., ymax = X97.5.), alpha = .1) +
labs(x = "Weight change (kg)", y = "OR") +
scale_y_continuous(trans = "log", breaks = c(.1, .25, .5, 1, 2, 4, 8)) +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_hline(yintercept = 1, linetype = "dotted")
ci.pred(mod, data.frame(wtdiff = 1))
Estimate 2.5% 97.5%
1 0.2392811 0.184743 0.3039175
tibble(
wtdiff = -4:3,
p = predict(mod, data.frame(wtdiff), type = "response"),
odds = exp(predict(mod, data.frame(wtdiff))),
rr = c(p/p[wtdiff == 0]),
or = c(odds/odds[wtdiff == 0])
)
# A tibble: 8 x 5
wtdiff p odds rr or
<int> <dbl> <dbl> <dbl> <dbl>
1 -4 0.00817 0.00824 0.0620 0.0543
2 -3 0.0168 0.0171 0.127 0.112
3 -2 0.0342 0.0354 0.259 0.233
4 -1 0.0683 0.0733 0.518 0.483
5 0 0.132 0.152 1 1
6 1 0.239 0.315 1.82 2.07
7 2 0.395 0.652 2.99 4.29
8 3 0.574 1.35 4.36 8.89
p <- data.frame(ci.pred(mod, marathon))
ggplot(p, aes(x = marathon$wtdiff, y = Estimate)) +
geom_line() +
geom_ribbon(aes(ymin = X2.5., ymax = X97.5.), alpha = .1) +
labs(x = "Weight change (kg)", y = "Probability")
ggplot(marathon, aes(x = wtdiff, y = as.numeric(nas135)-1)) +
geom_point() +
geom_line(aes(y = predict(mod, marathon, type = "response"))) +
labs(x = "Weight change (kg)", y = "Predicted outcome")
marathon$wtdiffcat <- cut(marathon$wtdiff, 10)
pred <- na.omit(marathon, wtdiff) %>%
group_by(wtdiffcat) %>%
summarise(
wtdiff = mean(wtdiff),
pobs = mean(nas135 == "na <= 135")
) %>%
ungroup() %>%
cbind(
ci.pred(mod, data.frame(wtdiff = .$wtdiff))
)
ggplot(pred, aes(wtdiff, Estimate, ymin = `2.5%`, ymax = `97.5%`)) +
geom_line(col = "blue") +
geom_point(aes(y = pobs)) +
labs(x = "Weight change (kg)", y = "Probability") +
geom_ribbon(alpha = .2) +
geom_vline(xintercept = unique(pred$wtdiff), linetype = "dotted")
stat.table(list(nas135, wtdiffc),
list(count(), percent(nas135)),
data = marathon)
--------------------------------------------------------------------
-------------------------wtdiffc-------------------------
nas135 3.0 to 2.0 to 1.0 to 0.0 to -1.0 to -2.0 to -5.0 to
4.9 2.9 1.9 0.9 -0.1 -1.1 -2.1
--------------------------------------------------------------------
na > 135 2 9 28 78 100 93 83
28.6 56.2 71.8 81.2 91.7 93.9 98.8
na <= 135 5 7 11 18 9 6 1
71.4 43.8 28.2 18.8 8.3 6.1 1.2
--------------------------------------------------------------------
marathon$wtdiffc_nl <- marathon$wtdiffc
levels(marathon$wtdiffc_nl) <- 1:7
suppressWarnings(with(marathon, cc(nas135, wtdiffc_nl)))
wtdiffc_nl
nas135 1 2 3 4 5 6 7
na > 135 2 9 28 78 100 93 83
na <= 135 5 7 11 18 9 6 1
Odds ratio 1 0.33 0.16 0.1 0.04 0.03 0.01
lower 95% CI 0.02 0.01 0.01 0 0 0
upper 95% CI 2.79 1.19 0.64 0.27 0.21 0.08
Chi-squared = 63.37 , 6 d.f., P value = 0
Error in fisher.test(cctable): FEXACT error 7(location). LDSTP=18570 is too small for this problem,
(pastp=56.6985, ipn_0:=ipoin[itp=56]=1239, stp[ipn_0]=67.8111).
Increase workspace or consider using 'simulate.p.value=TRUE'
mod_cat <- glm(nas135 ~ relevel(wtdiffc, 5), data = marathon, family = "binomial")
summary(mod_cat)
Call:
glm(formula = nas135 ~ relevel(wtdiffc, 5), family = "binomial",
data = marathon)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5829 -0.6444 -0.3536 -0.1548 2.9769
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.4079 0.3480 -6.919 4.54e-12 ***
relevel(wtdiffc, 5)3.0 to 4.9 3.3242 0.9062 3.669 0.000244 ***
relevel(wtdiffc, 5)2.0 to 2.9 2.1566 0.6124 3.521 0.000429 ***
relevel(wtdiffc, 5)1.0 to 1.9 1.4736 0.4977 2.961 0.003069 **
relevel(wtdiffc, 5)0.0 to 0.9 0.9416 0.4353 2.163 0.030532 *
relevel(wtdiffc, 5)-2.0 to -1.1 -0.3329 0.5464 -0.609 0.542343
relevel(wtdiffc, 5)-5.0 to -2.1 -2.0109 1.0645 -1.889 0.058884 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 342.00 on 449 degrees of freedom
Residual deviance: 287.61 on 443 degrees of freedom
(38 observations deleted due to missingness)
AIC: 301.61
Number of Fisher Scoring iterations: 7
mod_0 <- glm(nas135 ~ 1, data = subset(marathon, !is.na(wtdiffc)), family = "binomial")
round(logLik(mod_cat), 3)
'log Lik.' -143.805 (df=7)
round(logLik(mod_0), 3)
'log Lik.' -171 (df=1)
lrtest(mod_cat, mod_0)
Likelihood ratio test for MLE method
Chi-squared 6 d.f. = 54.39069 , P value = 6.15293e-10
ci.exp(mod_cat)
exp(Est.) 2.5% 97.5%
(Intercept) 0.0900000 0.04550045 0.1780202
relevel(wtdiffc, 5)3.0 to 4.9 27.7777778 4.70305087 164.0647655
relevel(wtdiffc, 5)2.0 to 2.9 8.6419753 2.60198928 28.7025538
relevel(wtdiffc, 5)1.0 to 1.9 4.3650794 1.64559958 11.5787085
relevel(wtdiffc, 5)0.0 to 0.9 2.5641026 1.09246189 6.0181705
relevel(wtdiffc, 5)-2.0 to -1.1 0.7168459 0.24566698 2.0917260
relevel(wtdiffc, 5)-5.0 to -2.1 0.1338688 0.01661794 1.0784045
data.frame(
wtdiffc = levels(marathon$wtdiffc),
ci.pred(mod_cat, data.frame(wtdiffc = levels(marathon$wtdiffc)))
)
wtdiffc Estimate X2.5. X97.5.
1 3.0 to 4.9 0.71428571 0.326615514 0.92798320
2 2.0 to 2.9 0.43750000 0.224602534 0.67621133
3 1.0 to 1.9 0.28205128 0.163591273 0.44105730
4 0.0 to 0.9 0.18750000 0.121442070 0.27811530
5 -1.0 to -0.1 0.08256881 0.043520257 0.15111814
6 -2.0 to -1.1 0.06060606 0.027480862 0.12838882
7 -5.0 to -2.1 0.01190476 0.001674486 0.07965027
tibble(
wtdiffc = levels(marathon$wtdiffc),
p = predict(mod_cat, data.frame(wtdiffc), type = "response"),
odds = exp(predict(mod_cat, data.frame(wtdiffc))),
rr = c(p/p[5]),
or = c(odds/odds[5])
)
# A tibble: 7 x 5
wtdiffc p odds rr or
<chr> <dbl> <dbl> <dbl> <dbl>
1 3.0 to 4.9 0.714 2.50 8.65 27.8
2 2.0 to 2.9 0.437 0.778 5.30 8.64
3 1.0 to 1.9 0.282 0.393 3.42 4.37
4 0.0 to 0.9 0.187 0.231 2.27 2.56
5 -1.0 to -0.1 0.0826 0.0900 1 1
6 -2.0 to -1.1 0.0606 0.0645 0.734 0.717
7 -5.0 to -2.1 0.0119 0.0120 0.144 0.134
ctr <- model.matrix(~ relevel(wtdiffc, 5), marathon)
ctr[, 1] <- 0
or_cat <- data.frame(ci.exp(mod_cat, ctr.mat = ctr))
ggplot(subset(marathon, !is.na(wtdiffc)), aes(wtdiff, or_cat$exp.Est..)) +
geom_step() +
geom_step(aes(y = or_cat$X2.5.), lty = 2) +
geom_step(aes(y = or_cat$X97.5.), lty = 2) +
scale_y_continuous(trans = "log", breaks = c(0.2, .5, 1, 3, 9, 27)) +
xlim(-5, 3) + labs(x = "Weight change categories", y = "Odds ratio")
marathon$or_c <- exp(predict(mod_cat, marathon) - coef(mod_cat)[1])
marathon %>% group_by(wtdiffc) %>%
na.omit() %>%
summarise(orm = mean(or_c)) %>%
ggplot(aes(x = wtdiffc, y = orm)) + geom_bar(stat = "identity") +
scale_y_continuous(trans = "log", breaks = c(0.2, .5, 1, 3, 9, 27)) +
labs(x = "Weight change categories", y = "Odds ratio")
marathon$p_c <- predict(mod_cat, marathon, type = "response")
marathon %>%
group_by(wtdiffc) %>%
na.omit() %>%
summarise(pcm = mean(p_c)) %>%
ggplot(aes(x = wtdiffc, y = pcm)) + geom_bar(stat = "identity") +
labs(x = "Weight change categories", y = "Probability") +
theme_classic() + theme(axis.title = element_text(size = 12))
mod_r <- glm(nas135 ~ I(runtime/10), data = marathon, family = "binomial")
ci.exp(mod_r)
exp(Est.) 2.5% 97.5%
(Intercept) 0.003725353 0.0008215826 0.0168921
I(runtime/10) 1.167680463 1.0990425509 1.2406050
ggplot(marathon, aes(female, runtime)) +
geom_boxplot() +
labs(y = "Race duration (minutes)", x = "")
lm(runtime ~ female, data = marathon) %>%
ci.lin()
Estimate StdErr z P 2.5% 97.5%
(Intercept) 217.71384 2.250627 96.734733 0.000000e+00 213.30269 222.12499
femalefemale 23.49371 3.898201 6.026809 1.672289e-09 15.85338 31.13404
with(marathon, twoby2(table(female, nas135)[2:1, 2:1]))
2 by 2 table analysis:
------------------------------------------------------
Outcome : na <= 135
Comparing : female vs. male
na <= 135 na > 135 P(na <= 135) 95% conf. interval
female 37 129 0.2229 0.166 0.2925
male 25 297 0.0776 0.053 0.1124
95% conf. interval
Relative Risk: 2.8708 1.7914 4.6007
Sample Odds Ratio: 3.4074 1.9701 5.8936
Conditional MLE Odds Ratio: 3.3982 1.9037 6.1528
Probability difference: 0.1453 0.0790 0.2186
Exact P-value: 0
Asymptotic P-value: 0
------------------------------------------------------
mod_mc <- glm(nas135 ~ I((runtime-225.5)/10) + female, data = marathon, family = "binomial")
summary(mod_mc)
Call:
glm(formula = nas135 ~ I((runtime - 225.5)/10) + female, family = "binomial",
data = marathon)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.2157 -0.5714 -0.3668 -0.2899 2.6427
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.51589 0.21913 -11.481 < 2e-16 ***
I((runtime - 225.5)/10) 0.14214 0.03295 4.314 1.6e-05 ***
femalefemale 0.96384 0.29105 3.312 0.000928 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 360.90 on 476 degrees of freedom
Residual deviance: 324.48 on 474 degrees of freedom
(11 observations deleted due to missingness)
AIC: 330.48
Number of Fisher Scoring iterations: 5
wald.test(vcov(mod_mc), coef(mod_mc), Terms = 2:3)
Wald test:
----------
Chi-squared test:
X2 = 32.4, df = 2, P(> X2) = 9.2e-08
Alternatevely, likelihood ratio test (OBS the two models must be fitted using the same set of observations).
mod_0 <- glm(nas135 ~ 1, family = "binomial",
data = filter(marathon, !is.na(runtime)))
anova(mod_mc, mod_0, test = "LRT")
Analysis of Deviance Table
Model 1: nas135 ~ I((runtime - 225.5)/10) + female
Model 2: nas135 ~ 1
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 474 324.48
2 476 360.90 -2 -36.416 1.237e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ci.exp(mod_mc)
exp(Est.) 2.5% 97.5%
(Intercept) 0.08079135 0.05258247 0.1241334
I((runtime - 225.5)/10) 1.15273361 1.08064959 1.2296259
femalefemale 2.62173527 1.48200104 4.6379831
runtime <- seq(180, 380, 2)
ci.exp(mod_mc, ctr.mat = cbind(0, (runtime - 222.5)/10, 0)) %>%
data.frame() %>%
ggplot(aes(runtime, `exp.Est..`, ymin = `X2.5.`, ymax = `X97.5.`)) +
geom_line() +
geom_ribbon(alpha = .2) +
scale_y_continuous(trans = "log", breaks = pretty_breaks()) +
labs(x = "Race duration (minutes)", y = "Sex-adjusted OR")
p_lodds <- ci.lin(mod_mc, ctr.mat = cbind(1, (240-225.5)/10, 1))
p_lodds
Estimate StdErr z P 2.5% 97.5%
[1,] -1.345951 0.1992313 -6.755723 1.421243e-11 -1.736438 -0.9554653
invlogit(p_lodds[c(1, 5, 6)])
[1] 0.206533 0.149766 0.277787
# use directly the ci.pred function
ci.pred(mod_mc, newdata = data.frame(runtime = 240, female = "female"))
Estimate 2.5% 97.5%
1 0.206533 0.149766 0.277787
pred_mc <- expand.grid(
runtime = runtime,
female = levels(marathon$female)
) %>%
mutate(
prob = predict(mod_mc, newdata = ., type = "response"),
logodds = predict(mod_mc, newdata = ., type = "link")
)
filter(pred_mc, runtime %in% c(180, 210, 240, 300))
runtime female prob logodds
1 180 male 0.04059747 -3.1626049
2 210 male 0.06087098 -2.7361964
3 240 male 0.09031557 -2.3097879
4 300 male 0.18893106 -1.4569709
5 180 female 0.09986113 -2.1987685
6 210 female 0.14524909 -1.7723600
7 240 female 0.20653305 -1.3459515
8 300 female 0.37915545 -0.4931344
gather(pred_mc, measure, pred, prob, logodds) %>%
ggplot(aes(runtime, pred, col = female)) +
geom_line() +
facet_grid(measure ~ ., scale = "free") +
labs(x = "Race duration (minutes)", y = "", female = "Sex")
mod_mi <- glm(nas135 ~ I((runtime - 225.5)/10)*female, data = marathon, family = "binomial")
summary(mod_mi)
Call:
glm(formula = nas135 ~ I((runtime - 225.5)/10) * female, family = "binomial",
data = marathon)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1505 -0.5874 -0.3611 -0.2798 2.6786
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.53576 0.22705 -11.169 < 2e-16
I((runtime - 225.5)/10) 0.15392 0.04299 3.581 0.000343
femalefemale 1.02285 0.32197 3.177 0.001489
I((runtime - 225.5)/10):femalefemale -0.02845 0.06657 -0.427 0.669123
(Intercept) ***
I((runtime - 225.5)/10) ***
femalefemale **
I((runtime - 225.5)/10):femalefemale
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 360.9 on 476 degrees of freedom
Residual deviance: 324.3 on 473 degrees of freedom
(11 observations deleted due to missingness)
AIC: 332.3
Number of Fisher Scoring iterations: 5
lrtest(mod_mi, mod_0)
Likelihood ratio test for MLE method
Chi-squared 3 d.f. = 36.5978 , P value = 5.597194e-08
ci.exp(mod_mi)
exp(Est.) 2.5% 97.5%
(Intercept) 0.07920132 0.05075406 0.123593
I((runtime - 225.5)/10) 1.16639792 1.07215212 1.268928
femalefemale 2.78111394 1.47964122 5.227345
I((runtime - 225.5)/10):femalefemale 0.97195141 0.85305855 1.107415
newd <- expand.grid(runtime = c(215.5, 225.5, 235.5),
female = levels(marathon$female))
tibble(
logodds = round(predict(mod_mi, newdata = newd), 3),
odds = round(exp(logodds), 3),
or = round(odds/odds[2], 3),
p = round(invlogit(logodds), 3)
)
# A tibble: 6 x 4
logodds odds or p
<dbl> <dbl> <dbl> <dbl>
1 -2.69 0.068 0.861 0.064
2 -2.54 0.079 1 0.073
3 -2.38 0.092 1.16 0.085
4 -1.64 0.194 2.46 0.163
5 -1.51 0.22 2.78 0.18
6 -1.39 0.25 3.16 0.2
pred_mi <- expand.grid(
runtime = runtime,
female = levels(marathon$female)
) %>%
mutate(
prob = predict(mod_mi, newdata = ., type = "response"),
logodds = predict(mod_mi, newdata = ., type = "link"),
odds = exp(logodds),
or = odds/odds[1]
) %>%
arrange(runtime)
filter(pred_mc, runtime %in% c(180, 210, 240, 300))
runtime female prob logodds
1 180 male 0.04059747 -3.1626049
2 210 male 0.06087098 -2.7361964
3 240 male 0.09031557 -2.3097879
4 300 male 0.18893106 -1.4569709
5 180 female 0.09986113 -2.1987685
6 210 female 0.14524909 -1.7723600
7 240 female 0.20653305 -1.3459515
8 300 female 0.37915545 -0.4931344
gather(pred_mi, measure, pred, prob, logodds) %>%
ggplot(aes(runtime, pred, col = female)) +
geom_line() +
facet_grid(measure ~ ., scale = "free") +
labs(x = "Race duration (minutes)", y = "", female = "Sex")
marathon$runtime_cat <- cut(marathon$runtime, 10)
pred <- na.omit(marathon, runtime) %>%
group_by(runtime_cat, female) %>%
summarise(
runtime = mean(runtime),
pobs = mean(nas135 == "na <= 135"),
logoddsobs = log(pobs/(1-pobs))
) %>%
ungroup() %>%
mutate(
logodds_mc = predict(mod_mc, newdata = ., type = "link"),
p_mc = predict(mod_mc, newdata = ., type = "response"),
logodds_mi = predict(mod_mi, newdata = ., type = "link"),
p_mi = predict(mod_mi, newdata = ., type = "response")
)
melt(setDT(pred), id = 1:5, measure = patterns("^logodds_", "^p_"),
value.name = c("logodds", "p"), variable.name = "model") %>%
mutate(model = factor(model, labels = c("Confounding", "Interaction"))) %>%
gather(measure, pred, logodds, p) %>%
mutate(obs = ifelse(measure == "logodds", logoddsobs, pobs)) %>%
ggplot(aes(runtime, pred, col = female)) +
geom_line() +
geom_point(aes(y = obs)) +
facet_grid(measure ~ model, scales = "free") +
labs(x = "Race duration (minutes)", y = "", col = "Sex")
mod1 <- glm(nas135 ~ wtdiff, data = filter(marathon, !is.na(wtdiff), !is.na(bmi),
!is.na(runtime), !is.na(prevmara), !is.na(age)), family = "binomial")
mod2 <- update(mod1, . ~ . + bmi + runtime)
mod3 <- update(mod2, . ~ . + female + age + prevmara)
mod4 <- glm(nas135 ~ height + prewt + age, data = filter(marathon, !is.na(wtdiff), !is.na(bmi),
!is.na(runtime), !is.na(prevmara), !is.na(age)), family = "binomial")
# compare fit of models
data.frame(
model = paste0("mod", 1:4),
logLik = round(sapply(list(mod1, mod2, mod3, mod4), logLik), 2),
k = sapply(list(mod1, mod2, mod3, mod4), function(x) length(coef(x))),
aic = round(sapply(list(mod1, mod2, mod3, mod4), AIC), 2)
) %>%
arrange(aic)
model logLik k aic
1 mod2 -131.50 4 270.99
2 mod3 -130.47 7 274.95
3 mod1 -141.64 2 287.27
4 mod4 -164.30 4 336.61
mod_b <- glm(nas135 ~ female + I(bmi - 23), data = marathon, family = "binomial")
ci.exp(mod_b)
exp(Est.) 2.5% 97.5%
(Intercept) 0.07908786 0.05085438 0.1229961
femalefemale 4.09024715 2.16876951 7.7141078
I(bmi - 23) 1.08789422 0.97116197 1.2186575
na.omit(marathon, bmi) %>%
mutate(
p = predict(mod_b, newdata = ., type = "response"),
logodds = predict(mod_b, newdata = ., type = "link")
) %>%
gather(measure, pred, p, logodds) %>%
ggplot(aes(bmi, pred, col = female)) +
geom_line() +
facet_grid(measure ~ ., scale = "free") +
labs(x = "Body-mass index (kg/m2)", y = "", female = "Sex")
(hl_b <- hoslem.test(mod_b$y, fitted(mod_b), g = 10))
Hosmer and Lemeshow goodness of fit (GOF) test
data: mod_b$y, fitted(mod_b)
X-squared = 20.496, df = 8, p-value = 0.008615
mod_b2 <- update(mod_b, . ~ . + I((bmi-23)^2))
summary(mod_b2)
Call:
glm(formula = nas135 ~ female + I(bmi - 23) + I((bmi - 23)^2),
family = "binomial", data = marathon)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4499 -0.6059 -0.3571 -0.3253 2.4404
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.92158 0.26184 -11.158 < 2e-16 ***
femalefemale 1.29091 0.33906 3.807 0.000141 ***
I(bmi - 23) -0.02717 0.05758 -0.472 0.637038
I((bmi - 23)^2) 0.04473 0.01085 4.124 3.73e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 357.62 on 464 degrees of freedom
Residual deviance: 320.93 on 461 degrees of freedom
(23 observations deleted due to missingness)
AIC: 328.93
Number of Fisher Scoring iterations: 5
# multivariate wald test
wald.test(vcov(mod_b2), coef(mod_b2), Terms = 2:3)
Wald test:
----------
Chi-squared test:
X2 = 21.0, df = 2, P(> X2) = 2.8e-05
# likelihood ratio test
modf <- glm(nas135 ~ female, data = filter(marathon, !is.na(bmi)),
family = "binomial")
lrtest(mod_b2, modf)
Likelihood ratio test for MLE method
Chi-squared 2 d.f. = 18.45288 , P value = 9.840307e-05
bmi <- seq(20, 26, .1)
ci.exp(mod_b2, ctr.mat = cbind(0, 0, bmi - 23, (bmi - 23)^2)) %>%
data.frame() %>%
ggplot(aes(bmi, `exp.Est..`, ymin = `X2.5.`, ymax = `X97.5.`)) +
geom_line() +
geom_ribbon(alpha = .2) +
scale_y_continuous(trans = "log", breaks = pretty_breaks()) +
labs(x = "Body-mass index (kg/m2)", y = "Sex-adjusted OR")
# log odds
p_lodds <- ci.lin(mod_b2, ctr.mat = cbind(1, 0, 25-23, (25-23)^2))
p_lodds
Estimate StdErr z P 2.5% 97.5%
[1,] -2.796986 0.2553433 -10.95383 6.369859e-28 -3.29745 -2.296523
# probability
invlogit(p_lodds[c(1, 5, 6)])
[1] 0.05748725 0.03565878 0.09141137
ci.pred(mod_b2, newdata = data.frame(female = "male", bmi = 25))
Estimate 2.5% 97.5%
1 0.05748725 0.03565878 0.09141137
bmi <- seq(20, 26, .1)
ci.exp(mod_b2, ctr.mat = cbind(0, 0, bmi - 23, (bmi - 23)^2)) %>%
data.frame() %>%
ggplot(aes(bmi, `exp.Est..`, ymin = `X2.5.`, ymax = `X97.5.`)) +
geom_line() +
geom_ribbon(alpha = .2) +
scale_y_continuous(trans = "log", breaks = pretty_breaks()) +
labs(x = "Body-mass index (kg/m2)", y = "Sex-adjusted OR")
expand.grid(bmi = bmi, female = levels(marathon$female)) %>%
mutate(prob = predict(mod_b2, newdata = ., type = "response")) %>%
ggplot(aes(bmi, prob, col = female)) +
geom_line() +
labs(x = "Body-mass index (kg/m2)", y = "Risk of hyponatremia", col = "Sex")
(hl_b2 <- hoslem.test(mod_b2$y, fitted(mod_b2), g = 10))
Hosmer and Lemeshow goodness of fit (GOF) test
data: mod_b2$y, fitted(mod_b2)
X-squared = 6.2402, df = 8, p-value = 0.6203
rbind(
cbind(model = 1, hl_b$observed, hl_b$expected),
cbind(model = 2, hl_b2$observed, hl_b2$expected)
) %>%
data.frame() %>%
mutate(
model = factor(model, labels = c("Linear", "Quadratic")),
p_obs = y1/(y0 + y1),
p_pred = yhat1/(yhat0 + yhat1)
) %>%
cbind(
BinomCI(x = .$y1, n = .$y0 + .$y1)
) %>%
ggplot(aes(p_pred, p_obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
geom_errorbar(aes(ymin = lwr.ci, ymax = upr.ci), col = "grey", width = .001) +
labs(x = "Predicted probabilities", y = "Observed proportions") +
facet_grid(. ~ model, scale = "free")