pacman::p_load(
  car,
  broom,
  tidyverse,
  ggfortify,
  mosaic,
  huxtable,
  jtools,
  latex2exp,
  pubh,
  sjlabelled,
  sjPlot,
  sjmisc
)

theme_set(sjPlot::theme_sjplot2(base_size = 10))
theme_update(legend.position = "top")
# options('huxtable.knit_print_df' = FALSE)
options('huxtable.knit_print_df_theme' = theme_article)
options('huxtable.autoformat_number_format' = list(numeric = "%5.2f"))
knitr::opts_chunk$set(collapse = TRUE, comment = NA)

Descriptive analysis

Onchocerciasis in Sierra Leone.

data(Oncho)
Oncho %>% head()

id	mf	area	agegrp	sex	mfload	lesions
1	Infected	Savannah	20-39	Female	1	No
2	Infected	Rainforest	40+	Male	3	No
3	Infected	Savannah	40+	Female	1	No
4	Not-infected	Rainforest	20-39	Female	0	No
5	Not-infected	Savannah	40+	Female	0	No
6	Not-infected	Rainforest	20-39	Female	0	No

A two-by-two contingency table:

Oncho %>%
  mutate(mf = relevel(mf, ref = "Infected")) %>%
  copy_labels(Oncho) %>%
  cross_tab(mf ~ area)

Infection	levels	Infected	Not-infected	Total
Total N (%)		822 (63.1)	480 (36.9)	1302
Residence	Savannah	281 (34.2)	267 (55.6)	548 (42.1)
	Rainforest	541 (65.8)	213 (44.4)	754 (57.9)

Table with all descriptive statistics except id and mfload:

Oncho %>%
  select(-c(id, mfload)) %>%
  mutate(mf = relevel(mf, ref = "Infected")) %>%
  copy_labels(Oncho) %>%
  cross_tab(mf ~ area + .)

Infection	levels	Infected	Not-infected	Total
Total N (%)		822 (63.1)	480 (36.9)	1302
Residence	Savannah	281 (34.2)	267 (55.6)	548 (42.1)
	Rainforest	541 (65.8)	213 (44.4)	754 (57.9)
Age group (years)	5-9	46 (5.6)	156 (32.5)	202 (15.5)
	10-19	99 (12.0)	119 (24.8)	218 (16.7)
	20-39	299 (36.4)	125 (26.0)	424 (32.6)
	40+	378 (46.0)	80 (16.7)	458 (35.2)
Sex	Male	426 (51.8)	190 (39.6)	616 (47.3)
	Female	396 (48.2)	290 (60.4)	686 (52.7)
Number of lesions	No	640 (77.9)	461 (96.0)	1101 (84.6)
	Yes	182 (22.1)	19 (4.0)	201 (15.4)

Data set about blood counts of T cells from patients in remission from Hodgkin’s disease or in remission from disseminated malignancies. We generate the new variable Ratio and add labels to the variables.

data(Hodgkin)
Hodgkin <- Hodgkin %>%
  mutate(Ratio = CD4/CD8) %>%
  var_labels(
    Ratio = "CD4+ / CD8+ T-cells ratio"
    )

Hodgkin %>% head()

CD4	CD8	Group	Ratio
396	836	Hodgkin	0.47
568	978	Hodgkin	0.58
1212	1678	Hodgkin	0.72
171	212	Hodgkin	0.81
554	670	Hodgkin	0.83
1104	1335	Hodgkin	0.83

Descriptive statistics for CD4+ T cells:

Hodgkin %>%
  estat(~ CD4)

	N	Min.	Max.	Mean	Median	SD	CV
CD4+ T-cells	40.00	116.00	2415.00	672.62	528.50	470.49	0.70

For stratification, estat recognises the following two syntaxes:

outcome ~ exposure: outcome | exposure where, outcome is continuous and exposure is a categorical (factor) variable.

Hodgkin %>%
  estat(~ Ratio|Group)

	Disease	N	Min.	Max.	Mean	Median	SD	CV
CD4+ / CD8+ T-cells ratio	Non-Hodgkin	20.00	1.10	3.49	2.12	2.15	0.73	0.34
	Hodgkin	20.00	0.47	3.82	1.50	1.19	0.91	0.61

descriptive statistics for all variables in the data set:

Hodgkin %>%
  mutate(Group = relevel(Group, ref = "Hodgkin")) %>%
  copy_labels(Hodgkin) %>%
  cross_tab(Group ~ CD4 + .)

Disease	levels	Hodgkin	Non-Hodgkin	Total
Total N (%)		20 (50.0)	20 (50.0)	40
CD4+ T-cells	Mean (SD)	823.2 (566.4)	522.0 (293.0)	672.6 (470.5)
CD8+ T-cells	Mean (SD)	613.9 (397.9)	260.0 (154.7)	436.9 (347.7)
CD4+ / CD8+ T-cells ratio	Mean (SD)	1.5 (0.9)	2.1 (0.7)	1.8 (0.9)

  # theme_article() %>%
  # add_footnote("Values are medians with interquartile range.")

Inferential statistics

var.test(Ratio ~ Group, data = Hodgkin)

    F test to compare two variances

data:  Ratio by Group
F = 0.6294, num df = 19, denom df = 19, p-value = 0.3214
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.2491238 1.5901459
sample estimates:
ratio of variances 
         0.6293991

QQ-plots against the standard Normal distribution.

Hodgkin %>%
  qq_plot(~ Ratio|Group) %>%
  axis_labs()

non-parametric test to compare the groups:

wilcox.test(Ratio ~ Group, data = Hodgkin)

    Wilcoxon rank sum test

data:  Ratio by Group
W = 298, p-value = 0.007331
alternative hypothesis: true location shift is not equal to 0

For relatively small samples (for example, less than 30 observations per group) is a standard practice to show the actual data in dot plots with error bars. The pubh package offers two options to show graphically differences in continuous outcomes among groups:

For small samples: strip_error For medium to large samples: bar_error

Hodgkin %>%
  strip_error(Ratio ~ Group) %>%
  axis_labs()

In the previous code, axis_labs applies labels from labelled data to the axis.

The error bars represent 95% confidence intervals around mean values.

Is relatively easy to add a line on top, to show that the two groups are significantly different. The function gf_star needs the reference point on how to draw an horizontal line to display statistical difference or to annotate a plot; in summary, gf_star:

Hodgkin %>%
  strip_error(Ratio ~ Group) %>%
  axis_labs() %>%
  gf_star(x1 = 1, y1 = 4, x2 = 2, y2 = 4.05, y3 = 4.1, "**")

For larger samples we could use bar charts with error bars. For example:

data(birthwt, package = "MASS")
birthwt <- birthwt %>%
  mutate(
    smoke = factor(smoke, labels = c("Non-smoker", "Smoker")),
    Race = factor(race > 1, labels = c("White", "Non-white")),
    race = factor(race, labels = c("White", "Afican American", "Other"))
    ) %>%
  var_labels(
    bwt = 'Birth weight (g)',
    smoke = 'Smoking status',
    race = 'Race',
  )

birthwt %>%
  bar_error(bwt ~ smoke) %>%
  axis_labs()
No summary function supplied, defaulting to `mean_se()`

Quick normality check:

birthwt %>%
  qq_plot(~ bwt|smoke) %>%
  axis_labs()

The (unadjusted) t-test:

t.test(bwt ~ smoke, data = birthwt)

    Welch Two Sample t-test

data:  bwt by smoke
t = 2.7299, df = 170.1, p-value = 0.007003
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
  78.57486 488.97860
sample estimates:
mean in group Non-smoker     mean in group Smoker 
                3055.696                 2771.919

Final plot with annotation to highlight statistical difference (unadjusted):

birthwt %>%
  bar_error(bwt ~ smoke) %>%
  axis_labs() %>%
  gf_star(x1 = 1, x2 = 2, y1 = 3400, y2 = 3500, y3 = 3550, "**")
No summary function supplied, defaulting to `mean_se()`

strip_error and bar_error can generate plots stratified by a third variable, for example:

birthwt %>%
  bar_error(bwt ~ smoke, fill = ~ Race) %>%
  gf_refine(ggsci::scale_fill_jama()) %>%
  axis_labs()
No summary function supplied, defaulting to `mean_se()`

birthwt %>%
  bar_error(bwt ~ smoke|Race) %>%
  axis_labs()
No summary function supplied, defaulting to `mean_se()`
No summary function supplied, defaulting to `mean_se()`

analysis of the linear model of smoking on birth weight, adjusting by race (and not by other potential confounders)

birthwt %>%
  strip_error(bwt ~ smoke, pch = ~ Race, col = ~ Race) %>%
  gf_refine(ggsci::scale_color_jama()) %>%
  axis_labs()

analysis of the linear model of smoking on birth weight, adjusting by race (and not by other potential confounders)

model_bwt <- lm(bwt ~ smoke + race, data = birthwt)

model_bwt %>%
  glm_coef(labels = model_labels(model_bwt))

Parameter	Coefficient	Pr(>\|t\|)
Constant	3334.95 (3036.11, 3633.78)	< 0.001
Smoking status: Smoker	-428.73 (-667.02, -190.44)	< 0.001
Race: Afican American	-450.36 (-750.31, -150.4)	0.003
Race: Other	-452.88 (-775.17, -130.58)	0.006

Similar results can be obtained with the function tab_model from the sjPlot package.

tab_model(model_bwt,  collapse.ci = TRUE)

	Birth weight(g)
Predictors	Estimates	p
(Intercept)	3334.95 (3153.89 – 3516.01)	<0.001
Smoking status: Smoker	-428.73 (-643.86 – -213.60)	<0.001
Race: Afican American	-450.36 (-752.45 – -148.27)	0.004
Race: Other	-452.88 (-682.67 – -223.08)	<0.001
Observations	189
R² / R² adjusted	0.123 / 0.109

multiple(model_bwt, ~ race)$df

contrast	estimate	SE	t.ratio	p.value	lower.CL	upper.CL
White - Afican American	450.36	153.12	2.94	0.01	89.87	810.85
White - Other	452.88	116.48	3.89	< 0.001	178.65	727.10
Afican American - Other	2.52	160.59	0.02	1	-375.57	380.60

Some advantages of glm_coef over tab_model are:

Script documents can be knitted to Word and LaTEX (besides HTML). Uses robust standard errors by default. The option to not use robust standard errors is part of the arguments. Recognises some type of models that tab_model does not recognise. Some advantages of tab_model over glm_coef are:

Recognises labels from variables and use those labels to display the table. Includes some statistics about the model. It can display more than one model on the same output. Tables are more aesthetically appealing. In the previous table of coefficients confidence intervals and p-values for race had not been adjusted for multiple comparisons. We use functions from the emmeans package to make the corrections.

multiple(model_bwt, ~ race)$fig_ci %>%
  gf_labs(x = "Difference in birth weights (g)")

multiple(model_bwt, ~ race)$fig_pval %>%
  gf_labs(y = " ")

data(Bernard)
head(Bernard)

id	treat	race	fate	apache	o2del	followup	temp0	temp10
1.00	Placebo	White	Dead	27.00	539.20	50.00	35.22	36.56
2.00	Ibuprofen	African American	Alive	14.00		720.00	38.67	37.56
3.00	Placebo	African American	Dead	33.00	551.12	33.00	38.33
4.00	Ibuprofen	White	Alive	3.00	1376.14	720.00	38.33	36.44
5.00	Placebo	White	Alive	5.00		720.00	38.56	37.56
6.00	Ibuprofen	White	Alive	13.00	1523.39	720.00	38.17	38.17

Bernard %>%
  mutate(
    fate = relevel(fate, ref = "Dead"),
    treat = relevel(treat, ref = "Ibuprofen")
  ) %>%
  copy_labels(Bernard) %>%
  cross_tab(fate ~ treat)

Mortality status	levels	Dead	Alive	Total
Total N (%)		176 (38.7)	279 (61.3)	455
Treatment	Ibuprofen	84 (47.7)	140 (50.2)	224 (49.2)
	Placebo	92 (52.3)	139 (49.8)	231 (50.8)

dat <- matrix(c(84, 140 , 92, 139), 
              nrow = 2, byrow = TRUE)
epiR::epi.2by2(as.table(dat))
             Outcome +    Outcome -      Total        Inc risk *        Odds
Exposed +           84          140        224              37.5       0.600
Exposed -           92          139        231              39.8       0.662
Total              176          279        455              38.7       0.631

Point estimates and 95% CIs:
-------------------------------------------------------------------
Inc risk ratio                               0.94 (0.75, 1.19)
Odds ratio                                   0.91 (0.62, 1.32)
Attrib risk *                                -2.33 (-11.27, 6.62)
Attrib risk in population *                  -1.15 (-8.88, 6.59)
Attrib fraction in exposed (%)               -6.20 (-33.90, 15.76)
Attrib fraction in population (%)            -2.96 (-15.01, 7.82)
-------------------------------------------------------------------
 Test that OR = 1: chi2(1) = 0.260 Pr>chi2 = 0.61
 Wald confidence limits
 CI: confidence interval
 * Outcomes per 100 population units

Bernard %>%
  contingency(fate ~ treat)
           Outcome
Predictor   Dead Alive
  Ibuprofen   84   140
  Placebo     92   139

             Outcome +    Outcome -      Total        Inc risk *        Odds
Exposed +           84          140        224              37.5       0.600
Exposed -           92          139        231              39.8       0.662
Total              176          279        455              38.7       0.631

Point estimates and 95% CIs:
-------------------------------------------------------------------
Inc risk ratio                               0.94 (0.75, 1.19)
Odds ratio                                   0.91 (0.62, 1.32)
Attrib risk *                                -2.33 (-11.27, 6.62)
Attrib risk in population *                  -1.15 (-8.88, 6.59)
Attrib fraction in exposed (%)               -6.20 (-33.90, 15.76)
Attrib fraction in population (%)            -2.96 (-15.01, 7.82)
-------------------------------------------------------------------
 Test that OR = 1: chi2(1) = 0.260 Pr>chi2 = 0.61
 Wald confidence limits
 CI: confidence interval
 * Outcomes per 100 population units 


    Pearson's Chi-squared test with Yates' continuity correction

data:  dat
X-squared = 0.17076, df = 1, p-value = 0.6794

data(oswego, package = "epitools")

oswego <- oswego %>%
  mutate(
    ill = factor(ill, labels = c("No", "Yes")),
    sex = factor(sex, labels = c("Female", "Male")),
    chocolate.ice.cream = factor(chocolate.ice.cream, labels = c("No", "Yes"))
  ) %>%
  var_labels(
    ill = "Developed illness",
    sex = "Sex",
    chocolate.ice.cream = "Consumed chocolate ice cream"
  )

oswego %>%
  mutate(
    ill = relevel(ill, ref = "Yes"),
    chocolate.ice.cream = relevel(chocolate.ice.cream, ref = "Yes")
  ) %>%
  copy_labels(oswego) %>%
  cross_tab(ill ~ sex + chocolate.ice.cream)

Developed illness	levels	Yes	No	Total
Total N (%)		46 (61.3)	29 (38.7)	75
Sex	Female	30 (65.2)	14 (48.3)	44 (58.7)
	Male	16 (34.8)	15 (51.7)	31 (41.3)
Consumed chocolate ice cream	Yes	25 (55.6)	22 (75.9)	47 (63.5)
	No	20 (44.4)	7 (24.1)	27 (36.5)

oswego %>%
  mhor(ill ~ sex/chocolate.ice.cream)

                                   OR Lower.CI Upper.CI Pr(>|z|)
sexFemale:chocolate.ice.creamYes 0.47     0.11     2.06    0.318
sexMale:chocolate.ice.creamYes   0.24     0.05     1.13    0.072

                          Common OR Lower CI Upper CI Pr(>|z|)
Cochran-Mantel-Haenszel:       0.35     0.12     1.01    0.085

Test for effect modification (interaction): p =  0.5434

data(Oncho)

odds_trend(mf ~ agegrp, data = Oncho, angle = 0, hjust = 0.5)$fig

Freq <- c(1739, 8, 51, 22)
BCG <- gl(2, 1, 4, labels=c("Negative", "Positive"))
Xray <- gl(2, 2, labels=c("Negative", "Positive"))
tb <- data.frame(Freq, BCG, Xray)
tb <- expand_df(tb)

diag_test(BCG ~ Xray, data = tb)
          Outcome +    Outcome -      Total
Test +           22           51         73
Test -            8         1739       1747
Total            30         1790       1820

Point estimates and 95 % CIs:
---------------------------------------------------------
Apparent prevalence                    0.04 (0.03, 0.05)
True prevalence                        0.02 (0.01, 0.02)
Sensitivity                            0.73 (0.54, 0.88)
Specificity                            0.97 (0.96, 0.98)
Positive predictive value              0.30 (0.20, 0.42)
Negative predictive value              1.00 (0.99, 1.00)
Positive likelihood ratio              25.74 (18.21, 36.38)
Negative likelihood ratio              0.27 (0.15, 0.50)
---------------------------------------------------------

diag_test2(22, 51, 8, 1739)

          Outcome +    Outcome -      Total
Test +           22           51         73
Test -            8         1739       1747
Total            30         1790       1820

Point estimates and 95 % CIs:
---------------------------------------------------------
Apparent prevalence                    0.04 (0.03, 0.05)
True prevalence                        0.02 (0.01, 0.02)
Sensitivity                            0.73 (0.54, 0.88)
Specificity                            0.97 (0.96, 0.98)
Positive predictive value              0.30 (0.20, 0.42)
Negative predictive value              1.00 (0.99, 1.00)
Positive likelihood ratio              25.74 (18.21, 36.38)
Negative likelihood ratio              0.27 (0.15, 0.50)
---------------------------------------------------------

Regression

data(birthwt, package = "MASS")
birthwt <- birthwt %>%
  mutate(smoke = factor(smoke, labels = c("Non-smoker", "Smoker")),
         race = factor(race, labels = c("White", "African American", "Other"))) %>%
  var_labels(bwt = 'Birth weight (g)',
             smoke = 'Smoking status',
             race = 'Race')

birthwt %>%
  group_by(race, smoke) %>%
  summarise(
    n = n(),
    Mean = mean(bwt, na.rm = TRUE),
    SD = sd(bwt, na.rm = TRUE),
    Median = median(bwt, na.rm = TRUE),
    CV = rel_dis(bwt)
  )
`summarise()` regrouping output by 'race' (override with `.groups` argument)
# A tibble: 6 x 7
# Groups:   race [3]
  race             smoke          n  Mean    SD Median    CV
  <fct>            <fct>      <int> <dbl> <dbl>  <dbl> <dbl>
1 White            Non-smoker    44 3429.  710.  3593  0.207
2 White            Smoker        52 2827.  626.  2776. 0.222
3 African American Non-smoker    16 2854.  621.  2920  0.218
4 African American Smoker        10 2504   637.  2381  0.254
5 Other            Non-smoker    55 2816.  709.  2807  0.252
6 Other            Smoker        12 2757.  810.  3146. 0.294

birthwt %>%
  gen_bst_df(bwt ~ race|smoke)

Birth weight (g)	LowerCI	UpperCI	Race	Smoking status
3428.75	3210.57	3621.60	White	Non-smoker
2826.85	2668.94	2992.67	White	Smoker
2854.50	2571.50	3131.50	African American	Non-smoker
2504.00	2124.40	2883.20	African American	Smoker
2815.78	2615.37	2988.95	Other	Non-smoker
2757.17	2254.79	3121.87	Other	Smoker

birthwt %>%
  bar_error(bwt ~ race, fill = ~ smoke) %>%
  axis_labs() %>%
  gf_labs(fill = "Smoking status:")
No summary function supplied, defaulting to `mean_se()`

model_norm <- lm(bwt ~ smoke + race, data = birthwt)

model_norm %>% Anova() %>% tidy()

term	sumsq	df	statistic	p.value
smoke	7322574.73	1.00	15.46	0.00
race	8712354.03	2.00	9.20	0.00
Residuals	87631355.83	185.00

model_norm %>% tidy()

term	estimate	std.error	statistic
(Intercept)	3334.95	91.78	36.34
smokeSmoker	-428.73	109.04	-3.93
raceAfrican American	-450.36	153.12	-2.94
raceOther	-452.88	116.48	-3.89

model_norm %>% 
  glm_coef(labels = model_labels(model_norm))

Parameter	Coefficient	Pr(>\|t\|)
Constant	3334.95 (3036.11, 3633.78)	< 0.001
Smoking status: Smoker	-428.73 (-667.02, -190.44)	< 0.001
Race: African American	-450.36 (-750.31, -150.4)	0.003
Race: Other	-452.88 (-775.17, -130.58)	0.006

Compare models

model_norm %>%
  glm_coef(se_rob = TRUE, labels = model_labels(model_norm))

Parameter	Coefficient	Pr(>\|t\|)
Constant	3334.95 (3036.11, 3633.78)	< 0.001
Smoking status: Smoker	-428.73 (-667.02, -190.44)	< 0.001
Race: African American	-450.36 (-750.31, -150.4)	0.003
Race: Other	-452.88 (-775.17, -130.58)	0.006

model_norm %>% glance()

r.squared	adj.r.squared	sigma	statistic	p.value	df	logLik	AIC	BIC	deviance	df.residual
0.12	0.11	688.25	8.68	0.00	4	-1501.11	3012.22	3028.43	87631355.83	185

model_norm %>%
  plot_model("pred", terms = ~race|smoke, dot.size = 1.5, title = "")

emmip(model_norm, smoke ~ race) %>%
  gf_labs(y = get_label(birthwt$bwt), x = "", col = "Smoking status")

Logistic regression

data(diet, package = "Epi")
diet <- diet %>%
  mutate(
    chd = factor(chd, labels = c("No CHD", "CHD"))
  ) %>%
  var_labels(
    chd = "Coronary Heart Disease",
    fibre = "Fibre intake (10 g/day)"
    )

diet %>% estat(~ fibre|chd)

	Coronary Heart Disease	N	Min.	Max.	Mean	Median	SD	CV
Fibre intake (10 g/day)	No CHD	288.00	0.60	5.35	1.75	1.69	0.58	0.33
	CHD	45.00	0.76	2.43	1.49	1.51	0.40	0.27

diet %>%
  gf_boxploth(chd ~ fibre, fill = "indianred3", alpha = 0.7) %>%
  axis_labs()
Warning: Removed 4 rows containing non-finite values (stat_boxploth).

model_binom <- glm(chd ~ fibre, data = diet, family = binomial)

model_binom %>%
  glm_coef(labels = model_labels(model_binom))

Parameter	Odds ratio	Pr(>\|z\|)
Constant	0.95 (0.3, 3.01)	0.93
Fibre intake (10 g/day)	0.33 (0.16, 0.67)	0.00

model_binom %>%
  plot_model("pred", terms = "fibre [all]", title = "")

Matched Case-Control Studies: Conditional Logistic Regression

data(bdendo, package = "Epi") 
bdendo <- bdendo %>%
  mutate(
    cancer = factor(d, labels = c('Control', 'Case')),
    gall = factor(gall, labels = c("No GBD", "GBD")),
    est = factor(est, labels = c("No oestrogen", "Oestrogen"))
  ) %>%
  var_labels(
    cancer = 'Endometrial cancer',
    gall = 'Gall bladder disease',
    est = 'Oestrogen'
  )

bdendo %>%
  mutate(
    cancer = relevel(cancer, ref = "Case"),
    est = relevel(est, ref = "Oestrogen"),
    gall = relevel(gall, ref = "GBD")
  ) %>%
  copy_labels(bdendo) %>%
  cross_tab(cancer ~ est + gall)

Endometrial cancer	levels	Case	Control	Total
Total N (%)		63 (20.0)	252 (80.0)	315
Oestrogen	Oestrogen	56 (88.9)	127 (50.4)	183 (58.1)
	No oestrogen	7 (11.1)	125 (49.6)	132 (41.9)
Gall bladder disease	GBD	17 (27.0)	24 (9.5)	41 (13.0)
	No GBD	46 (73.0)	228 (90.5)	274 (87.0)

library(survival)
model_clogit <- clogit(cancer == 'Case'  ~ est * gall + strata(set), data = bdendo)

model_clogit %>%
  glm_coef(labels = c("Oestrogen/No oestrogen", "GBD/No GBD",
                      "Oestrogen:GBD Interaction"))

Parameter	Odds ratio	Pr(>\|z\|)
Oestrogen/No oestrogen	14.88 (4.49, 49.36)	< 0.001
GBD/No GBD	18.07 (3.2, 102.01)	0.001
Oestrogen:GBD Interaction	0.13 (0.02, 0.9)	0.039

require(ggeffects)
Loading required package: ggeffects
bdendo_pred <- ggemmeans(model_clogit, terms = c('gall', 'est'))

bdendo_pred %>%
  gf_pointrange(predicted + conf.low + conf.high ~ x|group, col = ~ x) %>%
  gf_labs(y = "P(cancer)", x = "", col = get_label(bdendo$gall))

Ordinal Logistic Regression

library(ordinal)

Attaching package: 'ordinal'
The following object is masked from 'package:dplyr':

    slice
data(housing, package = "MASS")
housing <- housing %>%
  var_labels(
    Sat = "Satisfaction",
    Infl = "Perceived influence",
    Type = "Type of rental",
    Cont = "Contact"
    )

model_clm <- clm(Sat ~ Infl + Type + Cont, weights = Freq, data = housing)

model_clm %>%
  glm_coef(labels = model_labels(model_clm, intercept = FALSE))

Parameter	Ordinal OR	Pr(>\|Z\|)
Perceived influence: Low	0.61 (0.48, 0.78)	< 0.001
Perceived influence: Medium	2 (1.56, 2.55)	< 0.001
Contact: Low	1.76 (1.44, 2.16)	< 0.001
Perceived influence: High	3.63 (2.83, 4.66)	< 0.001
Contact: High	0.56 (0.45, 0.71)	< 0.001
Type of rental: Apartment	0.69 (0.51, 0.94)	0.018
Type of rental: Atrium	0.34 (0.25, 0.45)	< 0.001
Type of rental: Terrace	1.43 (1.19, 1.73)	< 0.001

model_clm %>%
  plot_model(type = "pred", terms = c("Infl", "Cont"), 
           dot.size = 1, title = "") %>%
  gf_theme(axis.text.x = element_text(angle = 45, hjust = 1))

model_clm %>%
  plot_model(type = "pred", terms = c("Infl", "Type"), 
           dot.size = 1, title = "") %>%
  gf_theme(axis.text.x = element_text(angle = 45, hjust = 1))

emmip(model_clm, Infl ~ Type |Cont) %>%
  gf_labs(x = "Type of rental", col = "Perceived influence")

Poisson Regression

data(quine, package = "MASS")
levels(quine$Eth) <- c("Aboriginal", "White")
levels(quine$Sex) <- c("Female", "Male")

quine <- quine %>%
  var_labels(
    Days = "Number of absent days",
    Eth = "Ethnicity",
    Age = "Age group"
    )

EDA

quine %>%
  group_by(Eth, Sex, Age) %>%
  summarise(
    n = n(),
    Mean = mean(Days, na.rm = TRUE),
    SD = sd(Days, na.rm = TRUE),
    Median = median(Days, na.rm = TRUE),
    CV = rel_dis(Days)
  )
`summarise()` regrouping output by 'Eth', 'Sex' (override with `.groups` argument)
# A tibble: 16 x 8
# Groups:   Eth, Sex [4]
   Eth        Sex    Age       n  Mean    SD Median    CV
   <fct>      <fct>  <fct> <int> <dbl> <dbl>  <dbl> <dbl>
 1 Aboriginal Female F0        5 17.6  17.4      11 0.987
 2 Aboriginal Female F1       15 18.9  16.3      13 0.865
 3 Aboriginal Female F2        9 32.6  27.3      20 0.839
 4 Aboriginal Female F3        9 14.6  14.9      10 1.02 
 5 Aboriginal Male   F0        8 11.5   7.23     12 0.629
 6 Aboriginal Male   F1        5  9.6   4.51      7 0.469
 7 Aboriginal Male   F2       11 30.9  17.8      32 0.576
 8 Aboriginal Male   F3        7 27.1  10.4      28 0.382
 9 White      Female F0        5 19.8   9.68     20 0.489
10 White      Female F1       17  7.76  6.48      6 0.834
11 White      Female F2       10  5.7   4.97      4 0.872
12 White      Female F3       10 13.5  11.5      12 0.851
13 White      Male   F0        9 13.6  20.9       7 1.54 
14 White      Male   F1        9  5.56  5.39      5 0.970
15 White      Male   F2       10 15.2  12.9      12 0.848
16 White      Male   F3        7 27.3  22.9      27 0.840

Model

model_pois <- glm(Days ~ Eth + Sex + Age, family = poisson, data = quine)

model_pois %>%
  glm_coef(labels = model_labels(model_pois), se_rob = TRUE)

Parameter	Rate ratio	Pr(>\|z\|)
Constant	17.66 (10.66, 29.24)	< 0.001
Ethnicity: White	0.59 (0.39, 0.88)	0.01
Sex: Male	1.11 (0.77, 1.6)	0.57
Age group: F1	0.8 (0.42, 1.5)	0.482
Age group: F2	1.42 (0.85, 2.36)	0.181
Age group: F3	1.35 (0.77, 2.34)	0.292

model_pois %>% glance()

null.deviance	df.null	logLik	AIC	BIC	deviance	df.residual
2073.53	145	-1165.49	2342.98	2360.88	1742.50	140

Negative-binomial

deviance(model_pois) / df.residual(model_pois)
[1] 12.44646

Thus, we have over-dispersion. One option is to use a negative binomial distribution.

library(MASS)

Attaching package: 'MASS'
The following objects are masked _by_ '.GlobalEnv':

    birthwt, housing, quine
The following object is masked from 'package:dplyr':

    select
model_negbin <- glm.nb(Days ~ Eth + Sex + Age, data = quine)

model_negbin %>%
  glm_coef(labels = model_labels(model_negbin), se_rob = TRUE)

Parameter	Rate ratio	Pr(>\|z\|)
Constant	20.24 (12.24, 33.47)	< 0.001
Ethnicity: White	0.57 (0.38, 0.84)	0.005
Sex: Male	1.07 (0.74, 1.53)	0.73
Age group: F1	0.69 (0.39, 1.23)	0.212
Age group: F2	1.2 (0.7, 2.03)	0.507
Age group: F3	1.29 (0.73, 2.28)	0.385

model_negbin %>% glance()

null.deviance	df.null	logLik	AIC	BIC	deviance	df.residual
192.24	145	-547.83	1109.65	1130.54	167.86	140

age group is a factor with more than two levels and is significant:

model_negbin %>% Anova()

LR Chisq	Df	Pr(>Chisq)
12.66	1.00	0.00
0.15	1.00	0.70
9.48	3.00	0.02

model_negbin %>%
  plot_model(type = "pred", terms = c("Age", "Eth"), 
           dot.size = 1.5, title = "")

emmip(model_negbin, Eth ~ Age|Sex) %>%
  gf_labs(y = "Number of absent days", x = "Age group", col = "Ethnicity")

## Adjusting CIs and p-values for multiple comparisons

multiple(model_negbin, ~ Age|Eth)$df

contrast	Eth	ratio	SE	z.ratio	p.value	lower.CL	upper.CL
F0 / F1	Aboriginal	1.44	0.34	1.57	0.40	0.79	2.62
F0 / F2	Aboriginal	0.84	0.19	-0.77	0.86	0.46	1.52
F0 / F3	Aboriginal	0.78	0.19	-1.04	0.72	0.42	1.45
F1 / F2	Aboriginal	0.58	0.12	-2.65	0.04	0.34	0.98
F1 / F3	Aboriginal	0.54	0.12	-2.89	0.02	0.31	0.93
F2 / F3	Aboriginal	0.93	0.20	-0.34	0.99	0.53	1.63
F0 / F1	White	1.44	0.34	1.57	0.40	0.79	2.62
F0 / F2	White	0.84	0.19	-0.77	0.86	0.46	1.52
F0 / F3	White	0.78	0.19	-1.04	0.72	0.42	1.45
F1 / F2	White	0.58	0.12	-2.65	0.04	0.34	0.98
F1 / F3	White	0.54	0.12	-2.89	0.02	0.31	0.93
F2 / F3	White	0.93	0.20	-0.34	0.99	0.53	1.63

multiple(model_negbin, ~ Age|Eth)$fig_ci %>%
  gf_labs(x = "IRR")

multiple(model_negbin, ~ Age|Eth)$fig_ci %>%
  gf_labs(x = "IRR")

Survival Analysis

effect of thiotepa versus placebo on the development of bladder cancer.

library(survival)
data(bladder)
bladder <- bladder %>%
  mutate(times = stop,
         rx = factor(rx, labels=c("Placebo", "Thiotepa"))
         ) %>%
  var_labels(times = "Survival time",
             rx = "Treatment")

Parametric method

model_surv <- survreg(Surv(times, event) ~ rx, data = bladder)

model_surv %>%
  glm_coef(labels = c("Treatment: Thiotepa/Placebo", "Scale"), se_rob = TRUE)

Parameter	Survival time ratio	Pr(>\|z\|)
Treatment: Thiotepa/Placebo	1.64 (0.89, 3.04)	0.116
Scale	1 (0.85, 1.18)	0.992

model_exp <- survreg(Surv(times, event) ~ rx, data = bladder, dist = "exponential")

model_exp %>%
  glm_coef(labels = c("Treatment: Thiotepa/Placebo"), se_rob = TRUE)

Parameter	Survival time ratio	Pr(>\|z\|)
Treatment: Thiotepa/Placebo	1.64 (0.85, 3.16)	0.139

Interpretation: Patients receiving Thiotepa live on average 64% more than those in the Placebo group.

Using naive standard errors

model_exp %>%
  glm_coef(labels = c("Treatment: Thiotepa/Placebo"))

Parameter	Survival time ratio	Pr(>\|z\|)
Treatment: Thiotepa/Placebo	1.64 (0.85, 3.16)	0.139

model_exp %>%
  plot_model(type = "pred", terms = ~ rx, dot.size = 1.5, title = "") %>%
  gf_labs(y = "Survival time", x = "Treatment", title = "")

### Cox proportional hazards regression

model_cox <-  coxph(Surv(times, event) ~ rx, data = bladder)

model_cox %>%
  glm_coef(labels = c("Treatment: Thiotepa/Placebo"))

Parameter	Hazard ratio	Pr(>\|z\|)
Treatment: Thiotepa/Placebo	0.64 (0.44, 0.94)	0.02

model_cox %>%
  plot_model(type = "pred", terms = ~ rx, dot.size = 1.5, 
           title = "") %>%
  gf_labs(x = "Treatment", title = "")

Mixed Linear Regression Models

Continuous outcomes

relationship between sex and age on the distance from the pituitary to the pterygomaxillary fissure (mm).

library(nlme)

Attaching package: 'nlme'
The following objects are masked from 'package:ordinal':

    ranef, VarCorr
The following object is masked from 'package:dplyr':

    collapse
data(Orthodont)

Orthodont <- Orthodont %>%
  var_labels(
    distance = "Pituitary distance (mm)",
    age = "Age (years)"
    )

model_lme <- lme(distance ~ Sex * I(age - mean(age, na.rm = TRUE)), random = ~ 1|Subject, 
                 method = "ML", data = Orthodont)

model_lme %>%
  glm_coef(labels = c(
    "Constant", 
    "Sex: female-male", 
    "Age (years)",
    "Sex:Age interaction"
    ))

Parameter	Coefficient	Pr(>\|t\|)
Constant	24.97 (24.03, 25.9)	< 0.001
Sex: female-male	-2.32 (-3.78, -0.86)	0.005
Age (years)	0.78 (0.63, 0.94)	< 0.001
Sex:Age interaction	-0.3 (-0.54, -0.07)	0.015

model_lme %>%
  plot_model("pred", terms = age ~ Sex, 
           show.data = TRUE, jitter = 0.1, dot.size = 1.5) %>%
  gf_labs(y = get_label(Orthodont$distance), x = "Age (years)", title = "")

Regression modelling and Epidemiological analysis

Sergio Uribe

March 22, 2005