Show the code
knitr::opts_chunk$set(
echo = TRUE)
library(dplyr)
library(pander)
library(knitr)
library(kableExtra)
library(reactable)
library(devtools)
library(ggplot2)
library(plotly)
plotly_height = 400
plotly_width = 700knitr::opts_chunk$set(
echo = TRUE)
library(dplyr)
library(pander)
library(knitr)
library(kableExtra)
library(reactable)
library(devtools)
library(ggplot2)
library(plotly)
plotly_height = 400
plotly_width = 700Welcome to Epidemiology 204: Quantitative Epidemiology III (Statistical Models).
In this course, we will start where Epi 203 left off: with linear regression models.
If you haven’t taken Epi 203 or a similar introduction to mathematical statistical inference, please talk to me after class.
Epi 202: probability models for different data types
Epi 203: inference for one or two populations
Linear (Gaussian) regression models (review and more details)
Regression models for non-Gaussian outcomes
Statistical analysis using R
Why do we need them?
continuous predictors
not enough data to analyze some subgroups individually
library(ggplot2)
library(plotly)
library(dplyr)
ggpenguins <-
palmerpenguins::penguins |>
dplyr::filter(species == "Adelie") |>
ggplot(
aes(x = bill_length_mm , y = body_mass_g)) +
geom_point() +
xlab("Bill length (mm)") +
ylab("Body mass (g)")
ggpenguins |> ggplotly()ggpenguins2 =
ggpenguins +
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth")
ggpenguins2 |> ggplotly()Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
ggpenguins2 = ggpenguins +
stat_smooth(method = "lm",
formula = y ~ log(x),
geom = "smooth") +
xlab("Bill length (mm)") +
ylab("Body mass (g)")
ggpenguins2 |> ggplotly()Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
ggpenguins =
palmerpenguins::penguins |>
ggplot(
aes(x = bill_length_mm ,
y = body_mass_g,
color = species
)
) +
geom_point() +
stat_smooth(
method = "lm",
formula = y ~ x,
geom = "smooth") +
xlab("Bill length (mm)") +
ylab("Body mass (g)")
ggpenguins |> ggplotly()Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
library(glmx)
data(BeetleMortality)
beetles = BeetleMortality |>
mutate(
pct = died/n,
survived = n - died
)
plot1 =
beetles |>
ggplot(aes(x = dose, y = pct)) +
geom_point(aes(size = n)) +
xlab("Dose (log mg/L)") +
ylab("Mortality rate (%)") +
scale_y_continuous(labels = scales::percent) +
# xlab(bquote(log[10]), bquote(CS[2])) +
scale_size(range = c(1,2))
plot1 |> ggplotly()# plot1Mortality rates of adult flour beetles after five hours’ exposure to gaseous carbon disulphide (Bliss 1935)
glm1 = beetles |>
glm(formula = cbind(died, survived) ~ dose, family = "binomial")
lm1 =
beetles |>
reframe(.by = everything(),
outcome = c(rep(1, times = died), rep(0, times = survived))
) |>
lm(
formula = outcome ~ dose,
data = _)
lm2 =
beetles |>
reframe(
.by = everything(),
outcome = c(rep(1, times = died), rep(0, times = survived))
) |>
lm(
formula = outcome ~ log(dose),
data = _)
range1 = range(beetles$dose) + c(-.2, .2)
f = function(x) predict(glm1, newdata = data.frame(dose = x), type = "response")
f.linear = function(x) predict(lm1, newdata = data.frame(dose = x))
f.linearlog = function(x) predict(lm2, newdata = data.frame(dose = x))
plot2 =
plot1 +
geom_function(fun = f.linear, aes(col = "Straight line")) +
labs(colour="Model", size = "")
plot2 |> ggplotly()(plot2 + expand_limits(x = c(1.6, 2))) |> ggplotly()plot3 = plot2 +
expand_limits(x = c(1.6, 2)) +
geom_function(fun = f.linearlog, aes(col = "Log-transform dose"))
(plot3 + expand_limits(x = c(1.6, 2))) |> ggplotly()plot4 = plot3 + geom_function(fun = f, aes(col = "Logistic regression"))
plot4 |> ggplotly()What distribution does the outcome have for a specific subpopulation defined by covariates? (outcome model)
How does the combination of covariates relate to the mean? (link function)
How do the covariates combine? (linear predictor, interactions)