How to use this document
Each section covers one analytical domain. Within each section you will find:
- A short description of each package or Base R function
- Key functions with their purpose
- A worked example showing typical usageBadge convention:
[PKG]= CRAN/GitHub package (requires installation) ·[BASE]= ships with R, no install needed.
here [PKG]Purpose: Builds file paths relative to the R project root, making scripts portable across machines and operating systems.
Key functions:
| Function | Description |
|---|---|
here() |
Returns the project root path |
here("folder", "file.csv") |
Builds a full path safely |
library(here)
# Instead of: read.csv("C:/Users/me/project/data/linelist.csv")
# Use:
df <- read.csv(here("data", "linelist.csv"))
# Works on Windows, Mac, and Linux without change
here() # prints the project rootrio [PKG]Purpose: A Swiss-army knife for data import/export — one function handles CSV, Excel, Stata, SPSS, SAS, RDS, JSON, and more.
Key functions:
| Function | Description |
|---|---|
import() |
Import almost any file format |
export() |
Export to almost any file format |
convert() |
Convert between formats directly |
library(rio)
# Import — format auto-detected from extension
df <- import("data/linelist.xlsx")
df2 <- import("data/survey.dta") # Stata
df3 <- import("data/results.csv")
# Export
export(df, "outputs/linelist_clean.csv")
export(df, "outputs/linelist_clean.rds")
# One-step conversion
convert("data/raw.xlsx", "data/raw.csv")openxlsx [PKG]Purpose: Read and write multi-sheet Excel workbooks with full formatting control — no Java required.
Key functions:
| Function | Description |
|---|---|
read.xlsx() |
Read one sheet from an xlsx file |
write.xlsx() |
Write a data frame to xlsx |
createWorkbook() |
Start a new workbook object |
addWorksheet() |
Add a sheet to the workbook |
writeData() |
Write data to a specific sheet |
saveWorkbook() |
Save the workbook to disk |
library(openxlsx)
# Read
df <- read.xlsx("data/hospital_data.xlsx", sheet = "Admissions")
# Write multiple sheets
wb <- createWorkbook()
addWorksheet(wb, "Cases")
addWorksheet(wb, "Deaths")
writeData(wb, "Cases", cases_df)
writeData(wb, "Deaths", deaths_df)
saveWorkbook(wb, "outputs/report.xlsx", overwrite = TRUE)pacman [PKG]Purpose: Simplifies package management — installs if absent, then loads. Reduces boilerplate at the top of scripts.
# Install pacman itself first (once only)
if (!require("pacman")) install.packages("pacman")
# Install + load multiple packages in one call
pacman::p_load(
tidyverse, gtsummary, survival, ggplot2
)
# Unload a package
pacman::p_unload(survival)renv [PKG]Purpose: Snapshot and restore the exact package versions used in a project — essential for reproducible collaborative analyses.
library(renv)
renv::init() # Initialise renv in the project
renv::snapshot() # Record current package versions to renv.lock
renv::restore() # Reinstall packages from renv.lock (on a new machine)
renv::status() # Check if lockfile is in sync with libraryreadr [PKG]Purpose: Fast, consistent reading of rectangular text files (CSV, TSV, fixed-width). Part of the tidyverse. Produces tibbles with sensible column-type guessing.
Key functions:
| Function | Description |
|---|---|
read_csv() |
Read comma-separated file |
read_tsv() |
Read tab-separated file |
read_delim() |
Read with custom delimiter |
read_fwf() |
Read fixed-width file |
write_csv() |
Write CSV |
cols() |
Specify column types explicitly |
library(readr)
# Basic import
df <- read_csv("data/linelist.csv")
# Specify column types to avoid surprises
df <- read_csv(
"data/linelist.csv",
col_types = cols(
id = col_character(),
date_onset = col_date(format = "%Y-%m-%d"),
age = col_double(),
sex = col_factor()
),
na = c("", "NA", "N/A", "unknown")
)
# Export
write_csv(df, "outputs/linelist_clean.csv")haven [PKG]Purpose: Import and export SPSS (.sav), SAS (.sas7bdat), and Stata (.dta) datasets. Preserves value labels and variable attributes.
Key functions:
| Function | Description |
|---|---|
read_sav() |
Read SPSS file |
read_dta() |
Read Stata file |
read_sas() |
Read SAS file |
as_factor() |
Convert labelled integers to factors |
zap_labels() |
Strip labels for plain data frames |
library(haven)
# Import SPSS
df_spss <- read_sav("data/survey.sav")
# Import Stata
df_stata <- read_dta("data/cohort.dta")
# Import SAS
df_sas <- read_sas("data/claims.sas7bdat")
# Convert labelled variables to factors
df_stata <- as_factor(df_stata)
# Export to Stata
write_dta(df_spss, "outputs/converted.dta")readxl [PKG]Purpose: Read Excel files (.xls and .xlsx) without Java. Lightweight and reliable for single-sheet imports.
library(readxl)
# List sheet names
excel_sheets("data/hospital.xlsx")
# Read a specific sheet
df <- read_excel("data/hospital.xlsx", sheet = "Admissions")
# Read a specific cell range
df <- read_excel(
"data/hospital.xlsx",
sheet = "Summary",
range = "A1:F50",
col_types = c("text", "date", "numeric", "numeric", "text", "logical")
)data.table (import via fread)
[PKG]Purpose: fread() is the fastest CSV
reader in R — ideal for files with millions of rows. Automatically
detects separators and column types.
library(data.table)
# Import large file
dt <- fread("data/large_dataset.csv")
# With options
dt <- fread(
"data/large_dataset.csv",
sep = ",",
header = TRUE,
na.strings = c("", "NA"),
select = c("id", "age", "sex", "outcome") # only read needed columns
)jsonlite [PKG]Purpose: Parse JSON from APIs or files, and convert R objects to JSON. Essential for working with web data.
library(jsonlite)
# Parse a JSON file
df <- fromJSON("data/api_response.json", flatten = TRUE)
# Parse from a URL
df <- fromJSON("https://api.example.com/cases")
# Convert R object to JSON string
json_str <- toJSON(df, pretty = TRUE, auto_unbox = TRUE)DBI + RSQLite / RPostgres
[PKG]Purpose: Connect to and query relational databases (SQLite, PostgreSQL, MySQL) from R using standard SQL.
library(DBI)
library(RSQLite) # or RPostgres, RMySQL
# Connect to SQLite database
con <- dbConnect(RSQLite::SQLite(), "data/registry.db")
# List tables
dbListTables(con)
# Query
df <- dbGetQuery(con, "SELECT * FROM cases WHERE year = 2023")
# Write a table
dbWriteTable(con, "results", df_results, overwrite = TRUE)
# Always close the connection
dbDisconnect(con)[BASE]# Read CSV
df <- read.csv("data/linelist.csv", stringsAsFactors = FALSE, na.strings = c("", "NA"))
# Read general delimited file
df <- read.table("data/data.txt", header = TRUE, sep = "\t", quote = '"')
# Read from a URL
df <- read.csv(url("https://example.com/data.csv"))
# Read RDS (R's native binary format — fast and preserves object classes)
df <- readRDS("data/processed.rds")
saveRDS(df, "data/processed.rds")
# Read .RData (multiple objects at once)
load("data/workspace.RData")tidyverse [PKG]Purpose: A meta-package that loads the core
tidyverse: dplyr, tidyr, ggplot2,
stringr, forcats, lubridate,
purrr, readr, and tibble. The
de-facto standard for modern R data analysis.
library(tidyverse) # loads all core packages at once
# Confirm which packages were loaded
tidyverse_packages()dplyr [PKG]Purpose: The grammar of data manipulation. Provides a consistent set of verbs for filtering, selecting, transforming, grouping, and summarising data frames.
Key functions:
| Function | Description |
|---|---|
filter() |
Keep rows matching conditions |
select() |
Keep or drop columns |
mutate() |
Create or modify columns |
rename() |
Rename columns |
arrange() |
Sort rows |
group_by() |
Group for aggregation |
summarise() |
Aggregate within groups |
left_join() / right_join() /
inner_join() |
Merge data frames |
bind_rows() / bind_cols() |
Stack or concatenate data frames |
case_when() |
Vectorised if-else |
across() |
Apply function to multiple columns |
distinct() |
Remove duplicate rows |
count() |
Count rows per group |
library(dplyr)
# Typical cleaning pipeline
df_clean <- df_raw %>%
filter(!is.na(id), age >= 0, age <= 120) %>%
select(id, date_onset, age, sex, district, outcome) %>%
mutate(
age_group = case_when(
age < 5 ~ "<5",
age < 18 ~ "5-17",
age < 60 ~ "18-59",
TRUE ~ "60+"
),
outcome = recode(outcome, "Dead" = "Death", "Alive" = "Survived")
) %>%
arrange(date_onset)
# Grouped summary
df_clean %>%
group_by(district, age_group) %>%
summarise(
n_cases = n(),
n_deaths = sum(outcome == "Death", na.rm = TRUE),
cfr = n_deaths / n_cases,
.groups = "drop"
)
# Join two tables
df_full <- df_cases %>%
left_join(df_hospitals, by = c("hospital_id" = "id"))tidyr [PKG]Purpose: Reshape data between wide and long formats, split and unite columns, and handle missing values structurally.
Key functions:
| Function | Description |
|---|---|
pivot_longer() |
Wide → long (tidy) format |
pivot_wider() |
Long → wide format |
separate() |
Split one column into multiple |
unite() |
Combine multiple columns into one |
drop_na() |
Remove rows with any/selected NAs |
replace_na() |
Replace NA with a specified value |
fill() |
Fill NAs using adjacent values |
complete() |
Make implicit missing rows explicit |
library(tidyr)
# Wide to long (e.g. symptom columns → one row per symptom)
df_long <- df_wide %>%
pivot_longer(
cols = starts_with("symptom_"),
names_to = "symptom",
values_to = "present"
)
# Long to wide (one row per person, columns for each visit)
df_wide <- df_long %>%
pivot_wider(
id_cols = patient_id,
names_from = visit_number,
values_from = result
)
# Drop rows missing key variables
df_clean <- df %>% drop_na(id, date_onset, outcome)
# Replace NA
df <- df %>% replace_na(list(age = 99, sex = "Unknown"))janitor [PKG]Purpose: Fast, practical data cleaning utilities: standardise column names, remove empty rows/columns, and produce quick frequency tables.
Key functions:
| Function | Description |
|---|---|
clean_names() |
Standardise column names to snake_case |
remove_empty() |
Remove empty rows or columns |
tabyl() |
Frequency table with percentages |
adorn_*() |
Add totals, percentages to tabyl |
get_dupes() |
Find duplicate rows |
round_half_up() |
Consistent rounding |
library(janitor)
# Clean column names (removes spaces, special chars, to snake_case)
df <- df %>% clean_names()
# "Date of Onset" → "date_of_onset", "Sex (M/F)" → "sex_m_f"
# Remove empty rows and columns
df <- df %>% remove_empty(which = c("rows", "cols"))
# Find duplicates
get_dupes(df, id) # exact ID duplicates
get_dupes(df, id, date_onset) # duplicates on key combination
# Quick cross-tabulation
df %>%
tabyl(sex, outcome) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() # add raw counts in parentheseslinelist [PKG]Purpose: Purpose-built for epidemiological linelists. Tag columns with epi meaning (date of onset, case ID, outcome) to enable downstream tools to work automatically.
library(linelist)
# Tag columns with epi meaning
ll <- make_linelist(
df,
id = "case_id",
date_onset = "symptom_onset",
date_outcome = "date_death",
outcome = "final_outcome",
age = "age_years",
sex = "patient_sex"
)
# Extract tagged variables
get_tags(ll)
# Safe extraction by tag
dates <- tags_df(ll)$date_onsetstringr [PKG]Purpose: Consistent, pipe-friendly string
manipulation. All functions start with str_ for easy
autocomplete.
Key functions:
| Function | Description |
|---|---|
str_detect() |
Does string match pattern? (TRUE/FALSE) |
str_replace() / str_replace_all() |
Replace pattern |
str_extract() / str_extract_all() |
Extract matching text |
str_to_lower() / str_to_upper() |
Change case |
str_trim() |
Remove leading/trailing whitespace |
str_pad() |
Pad string to a fixed width |
str_c() |
Concatenate strings |
str_split() |
Split string by pattern |
str_count() |
Count pattern occurrences |
str_glue() |
Interpolate R variables into strings |
library(stringr)
df <- df %>%
mutate(
# Standardise text
sex = str_to_lower(str_trim(sex)),
district = str_to_title(district),
# Clean phone numbers (keep digits only)
phone = str_replace_all(phone, "[^0-9]", ""),
# Extract year from free-text date
year = str_extract(date_text, "\\d{4}"),
# Flag records mentioning hospital transfer
transfer = str_detect(notes, regex("transfer|referr", ignore_case = TRUE))
)
# Interpolated strings
str_glue("There are {nrow(df)} cases as of {Sys.Date()}.")lubridate [PKG]Purpose: Intuitive date and date-time parsing, arithmetic, and extraction. Handles time zones, intervals, and periods.
Key functions:
| Function | Description |
|---|---|
ymd(), dmy(), mdy() |
Parse dates from strings |
ymd_hms() |
Parse date-times |
today(), now() |
Current date / datetime |
year(), month(), day() |
Extract components |
epiweek(), isoweek() |
Get epidemiological week |
interval() |
Define an interval between two dates |
as.duration() |
Convert interval to duration |
floor_date() / ceiling_date() |
Round dates down/up |
difftime() |
Calculate time difference |
library(lubridate)
df <- df %>%
mutate(
date_onset = dmy(date_onset), # "23/04/2023" → Date
date_admit = ymd(date_admit),
date_outcome = ymd_hms(date_outcome),
# Derived variables
days_to_admit = as.numeric(date_admit - date_onset),
age_at_onset = as.numeric(interval(dob, date_onset) / years(1)),
epi_week = epiweek(date_onset),
epi_year = epiyear(date_onset)
)
# Filter to last 28 days
recent <- df %>% filter(date_onset >= today() - days(28))forcats [PKG]Purpose: Tools for working with categorical (factor) variables — reordering, lumping rare levels, and recoding.
library(forcats)
df <- df %>%
mutate(
# Reorder levels manually
outcome = fct_relevel(outcome, "Survived", "Death", "Unknown"),
# Reorder by frequency (most common first)
district = fct_infreq(district),
# Lump rare categories into "Other"
hospital = fct_lump_n(hospital, n = 10, other_level = "Other"),
# Recode levels
sex = fct_recode(sex, "Male" = "M", "Female" = "F", "Unknown" = "U")
)
# Inspect levels
levels(df$outcome)
fct_count(df$district)naniar [PKG]Purpose: Visualise, summarise, and manipulate missing data. Integrates with ggplot2 for missingness plots.
library(naniar)
# Summary of missingness
miss_var_summary(df) # % missing per variable
miss_case_summary(df) # % missing per row
# Visualise missingness
vis_miss(df) # heatmap of NA pattern
gg_miss_var(df) # bar chart of NAs per variable
gg_miss_upset(df) # UpSet plot of NA combinations
# Replace specific values with NA
df <- df %>%
replace_with_na(replace = list(age = 99, sex = "Unknown")) %>%
replace_with_na_all(condition = ~.x == -999)
# Shadow matrix (add NA indicator columns)
df_shadow <- bind_shadow(df)[BASE]# Identify missing
is.na(df$age)
sum(is.na(df$age)) # count NAs
df[complete.cases(df), ] # remove rows with any NA
# Subset rows and columns
df[df$sex == "Male", ] # subset rows
df[, c("id", "age", "sex")] # subset columns
# Type conversion
df$sex <- as.factor(df$sex)
df$outcome <- as.character(df$outcome)
df$age <- as.numeric(df$age)
# Find-replace in strings
df$district <- gsub("\\bDst\\b", "District", df$district)
# Merge two data frames
df_merged <- merge(df_cases, df_labs, by = "case_id", all.x = TRUE)
# Sort
df <- df[order(df$date_onset, df$age), ]
# Apply a function across columns
df[, c("a","b","c")] <- lapply(df[, c("a","b","c")], as.numeric)skimr [PKG]Purpose: Produces a rich, compact data summary —
including inline histograms, missingness, and distributional statistics
— far beyond summary().
library(skimr)
skim(df)
# Skim by group
df %>% group_by(sex) %>% skim(age, days_to_outcome)
# Customise which statistics to show
my_skim <- skim_with(numeric = sfl(mean, sd, median))
my_skim(df)DataExplorer [PKG]Purpose: Automated EDA in one function call — generates an HTML report with missing data, distributions, correlations, and PCA.
library(DataExplorer)
# Full automated report
create_report(df, output_file = "eda_report.html")
# Individual plots
plot_missing(df) # missingness bar chart
plot_bar(df) # frequency bars for categoricals
plot_histogram(df) # histograms for numerics
plot_correlation(df) # correlation heatmap
plot_boxplot(df, by = "outcome")psych [PKG]Purpose: Comprehensive descriptive statistics, reliability analysis (Cronbach’s alpha), and factor analysis.
library(psych)
# Rich descriptive statistics
describe(df[, c("age", "days_ill", "bmi")])
# Stratified by group
describeBy(df[, c("age", "days_ill")], group = df$outcome)
# Correlation matrix with significance
corr.test(df[, c("age", "weight", "height")])
# Cronbach's alpha (scale reliability)
alpha(df[, scale_items])Hmisc [PKG]Purpose: Extended descriptive statistics, data summary with label support, and single/multiple imputation utilities.
library(Hmisc)
# Detailed describe with histograms
describe(df)
# Correlation with p-values
rcorr(as.matrix(df[, numeric_vars]))
# Specify variable labels (persist through analyses)
label(df$age) <- "Age at enrolment (years)"
label(df$outcome) <- "Final clinical outcome"
# Impute missing values (single imputation)
df$age <- impute(df$age, median)rstatix [PKG]Purpose: Pipe-friendly statistical tests and summaries that integrate cleanly with ggplot2 and tidyverse workflows.
library(rstatix)
# Quick summary statistics
df %>% get_summary_stats(age, type = "common")
# T-test
df %>% t_test(age ~ sex)
# ANOVA
df %>% anova_test(bmi ~ district)
# Chi-squared test
df %>% chisq_test(sex, outcome)
# Kruskal-Wallis
df %>% kruskal_test(age ~ district)[BASE]# Structure overview
str(df)
dim(df)
nrow(df); ncol(df)
head(df, 10); tail(df, 10)
# Descriptive statistics
summary(df)
summary(df$age)
# Frequency table (single variable)
table(df$sex)
prop.table(table(df$sex))
# Cross-tabulation
table(df$sex, df$outcome)
prop.table(table(df$sex, df$outcome), margin = 1) # row percentages
# Correlation matrix
cor(df[, c("age", "weight", "height")], use = "complete.obs")
# Basic plots
hist(df$age, breaks = 20, main = "Age distribution", xlab = "Age")
boxplot(age ~ sex, data = df, main = "Age by sex")
plot(df$age, df$days_ill, xlab = "Age", ylab = "Days ill")lm() —
Ordinary Least Squares [BASE]Purpose: Fit linear regression models. The foundational function for OLS in R.
# Fit model
mod <- lm(bmi ~ age + sex + district + smoking, data = df)
# Coefficients and inference
summary(mod)
# Confidence intervals
confint(mod, level = 0.95)
# Predicted values and residuals
df$predicted <- predict(mod)
df$residuals <- residuals(mod)
# ANOVA table
anova(mod)
# Compare nested models
mod_null <- lm(bmi ~ 1, data = df)
anova(mod_null, mod) # likelihood ratio / F testcar [PKG]Purpose: Companion to Applied Regression — VIF for multicollinearity, type II/III ANOVA, outlier tests, and more.
library(car)
# Variance inflation factor (multicollinearity check)
vif(mod)
# VIF > 5 is concerning, > 10 indicates severe multicollinearity
# Type II ANOVA (correct for unbalanced designs)
Anova(mod, type = "II")
# Outlier test (Bonferroni-corrected)
outlierTest(mod)
# Residual plots
residualPlots(mod)broom [PKG]Purpose: Turn messy model output into tidy data frames — ideal for combining results from multiple models or feeding into tables/plots.
library(broom)
# Tidy coefficient table (estimates, SE, CIs, p-values)
tidy(mod, conf.int = TRUE, conf.level = 0.95)
# Model-level statistics (R², AIC, BIC, etc.)
glance(mod)
# Observation-level: fitted values, residuals, influence measures
augment(mod, data = df)lmtest [PKG]Purpose: Hypothesis tests for linear model assumptions: heteroscedasticity, serial correlation, and functional form.
library(lmtest)
# Breusch-Pagan test for heteroscedasticity
bptest(mod)
# Durbin-Watson test for serial correlation (time-series data)
dwtest(mod)
# Likelihood ratio test (compare two models)
lrtest(mod_reduced, mod_full)
# Coeftest with robust SEs (combine with sandwich)
library(sandwich)
coeftest(mod, vcov = vcovHC(mod, type = "HC3"))sandwich [PKG]Purpose: Compute robust (heteroscedasticity-consistent) and cluster-robust standard errors for regression models.
library(sandwich)
library(lmtest)
# HC3 robust standard errors (default recommendation)
coeftest(mod, vcov = vcovHC(mod, type = "HC3"))
# Cluster-robust SEs (cluster by district)
coeftest(mod, vcov = vcovCL(mod, cluster = ~district))
# Newey-West (for time-series with autocorrelation)
coeftest(mod, vcov = NeweyWest(mod))performance [PKG]Purpose: Assess model quality — R², RMSE, AIC, ICC for mixed models, and a comprehensive visual check of assumptions.
library(performance)
# Key model metrics
model_performance(mod)
# VIF / multicollinearity
check_collinearity(mod)
# Full visual diagnostic panel (normality, heteroscedasticity, outliers)
check_model(mod)
# Compare multiple models
compare_performance(mod1, mod2, mod3, rank = TRUE)easystats [PKG]Purpose: A meta-package bundling
parameters, effectsize,
performance, correlation, see,
modelbased, and report — a modern all-in-one
modelling ecosystem.
library(easystats)
# Tidy parameters with effect sizes and CIs
parameters(mod)
# Automatic model report in plain English
report(mod)
# Standardised effect sizes
effectsize(mod)
# Visualise parameters (forest plot)
library(see)
plot(parameters(mod))glm()
with family = binomial [BASE]Purpose: Fit logistic regression for binary outcomes. Returns log-odds coefficients; exponentiate for odds ratios.
# Fit model
mod_log <- glm(death ~ age + sex + comorbidity + district,
data = df,
family = binomial(link = "logit"))
summary(mod_log)
# Odds ratios with 95% CIs
exp(cbind(OR = coef(mod_log), confint(mod_log)))
# Predicted probabilities
df$prob_death <- predict(mod_log, type = "response")
# Null model comparison (likelihood ratio test)
mod_null <- glm(death ~ 1, data = df, family = binomial)
anova(mod_null, mod_log, test = "Chisq")pROC [PKG]Purpose: Compute, plot, and compare ROC curves; estimate AUC with confidence intervals.
library(pROC)
# ROC curve
roc_obj <- roc(df$death, df$prob_death)
auc(roc_obj)
# Plot
plot(roc_obj, col = "#1B7F79", lwd = 2,
main = paste0("ROC curve (AUC = ", round(auc(roc_obj), 3), ")"))
# CI for AUC
ci.auc(roc_obj)
# Compare two models
roc_test <- roc.test(roc_obj_1, roc_obj_2, method = "delong")logistf [PKG]Purpose: Firth’s penalised logistic regression — essential when data are sparse, outcomes are rare, or complete/quasi-complete separation exists.
library(logistf)
# Firth regression (reduces small-sample bias)
mod_firth <- logistf(death ~ age + sex + hiv, data = df)
summary(mod_firth)
# Profile likelihood CIs (more accurate than Wald)
exp(cbind(OR = coef(mod_firth), mod_firth$ci.lower, mod_firth$ci.upper))nnet / MASS for Multinomial & Ordinal
Logistic [PKG]library(nnet)
library(MASS)
# Multinomial logistic (>2 unordered categories)
mod_multi <- multinom(outcome_3cat ~ age + sex + district, data = df)
summary(mod_multi)
exp(coef(mod_multi)) # relative risk ratios
# Ordinal logistic (proportional odds)
mod_ord <- polr(severity ~ age + sex + comorbidity,
data = df,
method = "logistic",
Hess = TRUE)
summary(mod_ord)
exp(cbind(OR = coef(mod_ord), confint(mod_ord)))ResourceSelection [PKG]Purpose: Hosmer-Lemeshow goodness-of-fit test for logistic regression — assesses whether predicted probabilities match observed outcomes.
library(ResourceSelection)
# Hosmer-Lemeshow test (g = number of groups, usually 10)
hoslem.test(df$death, fitted(mod_log), g = 10)
# p > 0.05 suggests acceptable fitmarginaleffects [PKG]Purpose: Compute average marginal effects, contrasts, and marginal predictions from GLMs and many other model types.
library(marginaleffects)
# Average marginal effects (change in probability per unit change in predictor)
avg_marginaleffects(mod_log)
# Marginal predictions for specific covariate values
predictions(mod_log, newdata = datagrid(age = c(20, 40, 60), sex = "Male"))
# Contrasts between groups
avg_comparisons(mod_log, variables = "sex")survival [PKG]Purpose: The foundational R package for survival analysis — Kaplan-Meier estimation, log-rank test, Cox proportional hazards regression, and parametric models.
Key functions:
| Function | Description |
|---|---|
Surv() |
Create survival object (time, event) |
survfit() |
Kaplan-Meier estimator |
coxph() |
Cox proportional hazards model |
cox.zph() |
Test PH assumption |
survdiff() |
Log-rank test |
survreg() |
Parametric survival model |
library(survival)
# Create survival object
surv_obj <- Surv(time = df$days_follow_up, event = df$death)
# Kaplan-Meier estimator
km_fit <- survfit(surv_obj ~ sex, data = df)
summary(km_fit)
# Log-rank test
survdiff(surv_obj ~ sex, data = df)
# Cox proportional hazards model
cox_mod <- coxph(surv_obj ~ age + sex + hiv + district, data = df)
summary(cox_mod)
# Hazard ratios with CI
exp(cbind(HR = coef(cox_mod), confint(cox_mod)))
# Test proportional hazards assumption
cox.zph(cox_mod)survminer [PKG]Purpose: Publication-ready survival plots built on ggplot2. Adds risk tables, p-values, median survival, and confidence intervals.
library(survminer)
# KM plot with risk table and p-value
ggsurvplot(
km_fit,
data = df,
pval = TRUE,
conf.int = TRUE,
risk.table = TRUE,
ggtheme = theme_bw(),
palette = c("#1B7F79", "#E8A838"),
xlab = "Days since onset",
legend.labs = c("Female", "Male"),
title = "Overall survival by sex"
)
# Median survival times
surv_median(km_fit)
# Cumulative incidence (alternative to KM for competing risks context)
ggsurvplot(km_fit, data = df, fun = "cumhaz")flexsurv [PKG]Purpose: Flexible parametric survival models — Weibull, log-normal, log-logistic, Gompertz, and more.
library(flexsurv)
# Weibull model
mod_weibull <- flexsurvreg(
Surv(days_follow_up, death) ~ age + sex,
data = df, dist = "weibull"
)
summary(mod_weibull)
# Compare distributions by AIC
mod_ln <- flexsurvreg(Surv(days_follow_up, death) ~ age, data = df, dist = "lnorm")
mod_gg <- flexsurvreg(Surv(days_follow_up, death) ~ age, data = df, dist = "gengamma")
# Plot fitted survival curves
plot(mod_weibull)cmprsk [PKG]Purpose: Competing risks analysis — cumulative incidence functions (CIF) and Fine-Gray subdistribution hazard regression.
library(cmprsk)
# Cumulative incidence (1 = event of interest, 2 = competing event)
cif <- cuminc(
ftime = df$days_follow_up,
fstatus = df$event_type, # 0=censored, 1=death, 2=discharge
group = df$treatment
)
plot(cif)
# Fine-Gray regression
fg_mod <- crr(
ftime = df$days_follow_up,
fstatus = df$event_type,
cov1 = model.matrix(~ age + sex - 1, data = df)
)
summary(fg_mod)glm()
with Poisson family [BASE]Purpose: Model count outcomes (e.g. number of cases per area) or estimate incidence rate ratios when person-time is included as an offset.
# Poisson regression (count outcome)
mod_pois <- glm(n_cases ~ age_group + sex + year + offset(log(population)),
data = df_agg,
family = poisson(link = "log"))
summary(mod_pois)
# Incidence rate ratios
exp(cbind(IRR = coef(mod_pois), confint(mod_pois)))
# Quasi-Poisson (corrects for overdispersion without changing estimates)
mod_qpois <- glm(n_cases ~ age_group + sex,
data = df_agg,
family = quasipoisson(link = "log"))
summary(mod_qpois) # note dispersion parameterMASS::glm.nb() [PKG]Purpose: Negative binomial regression — a better choice than Poisson when the variance substantially exceeds the mean (overdispersion).
library(MASS)
mod_nb <- glm.nb(n_cases ~ age_group + sex + district + offset(log(population)),
data = df_agg)
summary(mod_nb)
# IRRs
exp(cbind(IRR = coef(mod_nb), confint(mod_nb)))
# Compare with Poisson via likelihood ratio test
library(lmtest)
lrtest(mod_pois, mod_nb)pscl [PKG]Purpose: Zero-inflated models (ZIP, ZINB) and hurdle models for count data with excess zeros.
library(pscl)
# Zero-inflated Poisson
mod_zip <- zeroinfl(n_cases ~ age + sex | district,
data = df, dist = "poisson")
# Zero-inflated Negative Binomial
mod_zinb <- zeroinfl(n_cases ~ age + sex | district,
data = df, dist = "negbin")
# Vuong test: ZIP vs Poisson
vuong(mod_pois, mod_zip)
# Hurdle model
mod_hurdle <- hurdle(n_cases ~ age + sex, data = df, dist = "negbin")
summary(mod_hurdle)lme4 [PKG]Purpose: Fit linear (lmer) and
generalised linear (glmer) mixed-effects models with random
effects for clustered or repeated-measures data.
Key functions:
| Function | Description |
|---|---|
lmer() |
Linear mixed model |
glmer() |
Generalised linear mixed model |
ranef() |
Extract random effects |
fixef() |
Extract fixed effects |
isSingular() |
Check for singular fit |
library(lme4)
# Random intercept model (patients nested in hospitals)
mod_lmm <- lmer(bmi ~ age + sex + (1 | hospital_id), data = df)
summary(mod_lmm)
# Random intercept + slope
mod_lmm2 <- lmer(bmi ~ time + (time | patient_id), data = df_long)
# Mixed logistic regression
mod_glmm <- glmer(death ~ age + sex + (1 | district),
data = df, family = binomial)
summary(mod_glmm)
# ICC (intraclass correlation)
performance::icc(mod_lmm)lmerTest [PKG]Purpose: Extends lme4 to provide
p-values for fixed effects using Satterthwaite degrees of freedom.
library(lmerTest)
# Fit exactly as with lme4 — p-values now included automatically
mod <- lmer(bmi ~ age + sex + (1 | hospital_id), data = df)
summary(mod) # p-values for fixed effects
anova(mod) # type III ANOVA with Satterthwaite dfglmmTMB [PKG]Purpose: Fit a wide range of GLMMs including zero-inflated, hurdle, and truncated models using Template Model Builder.
library(glmmTMB)
# Zero-inflated negative binomial GLMM
mod_zi <- glmmTMB(
n_cases ~ age + sex + (1 | district),
ziformula = ~1,
family = nbinom2,
data = df
)
summary(mod_zi)emmeans [PKG]Purpose: Estimated marginal means (aka least-squares means), pairwise contrasts, and trend tests for model output.
library(emmeans)
# Estimated marginal means by sex
emmeans(mod_lmm, ~ sex)
# Pairwise comparison
pairs(emmeans(mod_lmm, ~ sex))
# Trend across age groups
emtrends(mod_lmm, ~ age_group, var = "age")
# Custom contrasts
contrast(emmeans(mod_log, ~ district), method = "trt.vs.ctrl")gtsummary [PKG]Purpose: The most popular package for publication-ready Table 1 and regression tables with one or two lines of code.
Key functions:
| Function | Description |
|---|---|
tbl_summary() |
Descriptive Table 1 |
tbl_regression() |
Regression results table |
tbl_uvregression() |
Univariable regression table |
tbl_merge() |
Combine multiple tables side-by-side |
add_p() |
Add p-values to summary table |
add_overall() |
Add overall column |
bold_labels() |
Format variable labels |
library(gtsummary)
# Table 1: Descriptive statistics stratified by outcome
tbl_summary(
df %>% select(age, sex, district, hiv, outcome),
by = outcome,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
missing_text = "Missing"
) %>%
add_p() %>%
add_overall() %>%
bold_labels() %>%
italicize_levels()
# Logistic regression table (ORs + CIs)
tbl_regression(
mod_log,
exponentiate = TRUE,
label = list(age ~ "Age (years)", sex ~ "Sex")
)
# Univariable then multivariable combined
t1 <- tbl_uvregression(df, method = glm, y = death,
method.args = list(family = binomial),
exponentiate = TRUE)
t2 <- tbl_regression(mod_log, exponentiate = TRUE)
tbl_merge(list(t1, t2), tab_spanner = c("Univariable", "Multivariable"))gt [PKG]Purpose: Grammar of tables — build highly customised HTML, PDF, and Word tables with headers, footnotes, column spanners, and cell-level formatting.
library(gt)
df_summary %>%
gt() %>%
tab_header(
title = "Summary of cases by district",
subtitle = "Ebola outbreak, 2024"
) %>%
tab_spanner(
label = "Cases",
columns = c(n_cases, attack_rate)
) %>%
fmt_number(columns = attack_rate, decimals = 1) %>%
fmt_integer(columns = n_cases) %>%
cols_label(
district = "District",
n_cases = "N",
attack_rate = "Attack rate (%)"
) %>%
tab_footnote("Per 100,000 population", locations = cells_column_labels("attack_rate")) %>%
tab_options(table.font.size = 12)flextable [PKG]Purpose: Create tables that render in Word (.docx), PowerPoint (.pptx), and HTML — the best choice for reports going to Word.
library(flextable)
ft <- flextable(df_summary) %>%
set_header_labels(district = "District", n_cases = "Cases", cfr = "CFR (%)") %>%
bold(part = "header") %>%
bg(bg = "#0B2545", part = "header") %>%
color(color = "white", part = "header") %>%
colformat_double(j = "cfr", digits = 1) %>%
autofit()
# Save to Word
library(officer)
doc <- read_docx() %>% body_add_flextable(ft)
print(doc, target = "outputs/table.docx")tableone [PKG]Purpose: Create Table 1 with standardised mean differences (SMD) — widely used after propensity score matching to assess balance.
library(tableone)
vars <- c("age", "sex", "hiv", "malnutrition", "bmi")
cat_vars <- c("sex", "hiv", "malnutrition")
# Unmatched
tab1 <- CreateTableOne(
vars = vars,
strata = "treated",
data = df,
factorVars = cat_vars,
addOverall = TRUE
)
print(tab1, smd = TRUE, showAllLevels = TRUE)
kableone(tab1) # formatted kable outputrmarkdown [PKG]Purpose: Generate reproducible reports in HTML, PDF,
Word, and PowerPoint from a single .Rmd source file that
mixes R code and narrative text.
# This file IS an R Markdown document.
# Key YAML options for the output section:
# output:
# html_document:
# toc: true
# toc_float: true
# code_folding: show
# pdf_document:
# latex_engine: xelatex
# word_document:
# reference_docx: my_template.docx
# Knit from R console
rmarkdown::render("report.Rmd", output_format = "html_document")
rmarkdown::render("report.Rmd", output_format = "all") # all formats at onceofficer [PKG]Purpose: Create and manipulate Word (.docx) and PowerPoint (.pptx) files programmatically from R.
library(officer)
# Create a Word document
doc <- read_docx() %>%
body_add_par("Outbreak Report", style = "heading 1") %>%
body_add_par("Summary statistics are shown below.", style = "Normal") %>%
body_add_flextable(ft) %>% # add a flextable
body_add_gg(my_plot) # add a ggplot
print(doc, target = "outputs/report.docx")
# Create a PowerPoint
ppt <- read_pptx() %>%
add_slide(layout = "Title Slide", master = "Office Theme") %>%
ph_with("Outbreak Report", location = ph_location_type("ctrTitle"))
print(ppt, target = "outputs/slides.pptx")ggplot2 [PKG]Purpose: The definitive R graphics package, based on the Grammar of Graphics. Builds plots layer by layer with a consistent, composable syntax.
Key geoms:
| Geom | Plot type |
|---|---|
geom_point() |
Scatter plot |
geom_line() |
Line chart |
geom_bar() / geom_col() |
Bar chart |
geom_histogram() |
Histogram |
geom_boxplot() |
Box plot |
geom_violin() |
Violin plot |
geom_density() |
Density curve |
geom_smooth() |
Smoothed trend line |
geom_tile() |
Heatmap |
geom_errorbar() |
Error bars |
geom_ribbon() |
Confidence band |
library(ggplot2)
# Epicurve (case counts over time)
ggplot(df, aes(x = week_onset, fill = outcome)) +
geom_bar() +
scale_fill_manual(values = c("Survived" = "#1B7F79", "Death" = "#E8A838")) +
labs(
title = "Epidemic curve by week of onset",
x = "Epidemiological week",
y = "Number of cases",
fill = "Outcome"
) +
theme_bw() +
theme(legend.position = "bottom")
# Age-sex pyramid
ggplot(df, aes(x = age_group, fill = sex)) +
geom_bar(data = filter(df, sex == "Male"), aes(y = after_stat(count))) +
geom_bar(data = filter(df, sex == "Female"), aes(y = -after_stat(count))) +
coord_flip() +
labs(title = "Age-sex pyramid") +
theme_minimal()
# Forest plot of odds ratios
library(broom)
tidy(mod_log, conf.int = TRUE, exponentiate = TRUE) %>%
filter(term != "(Intercept)") %>%
ggplot(aes(x = estimate, xmin = conf.low, xmax = conf.high, y = term)) +
geom_pointrange() +
geom_vline(xintercept = 1, linetype = "dashed") +
scale_x_log10() +
labs(x = "Odds ratio (95% CI)", y = NULL, title = "Logistic regression results")patchwork [PKG]Purpose: Combine multiple ggplot2 figures into a single composite figure with simple arithmetic operators.
library(patchwork)
p1 <- ggplot(df, aes(x = age)) + geom_histogram() + labs(title = "A")
p2 <- ggplot(df, aes(x = sex, fill = outcome)) + geom_bar() + labs(title = "B")
p3 <- ggplot(df, aes(x = bmi, y = age)) + geom_point() + labs(title = "C")
# Arrange side by side
p1 | p2
# Stack vertically
p1 / p2
# Complex layout
(p1 | p2) / p3
# Add figure labels (A, B, C)
(p1 | p2) / p3 + plot_annotation(tag_levels = "A")survminer (see Survival Analysis section)plotly [PKG]Purpose: Create interactive HTML charts from scratch or convert ggplot2 plots with a single function call.
library(plotly)
# Convert existing ggplot to interactive
ggplotly(my_ggplot)
# Build a plotly chart directly
plot_ly(df, x = ~age, y = ~bmi, color = ~sex, type = "scatter", mode = "markers") %>%
layout(title = "BMI vs Age", xaxis = list(title = "Age"), yaxis = list(title = "BMI"))gganimate [PKG]Purpose: Add animation to ggplot2 charts — ideal for showing epidemic progression over time.
library(gganimate)
p <- ggplot(df_time, aes(x = week, y = n_cases, group = district, colour = district)) +
geom_line() +
geom_point() +
transition_reveal(week) + # animate along week axis
labs(title = "Week: {frame_along}", x = "Epi week", y = "Cases")
animate(p, fps = 10, duration = 8, width = 800, height = 500)
anim_save("epicurve_animated.gif")corrplot [PKG]Purpose: Visualise correlation matrices with colour, shape, and size encoding.
library(corrplot)
cor_mat <- cor(df[, numeric_vars], use = "complete.obs")
corrplot(cor_mat,
method = "circle",
type = "upper",
tl.col = "black",
col = colorRampPalette(c("#E8A838", "white", "#1B7F79"))(200))epitools [PKG]Purpose: Calculate attack rates, odds ratios, risk ratios, and 2×2 table statistics with confidence intervals.
library(epitools)
# 2x2 table: exposure vs disease
tab <- table(df$exposed, df$case)
epitable(tab) # display formatted table
# Risk ratio
riskratio(tab, method = "wald")
# Odds ratio
oddsratio(tab, method = "fisher")
# Attack rate table (age-stratified)
agecat <- cut(df$age, breaks = c(0, 5, 15, 25, 45, 65, Inf), right = FALSE)
epitab(df$case, df$exposed, rev = "b")epiR [PKG]Purpose: Comprehensive epi measures: standardised mortality ratios, population attributable fractions, sample size calculations, and diagnostic test statistics.
library(epiR)
# 2x2 table analysis (a=exposed cases, b=exposed controls, c=unexp. cases, d=unexp. controls)
dat <- as.table(matrix(c(45, 15, 30, 120), nrow = 2))
epi.2by2(dat, method = "cohort.count")
# Population attributable fraction
epi.2by2(dat, method = "cohort.count")$massoc.detail$PAFRisk.strata.wald
# Sample size for cohort study
epi.sscohortt(irexp1 = 0.05, irexp0 = 0.02, FU = 2, n = NA, power = 0.80)
# Sensitivity, specificity, PPV, NPV
epi.tests(dat)EpiEstim [PKG]Purpose: Estimate the instantaneous reproduction number (Rt) from incidence time series data, with uncertainty quantification.
library(EpiEstim)
# Estimate Rt using a parametric serial interval
res <- estimate_R(
incid = df_incid$n_cases, # daily case counts
method = "parametric_si",
config = make_config(list(
mean_si = 4.7, # mean serial interval
std_si = 2.9
))
)
# Plot Rt over time
plot(res, what = "R")
# Extract Rt estimates
res$Rincidence2 [PKG]Purpose: Compute and visualise disease incidence (epicurves) over time from linelist or aggregated data. Handles daily, weekly, and monthly intervals.
library(incidence2)
# Build incidence object from linelist
inc <- incidence(
df,
date_index = "date_onset",
interval = "week",
groups = "district"
)
# Plot epicurve
plot(inc, fill = "district", title = "Cases by epi week")
# Stacked or grouped
plot(inc, fill = "district", stack = TRUE)
# Extract counts
as.data.frame(inc)epicontacts [PKG]Purpose: Build and analyse transmission networks from linelist and contact tracing data. Compute network statistics and visualise chains of transmission.
library(epicontacts)
# Build epicontacts object
epc <- make_epicontacts(
linelist = df_cases,
contacts = df_contacts,
id = "case_id",
from = "infector_id",
to = "case_id",
directed = TRUE
)
# Summary
summary(epc)
# Visualise transmission chains
plot(epc, node_shape = "sex", node_color = "outcome")
# Network statistics
get_degree(epc, type = "out") # number of secondary casesmatchit [PKG]Purpose: Propensity score matching, covariate matching, and other matching methods for causal inference in observational studies.
library(MatchIt)
# Nearest-neighbour propensity score matching (1:1)
m.out <- matchit(
treated ~ age + sex + hiv + malnutrition,
data = df,
method = "nearest",
ratio = 1
)
summary(m.out)
plot(m.out, type = "density") # balance plot
# Extract matched dataset
df_matched <- match.data(m.out)
# Analyse matched data (weighted regression)
mod_matched <- glm(death ~ treated + age + sex,
data = df_matched,
weights = weights,
family = binomial)WeightIt [PKG]Purpose: Estimate inverse probability weights (IPW) for average treatment effect (ATE/ATT) estimation.
library(WeightIt)
# IPW with logistic propensity model
w.out <- weightit(
treated ~ age + sex + hiv,
data = df,
method = "ps",
estimand = "ATE"
)
summary(w.out)
df$ipw <- w.out$weights
# Weighted outcome model
mod_ipw <- glm(death ~ treated,
data = df,
weights = ipw,
family = binomial)cobalt [PKG]Purpose: Assess covariate balance before and after matching or weighting with tabular and graphical summaries.
library(cobalt)
library(MatchIt)
# Balance table (standardised mean differences)
bal.tab(m.out, stats = c("m", "v"), un = TRUE)
# Love plot (visual balance check)
love.plot(m.out,
stats = c("mean.diffs"),
threshold = 0.1, # SMD < 0.1 considered balanced
abs = TRUE)[BASE]# Chi-squared test of independence
chisq.test(table(df$sex, df$outcome))
# Fisher's exact test (small samples)
fisher.test(table(df$exposed, df$case))
# Two-sample t-test
t.test(age ~ sex, data = df, var.equal = FALSE)
# Wilcoxon rank-sum test (non-parametric)
wilcox.test(age ~ sex, data = df)
# Proportion test
prop.test(x = c(45, 30), n = c(100, 120))
# McNemar's test (paired binary data)
mcnemar.test(table(df$test1, df$test2))
# Shapiro-Wilk normality test
shapiro.test(df$age)
# Kruskal-Wallis test (non-parametric ANOVA)
kruskal.test(bmi ~ district, data = df)sf [PKG]Purpose: Handle spatial vector data (points, lines,
polygons) in Simple Feature format. Integrates with dplyr
and ggplot2.
library(sf)
# Read a shapefile
districts <- st_read("data/shapefiles/districts.shp")
# Spatial join: assign district to each case by coordinates
df_sf <- st_as_sf(df, coords = c("longitude", "latitude"), crs = 4326)
df_joined <- st_join(df_sf, districts, join = st_within)
# Plot with ggplot2
ggplot(districts) +
geom_sf(aes(fill = n_cases)) +
scale_fill_viridis_c() +
labs(title = "Cases by district", fill = "N cases")
# Calculate area
districts$area_km2 <- as.numeric(st_area(districts)) / 1e6tmap [PKG]Purpose: Thematic maps — both static (print-quality) and interactive (leaflet-based) — with a ggplot2-like syntax.
library(tmap)
# Static map
tm_shape(districts) +
tm_polygons("n_cases", palette = "Blues", title = "Cases") +
tm_layout(title = "Disease distribution by district")
# Interactive mode (opens in browser)
tmap_mode("view")
tm_shape(districts) +
tm_polygons("n_cases", popup.vars = c("district", "n_cases"))
# Switch back to static
tmap_mode("plot")spdep [PKG]Purpose: Spatial dependence statistics — define spatial weights, compute Moran’s I for spatial autocorrelation, and spatial lag/error models.
library(spdep)
# Build spatial weights (queen contiguity)
nb <- poly2nb(districts, queen = TRUE)
wts <- nb2listw(nb, style = "W")
# Moran's I (test for spatial autocorrelation)
moran.test(districts$n_cases, wts)
# Local Moran's I (LISA)
lisa <- localmoran(districts$n_cases, wts)
districts$local_moran <- lisa[, 1]EpiNow2 [PKG]Purpose: Real-time estimation of Rt with adjustment for reporting delays and right-truncation. Wraps Stan for full Bayesian inference.
library(EpiNow2)
# Specify generation time distribution
generation_time <- get_generation_time(disease = "SARS-CoV-2", source = "ganyani")
# Specify incubation period
incubation_period <- get_incubation_period(disease = "SARS-CoV-2", source = "lauer")
# Estimate Rt and nowcast
estimates <- epinow(
reported_cases = df_cases, # data frame with columns: date, confirm
generation_time = generation_time_opts(generation_time),
delays = delay_opts(incubation_period),
rt = rt_opts(prior = list(mean = 2, sd = 0.2))
)
plot(estimates)
summary(estimates)projections [PKG]Purpose: Short-term incidence projections based on serial interval distributions and a given Rt estimate.
library(projections)
library(distcrete)
# Define serial interval
si <- distcrete("gamma", shape = 2.5, scale = 2, w = 0, interval = 1L)
# Project forward 14 days from current incidence
proj <- project(
x = inc, # incidence2 or incidence object
R = 1.3, # assumed Rt
si = si,
n_days = 14,
n_sim = 1000
)
plot(proj)
quantile(proj, c(0.025, 0.5, 0.975))distcrete [PKG]Purpose: Create discrete delay distributions (serial interval, generation time, incubation period) for use in epidemic models.
library(distcrete)
# Discrete gamma serial interval (mean ~4.7 days, sd ~2.9)
si <- distcrete("gamma", shape = 2.6, scale = 1.8, w = 0, interval = 1L)
# Sample from the distribution
si$r(100)
# Probability mass function
si$d(0:14)shiny [PKG]Purpose: Build interactive web applications in R — no JavaScript required. Perfect for surveillance dashboards and interactive analysis tools.
library(shiny)
# Minimal Shiny app structure
ui <- fluidPage(
titlePanel("Outbreak Dashboard"),
sidebarLayout(
sidebarPanel(
selectInput("district", "District:", choices = unique(df$district)),
dateRangeInput("dates", "Date range:", start = min(df$date_onset), end = max(df$date_onset))
),
mainPanel(
plotOutput("epicurve"),
tableOutput("summary_table")
)
)
)
server <- function(input, output) {
filtered <- reactive({
df %>%
filter(district == input$district,
date_onset >= input$dates[1],
date_onset <= input$dates[2])
})
output$epicurve <- renderPlot({
ggplot(filtered(), aes(x = week_onset)) +
geom_bar(fill = "#1B7F79") +
theme_bw()
})
}
shinyApp(ui = ui, server = server)flexdashboard [PKG]Purpose: Convert an R Markdown script into a multi-panel dashboard with minimal extra code. Supports static and Shiny-powered interactive versions.
# In the YAML header of your .Rmd:
# ---
# title: "Outbreak Dashboard"
# output:
# flexdashboard::flex_dashboard:
# orientation: rows
# vertical_layout: fill
# runtime: shiny # (optional, for interactive elements)
# ---
# Row 1 {data-height=300}
# ------------------------------------
# ### Active cases (value box)
# ```{r}
# valueBox(nrow(df), icon = "fa-users", color = "teal")
# ```
# ### Epicurve
# ```{r}
# plot(inc)
# ```rstanarm [PKG]Purpose: Bayesian regression using Stan with a
familiar lm()/glm() formula interface.
Requires minimal code changes from frequentist analyses.
library(rstanarm)
# Bayesian logistic regression
mod_bayes <- stan_glm(
death ~ age + sex + hiv,
data = df,
family = binomial(link = "logit"),
prior = normal(0, 2.5), # weakly informative prior on log-OR
chains = 4, iter = 2000
)
summary(mod_bayes)
posterior_interval(mod_bayes, prob = 0.95)
exp(posterior_interval(mod_bayes, prob = 0.95)) # OR credible intervalsbrms [PKG]Purpose: Bayesian multi-level modelling via Stan with an extended formula interface. Supports survival models, zero-inflation, ordinal outcomes, and more.
library(brms)
# Bayesian mixed logistic regression
mod_brms <- brm(
death ~ age + sex + (1 | district),
data = df,
family = bernoulli(),
prior = c(prior(normal(0, 2.5), class = "b")),
chains = 4, iter = 2000, cores = 4
)
summary(mod_brms)
conditional_effects(mod_brms) # marginal effects plotbayesplot [PKG]Purpose: Diagnostic visualisations for MCMC output and posterior distributions.
library(bayesplot)
# Trace plots (check chain mixing)
mcmc_trace(mod_brms, pars = c("b_age", "b_sexMale"))
# Posterior distributions
mcmc_areas(mod_brms, pars = c("b_age", "b_sexMale"), prob = 0.95)
# Posterior predictive check
pp_check(mod_brms, ndraws = 100)tidybayes [PKG]Purpose: Extract, summarise, and visualise posterior draws in tidy format — integrates with dplyr, ggplot2, and ggdist.
library(tidybayes)
# Extract draws
mod_brms %>%
spread_draws(b_age, b_sexMale) %>%
median_qi(b_age)
# Visualise posteriors
mod_brms %>%
gather_draws(b_age, b_sexMale) %>%
ggplot(aes(x = .value, y = .variable)) +
stat_halfeye() +
geom_vline(xintercept = 0, linetype = "dashed")tidymodels [PKG]Purpose: A unified framework for modelling and machine learning — data pre-processing (recipes), model specification (parsnip), cross-validation (rsample), and tuning (tune).
library(tidymodels)
# 1. Split data
set.seed(123)
splits <- initial_split(df, prop = 0.75, strata = death)
df_train <- training(splits)
df_test <- testing(splits)
# 2. Pre-processing recipe
rec <- recipe(death ~ age + sex + hiv + district, data = df_train) %>%
step_impute_median(age) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors())
# 3. Model specification
mod_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
# 4. Workflow
wf <- workflow() %>% add_recipe(rec) %>% add_model(mod_spec)
# 5. Tune with cross-validation
folds <- vfold_cv(df_train, v = 5, strata = death)
tuned <- tune_grid(wf, resamples = folds, grid = 20, metrics = metric_set(roc_auc))
# 6. Finalise & evaluate
best <- select_best(tuned, "roc_auc")
final_wf <- finalize_workflow(wf, best) %>% last_fit(splits)
collect_metrics(final_wf)glmnet [PKG]Purpose: Regularised regression (LASSO, ridge, elastic net) for variable selection and prediction in high-dimensional data.
library(glmnet)
# Prepare matrix input
X <- model.matrix(death ~ age + sex + hiv + district - 1, data = df)
y <- df$death
# LASSO (alpha = 1), ridge (alpha = 0), elastic net (0 < alpha < 1)
fit <- cv.glmnet(X, y, family = "binomial", alpha = 1, nfolds = 10)
# Optimal lambda
fit$lambda.min
fit$lambda.1se
# Coefficients at best lambda
coef(fit, s = "lambda.1se")
# Predictions
predict(fit, newx = X_test, s = "lambda.min", type = "response")mice [PKG]Purpose: Multiple imputation by chained equations (MICE) — handles missing data by creating multiple complete datasets and pooling results.
library(mice)
# Inspect missing data pattern
md.pattern(df)
# Impute (m = number of imputations, method per variable)
imp <- mice(
df,
m = 10,
method = c(age = "pmm", sex = "logreg", bmi = "pmm"), # predictive mean matching
seed = 123,
printFlag = FALSE
)
# Fit model on each imputed dataset and pool
fit_imp <- with(imp, glm(death ~ age + sex + bmi, family = binomial))
pooled <- pool(fit_imp)
summary(pooled)ggtree [PKG]Purpose: Visualise and annotate phylogenetic trees using a ggplot2-inspired grammar.
library(ggtree)
library(ape)
# Read a tree (Newick format)
tree <- read.tree("data/outbreak_tree.nwk")
# Basic tree plot
ggtree(tree) + geom_tiplab()
# Annotate with metadata
ggtree(tree) %<+% df_meta + # attach metadata
geom_tippoint(aes(color = district)) +
geom_tiplab(aes(label = paste(id, date_onset))) +
scale_color_brewer(palette = "Set2") +
theme_tree2()
# Rectangular or circular layout
ggtree(tree, layout = "circular") + geom_tiplab()ape [PKG]Purpose: Analysis of phylogenetics and evolution — read/write tree files, compute pairwise distances, maximum likelihood, and ancestral reconstruction.
library(ape)
# Read tree formats
tree <- read.tree("tree.nwk") # Newick
tree <- read.nexus("tree.nex") # NEXUS
# Tree statistics
Ntip(tree) # number of tips
Nnode(tree) # number of internal nodes
# Compute pairwise cophenetic distances
dist_matrix <- cophenetic(tree)
# Root the tree
tree_rooted <- root(tree, outgroup = "reference_seq")
# Write tree
write.tree(tree, file = "outputs/tree_rooted.nwk")treeio [PKG]Purpose: Read phylogenetic files from diverse
formats (BEAST, MrBayes, RAxML, IQ-TREE) and convert to treedata objects
for use with ggtree.
library(treeio)
# Read BEAST output
beast_tree <- read.beast("data/mcmc.tree")
# Read IQ-TREE
iqtree <- read.iqtree("data/output.treefile")
# Read RAxML
raxml <- read.raxml("data/RAxML_bipartitions.result")
# Convert to tibble for inspection
as_tibble(beast_tree)
# Combine with ggtree
ggtree(beast_tree, aes(color = rate)) +
geom_tiplab() +
scale_color_viridis_c()# Ensure pacman is installed
if (!require("pacman")) install.packages("pacman")
# Install and load all packages in this guide
pacman::p_load(
# Project management
here, rio, openxlsx, renv, remotes,
# Import
readr, haven, readxl, data.table, jsonlite, DBI, RSQLite,
# Cleaning
tidyverse, janitor, linelist, naniar,
# EDA
skimr, DataExplorer, psych, Hmisc, rstatix,
# Regression
car, broom, lmtest, sandwich, performance, easystats,
pROC, logistf, nnet, MASS, ResourceSelection, marginaleffects,
# Survival
survival, survminer, flexsurv, cmprsk, timereg,
# Count models
pscl, AER, countreg,
# Mixed models
lme4, nlme, lmerTest, glmmTMB, emmeans,
# Tables & reporting
gtsummary, gt, flextable, tableone, arsenal, knitr, officer, rmarkdown,
# Visualisation
ggplot2, patchwork, cowplot, ggpubr, ggrepel, gghighlight,
plotly, gganimate, RColorBrewer, corrplot,
# Epi-specific
epitools, epiR, EpiEstim, EpiNow2, incidence2, i2extras,
epicontacts, epitrix, projections, distcrete,
MatchIt, WeightIt, cobalt,
# GIS
sf, tmap, spdep, SpatialEpi,
# Dashboards
shiny, flexdashboard, reportfactory,
# Bayesian
rstanarm, brms, bayesplot, tidybayes,
# Machine learning
tidymodels, caret, glmnet, randomForest, xgboost, mice,
# Phylogenetics
ggtree, ape, treeio
)
# Bioconductor packages (genomics)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(c("DESeq2", "limma", "SNPRelate"))#> R version 4.5.2 (2025-10-31 ucrt)
#> Platform: x86_64-w64-mingw32/x64
#> Running under: Windows 11 x64 (build 26100)
#>
#> Matrix products: default
#> LAPACK version 3.12.1
#>
#> locale:
#> [1] LC_COLLATE=English_Zambia.utf8 LC_CTYPE=English_Zambia.utf8
#> [3] LC_MONETARY=English_Zambia.utf8 LC_NUMERIC=C
#> [5] LC_TIME=English_Zambia.utf8
#>
#> time zone: Africa/Lusaka
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.37 R6_2.6.1 fastmap_1.2.0 xfun_0.52
#> [5] cachem_1.1.0 knitr_1.51 htmltools_0.5.8.1 rmarkdown_2.30
#> [9] lifecycle_1.0.5 cli_3.6.5 sass_0.4.10 jquerylib_0.1.4
#> [13] compiler_4.5.2 rstudioapi_0.17.1 tools_4.5.2 evaluate_1.0.4
#> [17] bslib_0.9.0 yaml_2.3.10 otel_0.2.0 jsonlite_2.0.0
#> [21] rlang_1.1.6
Generated with R Markdown · 2026-03-31