Introduction

This project demonstrates analysis of a large synthetic health dataset.

Odds Ratio Results

# Load datasets
persons <- fread("persons.csv")
events  <- fread("disease_events.csv")
hosp    <- fread("hospitalizations.csv")
neigh   <- fread("neighbourhoods.csv")

# Mark hospitalized events: 1L when the event's id appears in the
# hospitalizations table, 0L otherwise.
#
# Guard the join key first. `hosp$event_id` evaluates to NULL when the column
# is absent, and `x %in% NULL` is FALSE for every element, so a missing or
# misnamed key would silently flag every event as not hospitalized.
stopifnot(
  "event_id" %in% names(events),
  "event_id" %in% names(hosp)
)
events[, hospitalized := as.integer(event_id %in% hosp[["event_id"]])]

# A logistic model fitted on an outcome with no positive cases is degenerate
# (intercept odds near zero, every other odds ratio equal to 1), so stop here
# rather than report meaningless estimates downstream.
if (!any(events[["hospitalized"]] == 1L)) {
  stop(
    "No event_id in disease_events.csv matches hospitalizations.csv; ",
    "check that the event_id columns use the same values and formatting.",
    call. = FALSE
  )
}

# Build dataset
# Event rows are joined to person-level covariates on person_id, then to the
# neighbourhood deprivation quintile on neighbourhood_id. Both joins use
# merge()'s default (inner join), so rows without a match on either key are
# dropped. Intermediates live inside local() so only `df` is created.
df <- local({
  event_part <- events[, .(person_id, condition, hospitalized)]

  person_part <- persons[, .(
    person_id,
    age_2024 = 2024 - birth_year,  # age in years as of 2024
    sex,
    bmi,
    diabetes,
    hypertension,
    copd_asthma,
    immunocompromised,
    cancer_history,
    neighbourhood_id
  )]

  neigh_part <- neigh[, .(neighbourhood_id, deprivation_quintile)]

  events_with_person <- merge(event_part, person_part, by = "person_id")
  merge(events_with_person, neigh_part, by = "neighbourhood_id")
})

# Logistic regression
# Outcome: hospitalized (0/1). Age and BMI are standardised with scale(), so
# their coefficients are per one standard deviation; deprivation quintile is
# entered as a categorical factor.
m <- glm(
  formula = hospitalized ~
    condition +
    sex +
    scale(age_2024) +
    scale(bmi) +
    diabetes +
    hypertension +
    copd_asthma +
    immunocompromised +
    cancer_history +
    factor(deprivation_quintile),
  family = binomial(),
  data = df
)

# Robust SEs + tidy results
# Re-test the model coefficients using a heteroskedasticity-consistent (HC0)
# sandwich covariance matrix. The argument is spelled `vcov.` (with the
# trailing dot) to match coeftest()'s formal exactly instead of relying on
# partial argument matching.
coefs <- coeftest(m, vcov. = sandwich::vcovHC(m, type = "HC0"))
tidy_table <- broom::tidy(coefs)

# Exponentiate the log-odds estimates and their Wald limits
# (estimate +/- 1.96 robust standard errors) to get odds ratios with
# approximate 95% confidence intervals.
# Note: for the "(Intercept)" row the exponentiated value is the baseline
# odds, not an odds ratio.
results <- tidy_table %>%
  transmute(
    term,
    OR  = exp(estimate),
    LCL = exp(estimate - 1.96 * std.error),
    UCL = exp(estimate + 1.96 * std.error),
    p.value
  )

head(results, 10)
## # A tibble: 10 × 5
##    term                         OR      LCL      UCL p.value
##    <chr>                     <dbl>    <dbl>    <dbl>   <dbl>
##  1 (Intercept)           2.90 e-12 2.84e-12 2.97e-12   0    
##  2 conditionInfluenza    1.00 e+ 0 9.85e- 1 1.02e+ 0   1.000
##  3 conditionRSV          1.00 e+ 0 9.78e- 1 1.02e+ 0   1.000
##  4 conditionTuberculosis 1.00 e+ 0 9.08e- 1 1.10e+ 0   1.000
##  5 sexM                  1.000e+ 0 9.86e- 1 1.01e+ 0   1.000
##  6 scale(age_2024)       1.00 e+ 0 9.93e- 1 1.01e+ 0   1.000
##  7 scale(bmi)            1.00 e+ 0 9.93e- 1 1.01e+ 0   1.000
##  8 diabetes              1.00 e+ 0 9.79e- 1 1.02e+ 0   1.000
##  9 hypertension          1.000e+ 0 9.85e- 1 1.02e+ 0   1.000
## 10 copd_asthma           1.00 e+ 0 9.78e- 1 1.02e+ 0   1.000

Interpretation