library(tidyverse)
library(lubridate)
library(readxl)
library(arrow)
library(scales)
library(patchwork)
library(broom)
library(testthat)
library(knitr)
patients <- read_csv("data/patients.csv", show_col_types = FALSE)
symptoms <- read_csv("data/symptoms.csv", show_col_types = FALSE)
medications <- read_csv("data/medications.csv", show_col_types = FALSE)
conditions <- read_excel("data/conditions.xlsx")
encounters <- read_parquet("data/encounters.parquet")
cat("patients: ", nrow(patients), "x", ncol(patients), "\n")
## patients: 10000 x 27
cat("encounters: ", nrow(encounters), "x", ncol(encounters), "\n")
## encounters: 104262 x 15
cat("symptoms: ", nrow(symptoms), "x", ncol(symptoms), "\n")
## symptoms: 1423 x 9
cat("medications: ", nrow(medications), "x", ncol(medications), "\n")
## medications: 22237 x 13
cat("conditions: ", nrow(conditions), "x", ncol(conditions), "\n")
## conditions: 4149 x 6
The five tables all revolve around PATIENT (a UUID). The
patients table is the central reference — every other table has a
PATIENT column that maps to PATIENT_ID in
patients.
On top of that, encounters, medications, and conditions share an
ENCOUNTER column (which maps to Id in
encounters). So the relationships are:
PATIENT_ID = PATIENT (one patient, many visits)PATIENT_ID = PATIENT (one patient, one or more symptom
rows)PATIENT_ID = PATIENTPATIENT_ID = PATIENTId = ENCOUNTER (meds prescribed during a visit)Id = ENCOUNTER (conditions diagnosed at a visit)Symptoms doesn’t have an encounter ID, so it’s only linked through the patient.
Note: the encounters file is Parquet — I read it with
arrow::read_parquet()while the other files are CSV/XLSX.
I ran into a few issues while exploring the data:
"Rash:34;Joint Pain:39;Fatigue:9;Fever:12". Had to parse
this out with regex into separate numeric columns.There are two ways to slice “Lupus patients” here, and they give different counts:
conditionssymptomsThe symptom subset is smaller. I’ll use the diagnosed cohort for demographics and med trends, and the symptom subset for sections that need symptom scores (sections 3.4, 4, and 5).
# Lowercase all IDs so joins actually work
patients <- patients %>% mutate(PATIENT_ID = tolower(PATIENT_ID))
encounters <- encounters %>% mutate(Id = tolower(Id), PATIENT = tolower(PATIENT))
symptoms <- symptoms %>% mutate(PATIENT = tolower(PATIENT))
medications <- medications %>% mutate(PATIENT = tolower(PATIENT), ENCOUNTER = tolower(ENCOUNTER))
conditions <- conditions %>% mutate(PATIENT = tolower(PATIENT), ENCOUNTER = tolower(ENCOUNTER))
# Standardize medication names
medications <- medications %>% mutate(DESCRIPTION = str_to_upper(str_trim(DESCRIPTION)))
cat("Unique medication descriptions after cleaning:", n_distinct(medications$DESCRIPTION), "\n")
## Unique medication descriptions after cleaning: 5
# Parse dates properly
patients <- patients %>%
mutate(BIRTHDATE = as.Date(BIRTHDATE), DEATHDATE = as.Date(DEATHDATE))
encounters <- encounters %>%
mutate(START = ymd_hms(START), STOP = ymd_hms(STOP))
medications <- medications %>%
mutate(START = ymd_hms(START), STOP = ymd_hms(STOP))
conditions <- conditions %>%
mutate(START = as.Date(START), STOP = as.Date(STOP))
# Extract the 4 symptom scores from the packed string
symptoms <- symptoms %>%
mutate(
Rash = as.integer(str_extract(SYMPTOMS, "(?<=Rash:)\\d+")),
Joint_Pain = as.integer(str_extract(SYMPTOMS, "(?<=Joint Pain:)\\d+")),
Fatigue = as.integer(str_extract(SYMPTOMS, "(?<=Fatigue:)\\d+")),
Fever = as.integer(str_extract(SYMPTOMS, "(?<=Fever:)\\d+"))
)
cat("Symptoms before filtering:", nrow(symptoms), "rows,", n_distinct(symptoms$PATIENT), "patients\n")
## Symptoms before filtering: 1423 rows, 948 patients
# Keep only Lupus rows and deduplicate to one row per patient
symptoms <- symptoms %>%
filter(str_to_upper(PATHOLOGY) == "LUPUS ERYTHEMATOSUS") %>%
distinct(PATIENT, .keep_all = TRUE)
cat("After filtering to Lupus only:", nrow(symptoms), "rows,", n_distinct(symptoms$PATIENT), "patients\n")
## After filtering to Lupus only: 948 rows, 948 patients
# Fill in the missing GENDER from patients table
gender_lookup <- patients %>% select(PATIENT_ID, PAT_GENDER = GENDER)
symptoms <- symptoms %>%
mutate(GENDER = na_if(as.character(GENDER), "NaN")) %>%
left_join(gender_lookup, by = c("PATIENT" = "PATIENT_ID")) %>%
mutate(GENDER = coalesce(GENDER, PAT_GENDER)) %>%
select(-PAT_GENDER)
cat("GENDER NAs remaining:", sum(is.na(symptoms$GENDER)), "\n")
## GENDER NAs remaining: 0
symptoms %>%
select(PATIENT, PATHOLOGY, Rash, Joint_Pain, Fatigue, Fever) %>%
head(5) %>%
kable(caption = "Parsed symptom scores (first 5 rows)")
| PATIENT | PATHOLOGY | Rash | Joint_Pain | Fatigue | Fever |
|---|---|---|---|---|---|
| 28d7b56c-6056-d0a2-2991-39d6e917216c | Lupus erythematosus | 34 | 39 | 9 | 12 |
| 6c434506-fb4b-3e3f-c19d-553dec3b6c17 | Lupus erythematosus | 19 | 44 | 48 | 15 |
| 44a8ca45-6c6e-38bb-fac0-ddbf7a7ee3a4 | Lupus erythematosus | 2 | 32 | 12 | 6 |
| 780ec78c-22a0-fcdb-17c6-ae9b2fcace9c | Lupus erythematosus | 30 | 30 | 41 | 19 |
| cf5956bb-34f2-841b-2505-57b99991c377 | Lupus erythematosus | 28 | 26 | 33 | 5 |
Running some basic checks to make sure the cleaning didn’t break anything and the data is consistent.
patient_ids <- unique(patients$PATIENT_ID)
invisible(capture.output({
test_that("All medication patients exist in patients table", {
expect_true(all(medications$PATIENT %in% patient_ids))
})
test_that("All encounter patients exist in patients table", {
expect_true(all(encounters$PATIENT %in% patient_ids))
})
test_that("Patient IDs are unique", {
expect_equal(nrow(patients), n_distinct(patients$PATIENT_ID))
})
test_that("Symptoms table has one row per patient after dedup", {
expect_equal(nrow(symptoms), n_distinct(symptoms$PATIENT))
})
}))
cat("\n**All checks passed.**\n")
All checks passed.
# Diagnosed cohort
lupus_patient_ids <- conditions %>%
filter(str_to_upper(DESCRIPTION) == "LUPUS ERYTHEMATOSUS") %>%
pull(PATIENT) %>%
unique()
lupus_patients <- patients %>% filter(PATIENT_ID %in% lupus_patient_ids)
# Symptom-observed subset
symptom_patient_ids <- unique(symptoms$PATIENT)
n_total <- n_distinct(patients$PATIENT_ID)
n_diagnosed <- n_distinct(lupus_patients$PATIENT_ID)
n_symptom <- n_distinct(symptom_patient_ids)
# Lupus-directed meds — used in sections 3.2, 4, and 5
# Pulling this out early so later chunks don't depend on running 3.2 first
lupus_meds <- medications %>%
filter(PATIENT %in% lupus_patient_ids,
str_to_upper(REASONDESCRIPTION) == "LUPUS ERYTHEMATOSUS")
tibble(
Cohort = c("All patients", "Diagnosed Lupus cohort", "Symptom-observed subset"),
N = c(comma(n_total), comma(n_diagnosed), comma(n_symptom))
) %>% kable(caption = "Cohort sizes")
| Cohort | N |
|---|---|
| All patients | 10,000 |
| Diagnosed Lupus cohort | 2,784 |
| Symptom-observed subset | 948 |
cat("Total patients in dataset:", comma(n_total), "\n")
## Total patients in dataset: 10,000
cat("Diagnosed with Lupus: ", comma(n_diagnosed), "\n")
## Diagnosed with Lupus: 2,784
cat("With symptom data: ", comma(n_symptom), "\n")
## With symptom data: 948
So we have 10,000 patients total, 2,784 with a Lupus diagnosis, and 948 of those have symptom observations.
I’m only looking at prescriptions where
REASONDESCRIPTION is “Lupus erythematosus” — this filters
out things like Vitamin B12 or ferrous sulfate that are really being
prescribed for comorbid anemia, not for Lupus itself.
meds_over_time <- lupus_meds %>%
mutate(month = floor_date(START, "month")) %>%
group_by(month) %>%
summarise(n_meds = n_distinct(DESCRIPTION), .groups = "drop")
ggplot(meds_over_time, aes(month, n_meds)) +
geom_line(colour = "steelblue", linewidth = 0.7) +
geom_point(colour = "steelblue", size = 1.5) +
scale_y_continuous(breaks = pretty_breaks()) +
labs(title = "Distinct Lupus medications prescribed per month",
x = NULL, y = "Distinct medications") +
theme_minimal()
race_data <- lupus_patients %>%
count(RACE) %>%
mutate(pct = n / sum(n),
label = paste0(RACE, "\n", round(pct * 100, 1), "%"))
p1 <- ggplot(race_data, aes(x = "", y = n, fill = RACE)) +
geom_col(width = 1, colour = "white") +
coord_polar("y") +
geom_text(aes(label = label), position = position_stack(vjust = 0.5), size = 3) +
labs(title = "By Race", fill = "Race") +
theme_void()
gender_data <- lupus_patients %>%
count(GENDER) %>%
mutate(pct = n / sum(n),
label = paste0(GENDER, "\n", round(pct * 100, 1), "%"))
p2 <- ggplot(gender_data, aes(x = "", y = n, fill = GENDER)) +
geom_col(width = 1, colour = "white") +
coord_polar("y") +
geom_text(aes(label = label), position = position_stack(vjust = 0.5), size = 3.5) +
labs(title = "By Gender", fill = "Gender") +
theme_void()
p1 + p2
How many patients have all four symptom scores >= 30?
all_high <- symptoms %>%
filter(Rash >= 30, Joint_Pain >= 30, Fatigue >= 30, Fever >= 30)
cat(paste0("Patients with all 4 scores >= 30: ", nrow(all_high), " / ", nrow(symptoms),
" (", round(nrow(all_high) / nrow(symptoms) * 100, 1), "%)\n"))
## Patients with all 4 scores >= 30: 0 / 948 (0%)
cat(paste0("Fever range: ", min(symptoms$Fever, na.rm = TRUE), " to ", max(symptoms$Fever, na.rm = TRUE), "\n"))
## Fever range: 0 to 19
No patients met the threshold of 30 on all four symptom categories. This is not a code bug — in this dataset, Fever scores only range from 0 to 19, so the combined threshold is structurally impossible. Rash, Joint Pain, and Fatigue all go up to 49, but Fever never gets close to 30.
For this section I’m using the symptom-observed subset (948 patients) since we need actual symptom scores to measure outcomes.
My approach:
In the symptom-observed subset, the only primary therapy groups that emerged were Naproxen (NSAID) and Prednisone (Corticosteroid). Cyclosporine has plenty of prescriptions overall, but for every patient Prednisone always had at least as many Rx, so Cyclosporine never wins primary.
Important caveat about the therapy grouping: every patient in this subset receives both Naproxen and Prednisone. The 282 “Naproxen-primary” patients are actually those with exactly 1 Rx of each (a tie), where Naproxen wins by alphabetical tiebreak. The 666 “Prednisone-primary” patients are those with 2+ Prednisone Rx (who also have Cyclosporine). So this grouping really reflects prescription intensity rather than different treatment strategies. The near-identical composite scores across groups (~20.7 vs ~20.9) are consistent with this — there’s no meaningful clinical distinction here.
I’m keeping the comparison in the analysis since it demonstrates the methodology, but the lack of separation in the results is expected given how the groups are formed.
# Map drug names to therapy groups
classify_therapy <- function(desc) {
case_when(
str_detect(desc, "NAPROXEN") ~ "Naproxen (NSAID)",
str_detect(desc, "PREDNISONE") ~ "Prednisone (Corticosteroid)",
str_detect(desc, "CYCLOSPORINE") ~ "Cyclosporine (Immunosuppressant)",
str_detect(desc, "HYDROXYCHLOROQUINE") ~ "Hydroxychloroquine",
str_detect(desc, "VITAMIN B12") ~ "Vitamin B12",
TRUE ~ "Other"
)
}
lupus_meds <- lupus_meds %>% mutate(THERAPY = classify_therapy(DESCRIPTION))
lupus_meds %>% count(THERAPY, sort = TRUE) %>% kable(caption = "Prescription counts by therapy")
| THERAPY | n |
|---|---|
| Prednisone (Corticosteroid) | 10046 |
| Cyclosporine (Immunosuppressant) | 9094 |
| Naproxen (NSAID) | 2784 |
# For each patient, pick the therapy they got the most prescriptions for
sym_meds <- lupus_meds %>% filter(PATIENT %in% symptom_patient_ids)
patient_therapy <- sym_meds %>%
count(PATIENT, THERAPY, name = "n_rx") %>%
arrange(PATIENT, desc(n_rx), THERAPY) %>%
group_by(PATIENT) %>%
slice_head(n = 1) %>%
ungroup() %>%
select(PATIENT, THERAPY)
patient_therapy %>% count(THERAPY, sort = TRUE) %>% kable(caption = "Primary therapy distribution")
| THERAPY | n |
|---|---|
| Prednisone (Corticosteroid) | 666 |
| Naproxen (NSAID) | 282 |
# Composite score = average of the 4 symptom categories
# Note: Rash, Joint_Pain, and Fatigue range 0-49 while Fever only ranges 0-19,
# so this is an unweighted summary rather than a normalized severity index.
symptoms <- symptoms %>%
mutate(COMPOSITE_SCORE = rowMeans(across(c(Rash, Joint_Pain, Fatigue, Fever)), na.rm = TRUE))
# Age groups
assign_age_group <- function(age) {
cut(age, breaks = c(-Inf, 29, 54, Inf),
labels = c("Young (<30)", "Middle (30-54)", "Older (55+)"))
}
symptoms <- symptoms %>% mutate(AGE_GROUP = assign_age_group(AGE_BEGIN))
cohort <- symptoms %>% inner_join(patient_therapy, by = "PATIENT")
# Drop therapy groups with fewer than 10 patients — not enough to say anything meaningful
therapy_counts <- cohort %>% count(THERAPY, sort = TRUE)
keep <- therapy_counts %>% filter(n >= 10) %>% pull(THERAPY)
cohort_main <- cohort %>% filter(THERAPY %in% keep)
cat("Analysis cohort:", nrow(cohort_main), "patients\n")
## Analysis cohort: 948 patients
cohort_main %>% count(THERAPY, sort = TRUE) %>% kable()
| THERAPY | n |
|---|---|
| Prednisone (Corticosteroid) | 666 |
| Naproxen (NSAID) | 282 |
summary_age <- cohort_main %>%
group_by(THERAPY, AGE_GROUP) %>%
summarise(mean_score = mean(COMPOSITE_SCORE), .groups = "drop")
ggplot(summary_age, aes(THERAPY, mean_score, fill = AGE_GROUP)) +
geom_col(position = "dodge", colour = "black", linewidth = 0.3) +
labs(title = "Mean composite score by therapy and age group",
x = "Therapy", y = "Mean composite score", fill = "Age group") +
theme_bw() +
theme(axis.text.x = element_text(angle = 10, hjust = 1))
summary_eth <- cohort_main %>%
group_by(THERAPY, ETHNICITY) %>%
summarise(mean_score = mean(COMPOSITE_SCORE), .groups = "drop")
ggplot(summary_eth, aes(THERAPY, mean_score, fill = ETHNICITY)) +
geom_col(position = "dodge", colour = "black", linewidth = 0.3) +
labs(title = "Mean composite score by therapy and ethnicity",
x = "Therapy", y = "Mean composite score", fill = "Ethnicity") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 10, hjust = 1))
cohort_main <- cohort_main %>%
mutate(subgroup = paste(AGE_GROUP, "/", ETHNICITY))
heat <- cohort_main %>%
group_by(THERAPY, subgroup) %>%
summarise(mean_score = mean(COMPOSITE_SCORE), .groups = "drop")
ggplot(heat, aes(subgroup, THERAPY, fill = mean_score)) +
geom_tile(colour = "white", linewidth = 0.5) +
geom_text(aes(label = round(mean_score, 1)), size = 3) +
scale_fill_gradient(low = "#ffffcc", high = "#d7191c") +
labs(title = "Composite score heatmap: therapy x age/ethnicity",
x = "Subgroup", y = "Therapy", fill = "Mean score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 25, hjust = 1))
long <- cohort_main %>%
pivot_longer(c(Rash, Joint_Pain, Fatigue, Fever),
names_to = "symptom", values_to = "score")
ggplot(long, aes(THERAPY, score, fill = THERAPY)) +
geom_boxplot(outlier.size = 0.5) +
facet_wrap(~symptom, scales = "free_y", ncol = 2) +
labs(title = "Symptom distributions by therapy", x = NULL, y = "Score") +
theme_light() +
theme(axis.text.x = element_blank(), legend.position = "bottom")
overall_means <- cohort_main %>%
group_by(THERAPY) %>%
summarise(n = n(),
mean = round(mean(COMPOSITE_SCORE), 1),
sd = round(sd(COMPOSITE_SCORE), 1),
.groups = "drop")
kable(overall_means, col.names = c("Therapy", "N", "Mean", "SD"),
caption = "Composite score summary by therapy")
| Therapy | N | Mean | SD |
|---|---|---|---|
| Naproxen (NSAID) | 282 | 20.7 | 6.6 |
| Prednisone (Corticosteroid) | 666 | 20.9 | 6.3 |
age_spread <- summary_age %>%
group_by(THERAPY) %>%
summarise(range = round(max(mean_score) - min(mean_score), 1), .groups = "drop")
eth_spread <- summary_eth %>%
group_by(THERAPY) %>%
summarise(range = round(max(mean_score) - min(mean_score), 1), .groups = "drop")
cat("\nWithin-therapy spread across age groups:\n")
##
## Within-therapy spread across age groups:
print(age_spread)
## # A tibble: 2 × 2
## THERAPY range
## <chr> <dbl>
## 1 Naproxen (NSAID) 2.5
## 2 Prednisone (Corticosteroid) 0.5
cat("\nWithin-therapy spread across ethnicities:\n")
##
## Within-therapy spread across ethnicities:
print(eth_spread)
## # A tibble: 2 × 2
## THERAPY range
## <chr> <dbl>
## 1 Naproxen (NSAID) 0.3
## 2 Prednisone (Corticosteroid) 0.2
There isn’t much separation between therapies here. The means are nearly identical (~20.7 vs ~20.9, p = 0.66), standard deviations overlap, and the within-therapy differences by age/ethnicity are only a few points. The boxplots show the same thing — the ranges all overlap. As noted above, this is expected: the two “therapy groups” are really defined by a prescription-count tiebreak, not by genuinely different treatment approaches, so there’s no reason to expect different symptom profiles.
Ideally I’d look at whether symptoms decrease after starting medication — a before/after comparison. But each patient only has one symptom observation, so there’s no baseline to compare against.
Instead, I’m modelling: is a patient’s Fatigue score below the median? (low fatigue = 1, high fatigue = 0). It’s not a perfect setup, but it’s the best I can do with a single snapshot per patient.
I’m using therapy type, age, gender, race, ethnicity, and number of prescriptions as inputs, with a 75/25 train/test split.
rx_counts <- sym_meds %>% count(PATIENT, name = "n_rx")
model_df <- cohort_main %>%
left_join(rx_counts, by = "PATIENT") %>%
filter(!is.na(GENDER), !is.na(RACE), !is.na(ETHNICITY)) %>%
mutate(
LOW_FATIGUE = as.integer(Fatigue < median(Fatigue, na.rm = TRUE)),
THERAPY = droplevels(factor(THERAPY)),
GENDER = droplevels(factor(GENDER)),
RACE = fct_lump_min(factor(RACE), min = 20, other_level = "other"),
RACE = if ("white" %in% levels(RACE)) relevel(RACE, ref = "white") else RACE,
ETHNICITY = droplevels(factor(ETHNICITY))
)
# If any factor has only 1 level it'll break the regression, so drop those
single_lvl <- names(model_df)[sapply(model_df, function(x) is.factor(x) && nlevels(x) < 2)]
if (length(single_lvl) > 0) {
model_df <- model_df %>% select(-all_of(single_lvl))
}
cat("Modelling dataset:", nrow(model_df), "patients\n")
## Modelling dataset: 948 patients
cat("Fatigue median used for split:", median(model_df$Fatigue, na.rm = TRUE), "\n")
## Fatigue median used for split: 25.5
cat("LOW_FATIGUE distribution:", table(model_df$LOW_FATIGUE)["0"], "high /",
table(model_df$LOW_FATIGUE)["1"], "low\n")
## LOW_FATIGUE distribution: 474 high / 474 low
predictors <- c("AGE_BEGIN", "GENDER", "RACE", "ETHNICITY", "THERAPY", "n_rx")
predictors <- intersect(predictors, names(model_df))
fml <- as.formula(paste("LOW_FATIGUE ~", paste(predictors, collapse = " + ")))
fit <- glm(fml, data = model_df, family = binomial)
summary(fit)
##
## Call:
## glm(formula = fml, family = binomial, data = model_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.519741 0.289303 1.797 0.0724 .
## AGE_BEGIN -0.005203 0.004874 -1.067 0.2857
## GENDERM 0.006352 0.131218 0.048 0.9614
## RACEasian 0.282274 0.275372 1.025 0.3053
## RACEblack -0.445855 0.192070 -2.321 0.0203 *
## RACEother -0.351306 0.472883 -0.743 0.4575
## ETHNICITYnonhispanic -0.182894 0.132721 -1.378 0.1682
## THERAPYPrednisone (Corticosteroid) -0.021391 0.213379 -0.100 0.9201
## n_rx -0.021492 0.034913 -0.616 0.5382
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1314.2 on 947 degrees of freedom
## Residual deviance: 1303.2 on 939 degrees of freedom
## AIC: 1321.2
##
## Number of Fisher Scoring iterations: 4
or_tbl <- tidy(fit, conf.int = TRUE, exponentiate = TRUE) %>%
select(term, estimate, conf.low, conf.high, p.value)
kable(or_tbl, digits = 3, caption = "Odds ratios with 95% CIs",
col.names = c("Term", "OR", "CI low", "CI high", "p"))
| Term | OR | CI low | CI high | p |
|---|---|---|---|---|
| (Intercept) | 1.682 | 0.955 | 2.972 | 0.072 |
| AGE_BEGIN | 0.995 | 0.985 | 1.004 | 0.286 |
| GENDERM | 1.006 | 0.778 | 1.302 | 0.961 |
| RACEasian | 1.326 | 0.776 | 2.296 | 0.305 |
| RACEblack | 0.640 | 0.438 | 0.931 | 0.020 |
| RACEother | 0.704 | 0.268 | 1.767 | 0.458 |
| ETHNICITYnonhispanic | 0.833 | 0.642 | 1.080 | 0.168 |
| THERAPYPrednisone (Corticosteroid) | 0.979 | 0.644 | 1.487 | 0.920 |
| n_rx | 0.979 | 0.914 | 1.048 | 0.538 |
set.seed(42)
idx <- sample(nrow(model_df), floor(0.75 * nrow(model_df)))
train <- model_df[idx, ]
test <- model_df[-idx, ]
cat("Train:", nrow(train), "/ Test:", nrow(test), "\n")
## Train: 711 / Test: 237
fit_train <- glm(fml, data = train, family = binomial)
test$prob <- predict(fit_train, newdata = test, type = "response")
test$pred <- as.integer(test$prob >= 0.5)
cat("\nConfusion matrix:\n")
##
## Confusion matrix:
print(table(Predicted = test$pred, Actual = test$LOW_FATIGUE))
## Actual
## Predicted 0 1
## 0 40 27
## 1 87 83
acc <- mean(test$pred == test$LOW_FATIGUE)
cat(paste0("\nAccuracy: ", round(acc * 100, 1), "%\n"))
##
## Accuracy: 51.9%
# ROC
roc <- tibble(truth = test$LOW_FATIGUE, prob = test$prob) %>%
arrange(desc(prob)) %>%
mutate(tpr = cumsum(truth == 1) / sum(truth == 1),
fpr = cumsum(truth == 0) / sum(truth == 0))
auc_val <- abs(sum(diff(roc$fpr) * (head(roc$tpr, -1) + tail(roc$tpr, -1)) / 2))
ggplot(roc, aes(fpr, tpr)) +
geom_line(colour = "steelblue", linewidth = 0.8) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey50") +
annotate("text", x = 0.6, y = 0.3, label = paste("AUC =", round(auc_val, 3)), size = 4.5) +
labs(title = "ROC curve — low fatigue classification",
x = "False positive rate", y = "True positive rate") +
theme_minimal()
Quick guide to the table: an odds ratio (OR) above 1 means that group has higher odds of low fatigue compared to the reference; below 1 means lower odds. If the confidence interval includes 1.0, the difference isn’t statistically significant.
sig <- or_tbl %>% filter(term != "(Intercept)", p.value < 0.05)
if (nrow(sig) == 0) {
cat("None of the predictors reach significance at p < 0.05. ",
"Therapy type, demographics, and prescription volume don't seem to ",
"predict fatigue in this cohort.\n\n")
} else {
cat(nrow(sig), "predictor(s) reached nominal significance at p < 0.05:\n\n")
for (i in seq_len(nrow(sig))) {
cat(paste0("- **", sig$term[i], "**: OR = ", round(sig$estimate[i], 2),
" (95% CI: ", round(sig$conf.low[i], 2), "-", round(sig$conf.high[i], 2),
", p = ", round(sig$p.value[i], 3), ")\n"))
}
cat("\nHowever, overall model performance remained close to chance, ",
"so the model does not have useful predictive ability on its own.\n\n")
}
1 predictor(s) reached nominal significance at p < 0.05:
However, overall model performance remained close to chance, so the model does not have useful predictive ability on its own.
cat(paste0("**AUC = ", round(auc_val, 3), "**"))
AUC = 0.506
if (auc_val < 0.55) {
cat(" — basically a coin flip. The model isn't really distinguishing between the two groups.\n\n")
} else if (auc_val < 0.7) {
cat(" — the model is picking up a little signal, but not much.\n\n")
} else {
cat(" — decent predictive performance.\n\n")
}
— basically a coin flip. The model isn’t really distinguishing between the two groups.
cat(paste0("**Accuracy = ", round(acc * 100, 1), "%** on the held-out test set.\n\n"))
Accuracy = 51.9% on the held-out test set.
cat("**Caveats:**\n\n")
Caveats:
cat("- The main limitation is that each patient only has one symptom observation, so this is a snapshot rather than a before/after comparison. The assignment asks about symptom changes over time, but the data doesn't support that kind of analysis.\n")
cat("- The therapy grouping is driven by a prescription-count tiebreak rather than genuinely different treatment strategies (all patients receive the same drugs). The near-zero THERAPY coefficient (OR ~0.98, p = 0.92) is consistent with this.\n")
cat("- Patients weren't randomly assigned to therapies, so we can't draw conclusions about whether one treatment is better than another from this alone.\n")
cat("- This is simulated Synthea data, so any apparent associations should be interpreted cautiously and are not expected to reflect real clinical relationships.\n")