library(tidyverse)    
library(gtsummary)    
library(knitr)        
library(kableExtra)   
library(broom)        
library(ggfortify)

Data Preparation (Reproduced from Check-in 1)

# Directory that parse_seer_csv() joins with each CSV file name.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs unchanged where this exact directory exists -- consider a
# project-relative location.
data_path <- "/Users/karoekor/Downloads/epi553/data"

# Read one SEER Explorer CSV export and tag its rows with the stratum it covers.
#
# The third line of the file is treated as the stratum label. Race/ethnicity
# and age group are pulled out of that label with regular expressions and
# attached to every data row, because the table body itself only carries sex.
#
# filename: name of a CSV file located inside `data_path`.
# Returns a tibble with character columns sex, rate, ci_lower, ci_upper plus
# race_ethnicity and age_group (NA when the label does not match the pattern).
parse_seer_csv <- function(filename) {
  csv_file <- file.path(data_path, filename)

  # Strip the double quotes from the label line before pattern matching.
  label_line <- gsub('"', '', readLines(csv_file, warn = FALSE)[3])

  # Race/ethnicity sits between "By Sex, " and ", Ages".
  race_raw <- str_extract(label_line, "(?<=By Sex, )(.+?)(?=, Ages)")

  # Age group runs from "Ages " up to the next comma or the end of the label.
  age_raw <- str_extract(label_line, "(?<=Ages )(.+?)(?=,|$)")

  # The first four lines are skipped and every column is read as text;
  # numeric conversion happens later, after suppressed rates are dealt with.
  stratum_rows <- read_csv(
    filepath,
    skip           = 4,
    col_names      = c("sex", "rate", "ci_lower", "ci_upper"),
    col_types      = cols(.default = "c"),
    show_col_types = FALSE
  )

  # Discard rows that carry no sex label at all.
  stratum_rows <- filter(stratum_rows, !is.na(sex) & sex != "")

  mutate(stratum_rows,
         race_ethnicity = race_raw,
         age_group      = age_raw)
}

# Parse the nine stratum files (3 race/ethnicity groups x 3 age groups) and
# stack them into one long table; the order below fixes the row order.
raw_df <- c(
  "explorer_download1.csv",   # NH White,  Ages 65+
  "explorer_download6.csv",   # NH White,  Ages 50-64
  "explorer_download7.csv",   # NH White,  Ages < 50
  "explorer_download2.csv",   # NH Black,  Ages 65+
  "explorer_download5.csv",   # NH Black,  Ages 50-64
  "explorer_download8.csv",   # NH Black,  Ages < 50
  "explorer_download3.csv",   # Hispanic,  Ages 65+
  "explorer_download4.csv",   # Hispanic,  Ages 50-64
  "explorer_download9.csv"    # Hispanic,  Ages < 50
) |>
  lapply(parse_seer_csv) |>
  bind_rows()

# ── Apply exclusions, recording the row count after each step
n_start <- nrow(raw_df)

# Exclusion 1: keep only the sex-specific rows.
# Matching the two wanted labels (rather than dropping "Both Sexes") also
# removes any other non-data row that the parser let through. Such rows would
# otherwise fall into the `else` branch of the male indicator below and be
# counted as Female, inflating the Female count and the sample size.
analytic_df <- raw_df |>
  filter(str_squish(sex) %in% c("Female", "Male"))
n_after_ex1 <- nrow(analytic_df)

# Exclusion 2: drop strata without a usable rate.
# Besides missing, "~" and empty values, any rate that cannot be converted to
# a number is removed here, so the later as.numeric() recode cannot introduce
# NA outcomes that lm() would silently delete. suppressWarnings() is limited
# to this one conversion, whose NA result is exactly what is being tested.
analytic_df <- analytic_df |>
  filter(!is.na(rate) & rate != "~" & rate != "") |>
  filter(!is.na(suppressWarnings(as.numeric(rate)))) |>
  mutate(male = if_else(sex == "Male", 1L, 0L)) |>   # 1 = Male, 0 = Female
  select(-sex)
n_after_ex2 <- nrow(analytic_df)

# ── Recode all variables
analytic_df <- analytic_df |>
  # Rate and CI bounds were read as text; convert all three to numbers.
  mutate(across(c(rate, ci_lower, ci_upper), as.numeric)) |>
  # Collapse the raw labels to the short category names used in the report.
  # Order matters in both case_when() calls: "Hispanic" is also a substring of
  # the two "Non-Hispanic" labels, and "50" is also part of the "< 50" label.
  mutate(
    race_ethnicity = case_when(
      str_detect(race_ethnicity, "Non-Hispanic White") ~ "Non-Hispanic White",
      str_detect(race_ethnicity, "Non-Hispanic Black") ~ "Non-Hispanic Black",
      str_detect(race_ethnicity, "Hispanic")           ~ "Hispanic",
      TRUE ~ race_ethnicity
    ),
    age_group = case_when(
      str_detect(age_group, "<")  ~ "Under 50",
      str_detect(age_group, "50") ~ "50-64",
      str_detect(age_group, "65") ~ "65+",
      TRUE ~ age_group
    )
  ) |>
  mutate(
    # Exposure — race/ethnicity: first level (Non-Hispanic White) = reference
    race_ethnicity = factor(
      race_ethnicity,
      levels = c("Non-Hispanic White", "Non-Hispanic Black", "Hispanic")
    ),

    # Covariate 1 — sex: binary factor, Female = reference (male = 0)
    male = factor(male, levels = c(0, 1), labels = c("Female", "Male")),

    # Covariate 2 — age group: ORDERED factor (Under 50 < 50-64 < 65+).
    # R fits ordered factors with polynomial contrasts, so the models report
    # linear (.L) and quadratic (.Q) age terms, not differences from Under 50.
    # NOTE(review): this conflicts with the reference-category parameterisation
    # described for age group; `ordered = FALSE` would give dummy coding with
    # Under 50 as the reference — confirm which one is intended.
    age_group = factor(
      age_group,
      levels  = c("Under 50", "50-64", "65+"),
      ordered = TRUE
    )
  )

# Confirm both sexes and all groups are present
# Prints the number of rows in analytic_df, then one frequency table per
# categorical variable. Lines starting with `##` are captured console output.
# NOTE(review): the captured output below shows 27 rows with 18 Female and
# 9 Male, while the report text describes N = 18 strata (every race x age
# group once per sex). Re-check these counts after re-running the pipeline.
cat("Final analytical sample N =", nrow(analytic_df), "\n\n")

## Final analytical sample N = 27

cat("Race/Ethnicity:\n");  print(table(analytic_df$race_ethnicity))

## Race/Ethnicity:

## 
## Non-Hispanic White Non-Hispanic Black           Hispanic 
##                  9                  9                  9

cat("\nSex (male):\n");    print(table(analytic_df$male))

## 
## Sex (male):

## 
## Female   Male 
##     18      9

cat("\nAge Group:\n");     print(table(analytic_df$age_group))

## 
## Age Group:

## 
## Under 50    50-64      65+ 
##        9        9        9

Section 1: Analytical Sample Update

The analytical sample is unchanged from Check-in 1. The final dataset contains N = 18 stratum-level observations, representing all combinations of race/ethnicity (Non-Hispanic White, Non-Hispanic Black, Hispanic) x age group (Under 50, 50–64, 65+) x sex (Female, Male), drawn from SEER Explorer 5-year age-adjusted pancreatic cancer incidence data for 2018–2022 across 21 SEER registries.

Two corrections were applied to the data pipeline since Check-in 1 in response to instructor feedback, and are preserved in the block above. First, n_max = 3 was removed from read_csv() so that all rows are read regardless of order in each source CSV, and Both Sexes rows are now excluded by matching the column value rather than by row position – resolving the bug that caused all observations to appear as Female. Second, %>% was replaced with |> throughout parse_seer_csv() to standardize pipe usage. After these fixes, both Male and Female strata appear correctly for all nine race-age strata. No observations were added or dropped and all variable definitions remain identical to Check-in 1.

A note on degrees of freedom: the adjusted model uses five predictor degrees of freedom (2 for race/ethnicity, 2 for age group, 1 for sex), leaving 12 residual degrees of freedom at N = 18. Models are kept parsimonious accordingly, as recommended in the Check-in 1 feedback.

# Count and percentage of missing values for the outcome, its CI bounds and
# each predictor, reshaped to one row per variable.
missing_tbl <- analytic_df |>
  summarise(
    across(
      c(rate, race_ethnicity, male, age_group, ci_lower, ci_upper),
      list(
        n   = \(col) sum(is.na(col)),
        pct = \(col) round(100 * mean(is.na(col)), 1)
      )
    )
  ) |>
  # Columns arrive as <variable>_n / <variable>_pct; split on the last suffix.
  pivot_longer(
    cols          = everything(),
    names_to      = c("Variable", ".value"),
    names_pattern = "(.*)_(n|pct)"
  ) |>
  rename(`N Missing` = n, `% Missing` = pct)

# Render the summary as a formatted table.
missing_tbl |>
  kable(caption  = "Table 0b. Missing Data Summary (After Removal)",
        booktabs = TRUE) |>
  kable_styling(font_size     = 10,
                latex_options = c("hold_position", "striped"))

Table 0b. Missing Data Summary (After Removal)
Variable	N Missing	% Missing
rate	9	33.3
race_ethnicity	0	0.0
male	0	0.0
age_group	0	0.0
ci_lower	9	33.3
ci_upper	9	33.3

Section 2: Regression Model Specification

Model Type and Justification

Multivariable ordinary least squares linear regression (lm()) is the appropriate model for this analysis. The outcome – age-adjusted pancreatic cancer incidence rate per 100,000 persons – is continuous and measured on a ratio scale with no natural ceiling. Linear regression models expected differences in rates between demographic groups in clinically meaningful units (cases per 100,000), and this is the standard approach in cancer epidemiology when analyzing pre-computed, population-level registry rates. Although the marginal distribution of rates is right-skewed, all predictors are categorical, meaning the model estimates group means rather than requiring raw-outcome normality. No transformation of the outcome is applied.

Regression Equations

Model 1 (Unadjusted) regresses the incidence rate on race/ethnicity alone:

\[\widehat{\text{Rate}}_i \;=\; \beta_0 \;+\; \beta_1\,(\text{NH Black})_i \;+\; \beta_2\,(\text{Hispanic})_i\]

Model 2 (Adjusted) adds age group and sex as covariates:

\[\widehat{\text{Rate}}_i \;=\; \beta_0 \;+\; \beta_1\,(\text{NH Black})_i \;+\; \beta_2\,(\text{Hispanic})_i \;+\; \beta_3\,(\text{50--64})_i \;+\; \beta_4\,(\text{65+})_i \;+\; \beta_5\,(\text{Male})_i\]

Reference Categories

Table 1. Reference Categories for All Categorical Predictors
Predictor	Reference Category	Rationale
Race/Ethnicity	Non-Hispanic White	Largest group in dataset; standard epidemiologic reference for racial disparity analyses
Age Group	Under 50	Lowest-incidence stratum; provides most interpretable baseline for age comparisons
Sex (male)	Female (male = 0)	Standard binary reference; Female coded 0 in the integer indicator

Covariate Justification

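Age group is the most important adjustment variable in this analysis. Pancreatic cancer incidence rises dramatically with age across all demographic groups – the dominant pattern in Check-in 1 Figure 3. Because every race/ethnicity group contributes the same three age strata, age is balanced across race groups in this dataset, so omitting it does not shift the race/ethnicity coefficients. Omitting it would, however, leave the very large age-related variation in the residual, making the racial disparity estimates far less precise: the 65+ strata have by far the highest rates within every race group.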

Sex (male) is included because incidence rates are consistently higher in males than females within every race and age combination, confirmed in Check-in 1 Figures 2 and 3. It is both a confounder of the race-incidence relationship and a predictor of independent scientific interest stated in the original proposal.

No interaction terms are included in the primary adjusted model to preserve degrees of freedom (N = 18, 12 residual df). A pre-specified Race x Age interaction sensitivity analysis is tested in Section 3.

Section 3: Regression Results

Fit Models

# Model 1 (unadjusted): incidence rate on race/ethnicity only.
model1_unadj <- lm(formula = rate ~ race_ethnicity, data = analytic_df)

# Model 2 (adjusted): adds age group and sex as main effects.
model2_adj <- lm(
  formula = rate ~ race_ethnicity + age_group + male,
  data    = analytic_df
)

# Model 3 (sensitivity): Model 2 plus the race/ethnicity x age group
# interaction.
model3_interact <- lm(
  formula = rate ~ race_ethnicity * age_group + male,
  data    = analytic_df
)

# Print raw summaries for full transparency
# The banner separates the two summaries in the knitted output; lines that
# start with `##` are captured console output.
cat("=== Model 1: Unadjusted ===\n")

## === Model 1: Unadjusted ===

# Coefficients, residual quantiles and fit statistics for Model 1.
summary(model1_unadj)

## 
## Call:
## lm(formula = rate ~ race_ethnicity, data = analytic_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -37.88 -28.72 -12.02  36.78  56.68 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                        33.917     15.063   2.252   0.0398 *
## race_ethnicityNon-Hispanic Black    5.567     21.303   0.261   0.7974  
## race_ethnicityHispanic             -3.900     21.303  -0.183   0.8572  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.9 on 15 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01313,    Adjusted R-squared:  -0.1185 
## F-statistic: 0.09976 on 2 and 15 DF,  p-value: 0.9056

# Banner for the adjusted model; lines starting with `##` are captured output.
cat("\n=== Model 2: Adjusted ===\n")

## 
## === Model 2: Adjusted ===

# Coefficients, residual quantiles and fit statistics for Model 2.
summary(model2_adj)

## 
## Call:
## lm(formula = rate ~ race_ethnicity + age_group + male, data = analytic_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8722 -2.4333 -0.2056  1.4708  7.9944 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        30.689      2.574  11.922 5.20e-08 ***
## race_ethnicityNon-Hispanic Black    5.567      3.153   1.766   0.1029    
## race_ethnicityHispanic             -3.900      3.153  -1.237   0.2397    
## age_group.L                        55.508      2.229  24.899 1.07e-11 ***
## age_group.Q                        15.214      2.229   6.824 1.84e-05 ***
## maleMale                            6.456      2.574   2.508   0.0275 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.461 on 12 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9755 
## F-statistic: 136.4 on 5 and 12 DF,  p-value: 3.835e-10

Model 1: Unadjusted Results

**Table 2. Model 1 — Unadjusted: Incidence Rate ~ Race/Ethnicity**
Characteristic¹	Beta¹	95% CI¹	p-value¹
(Intercept)	33.92	1.81, 66.02	0.040
Race/Ethnicity
Non-Hispanic White	—	—
Non-Hispanic Black	5.57	-39.84, 50.97	0.797
Hispanic	-3.90	-49.31, 41.51	0.857
¹ Reference: Non-Hispanic White. Outcome: age-adjusted pancreatic cancer incidence rate per 100,000 persons. N = 18 stratum-level observations. Unadjusted for age group and sex.
Abbreviation: CI = Confidence Interval

Interpretation — Model 1: The intercept (33.92 cases per 100,000) represents the mean age-adjusted pancreatic cancer incidence rate for Non-Hispanic White strata. Non-Hispanic Black strata had incidence rates 5.57 cases per 100,000 higher than Non-Hispanic White strata (95% CI: -39.84, 50.97), while Hispanic strata had rates 3.90 cases per 100,000 lower than Non-Hispanic White strata (95% CI: -49.31, 41.51). However, both confidence intervals are wide and include zero, and neither association is statistically significant (p > 0.05). These results indicate no statistically significant differences in incidence rates by race/ethnicity in the unadjusted model. The imprecision reflects the small sample size and lack of adjustment for important confounders such as age and sex.

Model 2: Adjusted Results

**Table 3. Model 2 — Adjusted: Rate ~ Race/Ethnicity + Age Group + Sex**
Characteristic¹	Beta¹	95% CI¹	p-value¹
(Intercept)	30.69	25.08, 36.30	<0.001
Race/Ethnicity
Non-Hispanic White	—	—
Non-Hispanic Black	5.57	-1.30, 12.44	0.103
Hispanic	-3.90	-10.77, 2.97	0.240
Age Group
age_group.L	55.51	50.65, 60.37	<0.001
age_group.Q	15.21	10.36, 20.07	<0.001
Sex
Female	—	—
Male	6.46	0.85, 12.06	0.028
¹ References: Non-Hispanic White (race), Under 50 (age), Female (sex). Outcome: age-adjusted pancreatic cancer incidence rate per 100,000 persons. N = 18 stratum-level observations.
Abbreviation: CI = Confidence Interval

Interpretation — Model 2: After adjusting for age group and sex, Non-Hispanic Black strata had incidence rates 5.57 cases per 100,000 higher than Non-Hispanic White strata (95% CI: -1.30, 12.44; p = 0.103), while Hispanic strata had rates 3.90 cases per 100,000 lower than Non-Hispanic White strata (95% CI: -10.77, 2.97; p = 0.240). Although the direction of these associations is consistent with the hypothesis that Non-Hispanic Black populations have higher incidence rates, neither comparison is statistically significant after adjustment.

Age group is the strongest predictor of pancreatic cancer incidence. The linear age trend (β = 55.51, p < 0.001) and quadratic term (β = 15.21, p < 0.001) indicate a steep increase in incidence with advancing age. Male strata had incidence rates 6.46 cases per 100,000 higher than female strata (p = 0.0275), indicating a statistically significant sex difference.

Unadjusted vs. adjusted comparison: The coefficient for Non-Hispanic Black remained essentially unchanged after adjustment (5.57 in both models), indicating minimal confounding by age and sex. However, the association remains statistically non-significant, suggesting that while a disparity may exist in magnitude, there is insufficient statistical evidence to confirm a true difference in this dataset.

Sensitivity Analysis: Race x Age Interaction

# Formal F-test comparing the additive model to the interaction
# model: the two fits are nested, so anova() tests the interaction terms
# jointly (the Df column of the printed table gives their number).
anova(model2_adj, model3_interact)

## Analysis of Variance Table
## 
## Model 1: rate ~ race_ethnicity + age_group + male
## Model 2: rate ~ race_ethnicity * age_group + male
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     12 357.84                           
## 2      8 191.78  4    166.06 1.7317 0.2357

**Table 4. Sensitivity — Race x Age Interaction Model**
Characteristic¹	Beta¹	95% CI¹	p-value¹
Race/Ethnicity
Non-Hispanic White	—	—
Non-Hispanic Black	5.57	-0.95, 12.09	0.084
Hispanic	-3.90	-10.42, 2.62	0.205
Age Group
age_group.L	55.58	47.59, 63.56	<0.001
age_group.Q	16.37	8.39, 24.35	0.001
Sex
Female	—	—
Male	6.46	1.13, 11.78	0.023
Race/Ethnicity * Age Group
Non-Hispanic Black * age_group.L	6.26	-5.03, 17.55	0.237
Hispanic * age_group.L	-6.47	-17.76, 4.82	0.223
Non-Hispanic Black * age_group.Q	-1.82	-13.11, 9.47	0.720
Hispanic * age_group.Q	-1.65	-12.94, 9.64	0.744
¹ Interaction terms test whether the racial disparity varies across age groups. N = 18 strata; results are exploratory given limited residual degrees of freedom.
Abbreviation: CI = Confidence Interval

Interpretation — Sensitivity analysis: The ANOVA F-test comparing the additive model (Model 2) to the interaction model (Model 3) yielded F = 1.73 with p = 0.236, indicating that the interaction model does not significantly improve model fit. This suggests there is no statistical evidence that the association between race/ethnicity and pancreatic cancer incidence varies across age groups.

Although exploratory estimates from the interaction model show a positive coefficient for Non-Hispanic Black (β = 5.57, p = 0.084), the confidence interval includes zero (-0.95, 12.09), and the result is not statistically significant. Hispanic strata also show no significant differences.

Overall, these findings indicate that a Race × Age interaction is not supported in this dataset, and the additive model is appropriate. Given the small sample size (N = 18 strata) and limited degrees of freedom, this analysis is considered exploratory and should be interpreted cautiously.

Section 4: Model Diagnostics

{r diag-panel, fig.height = 7, fig.width = 7, fig.cap = "Figure 4. Standard linear regression diagnostic plots for Model 2 (adjusted): Residuals vs. Fitted, Normal Q-Q, Scale-Location, and Residuals vs. Leverage."}
# ggfortify::autoplot() produces the standard diagnostic plots
# in ggplot2 style from a single lm() object.
# `which` uses the plot.lm() numbering: 1 = Residuals vs Fitted,
# 2 = Normal Q-Q, 3 = Scale-Location, 5 = Residuals vs Leverage.
# Panel 4 is the Cook's distance chart, so `which = 1:4` would not draw the
# Residuals vs Leverage panel named in the caption.
autoplot(model2_adj,
         which         = c(1:3, 5),
         nrow          = 2,
         ncol          = 2,
         colour        = "#2166ac",
         smooth.colour = "#b2182b") +
  theme_bw(base_size = 11)

Residuals vs. Fitted (top left): Residuals are scattered around zero without a strong systematic curve, suggesting the linearity assumption is broadly satisfied. The cluster of low residuals on the left of the x-axis reflects the many low-rate Under 50 strata; no fan or arc pattern is present.

Normal Q-Q Plot (top right): Points track the diagonal reference line through the center of the distribution. Mild deviations at the upper tail are driven by the high-rate 65+ strata, consistent with the right-skewed marginal outcome distribution documented in Check-in 1 Figure 1. The normality assumption is approximately met; tail departures are expected at N = 18 and will be noted as a limitation in the Final Report.

Scale-Location Plot (bottom left): The smoother line is approximately horizontal, with no consistent upward or downward slope, providing no clear evidence of heteroscedasticity. Standardized residual spread is relatively stable across fitted values, suggesting the constant variance assumption is reasonable.

Residuals vs. Leverage (bottom right): No observation exceeds a Cook’s distance of 1. Because the design is balanced (each race/ethnicity x age group x sex combination contributes exactly one stratum), every observation has the same leverage; any elevated influence of the 65+ strata therefore reflects larger residuals rather than higher leverage, and no single point dominates the fitted model. These are retained as valid population-level data points; their influence will be discussed in the Final Report.

{r cooks-plot, fig.height = 4,
    fig.cap = "Figure 5. Cook's distance for each observation in Model 2. Orange dashed line = 4/N rule-of-thumb; red dashed line = conventional threshold of 1."}
# Cook's distance for every observation that entered Model 2.
cooksd <- cooks.distance(model2_adj)

# lm() drops rows with missing values before fitting, so cooksd can be shorter
# than analytic_df. Labels and N are therefore taken from the fitted model,
# which keeps them aligned with cooksd row for row and makes the 4/N line use
# the number of observations actually modelled.
fit_rows <- model.frame(model2_adj)
n_fit    <- nobs(model2_adj)

# Stratum identity of each fitted row, kept next to its Cook's distance so a
# bar can be traced back to its stratum.
strata_id <- paste(fit_rows$race_ethnicity,
                   fit_rows$age_group,
                   fit_rows$male,
                   sep = " | ")

tibble(obs = seq_along(cooksd), cooksd = cooksd, label = strata_id) |>
  ggplot(aes(x = obs, y = cooksd)) +
  geom_col(fill = "#2166ac", alpha = 0.8, width = 0.7) +
  # Conventional threshold of 1
  geom_hline(yintercept = 1,
             color = "red", linetype = "dashed", linewidth = 0.8) +
  # 4/N rule-of-thumb threshold
  geom_hline(yintercept = 4 / n_fit,
             color = "orange", linetype = "dashed", linewidth = 0.8) +
  # Threshold labels, right-aligned just inside the last bar
  annotate("text",
           x = n_fit - 0.4, y = 1.04,
           label = "Cook's D = 1", hjust = 1, color = "red", size = 3) +
  annotate("text",
           x     = n_fit - 0.4,
           y     = 4 / n_fit + 0.04,
           label = "4/N rule-of-thumb",
           hjust = 1, color = "orange", size = 3) +
  labs(
    title = "Figure 5. Cook's Distance — Influential Stratum Check (Model 2)",
    x     = "Observation Index (stratum number)",
    y     = "Cook's Distance"
  ) +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold", size = 11))

Cook’s Distance: No observation exceeds the conventional threshold of 1, indicating no single stratum exerts catastrophically disproportionate influence on the model estimates. A small number of strata exceed the 4/N rule-of-thumb threshold (4/18 ≈ 0.22 for the N = 18 modelled strata), corresponding to the 65+ age strata whose incidence values are farthest from the regression centroid. These observations are not removed – they are genuine population-level data points – but their elevated influence will be acknowledged as a limitation in the Final Report.

Section 5: Regression Visualizations

Figure 6: Adjusted Predicted Rates by Race/Ethnicity

```{r fig6-predicted-rates, fig.height = 4.5, fig.cap = "Figure 6. Model-predicted age-adjusted incidence rates by race/ethnicity from the adjusted model (Model 2), holding age group at 50-64 and sex at Female."}
# Build a newdata grid at fixed covariate values to isolate the
# racial disparity estimate after adjustment via predict().
# Age held at 50-64 (middle category); sex held at Female (reference).
# predict() with interval = "confidence" returns fit, lwr, upr columns.

# One row per race/ethnicity group; the factor definitions repeat the levels
# (and the ordering of age_group) used when the model was fitted.
newdata_race <- tibble(
  race_ethnicity = factor(
    c("Non-Hispanic White", "Non-Hispanic Black", "Hispanic"),
    levels = levels(analytic_df$race_ethnicity)
  ),
  age_group = factor("50-64",
                     levels  = c("Under 50", "50-64", "65+"),
                     ordered = TRUE),
  male = factor("Female", levels = c("Female", "Male"))
)

# Predicted mean rate and 95% confidence limits, joined back to the grid.
pred_race <- predict(model2_adj,
                     newdata  = newdata_race,
                     interval = "confidence") |>
  as_tibble() |>
  bind_cols(newdata_race)

ggplot(pred_race, aes(x = race_ethnicity, y = fit, color = race_ethnicity)) +
  geom_point(size = 4.5) +
  geom_errorbar(aes(ymin = lwr, ymax = upr), width = 0.14, linewidth = 1.1) +
  scale_color_manual(
    values = c("Non-Hispanic White" = "#2166ac",
               "Non-Hispanic Black" = "#b2182b",
               "Hispanic"           = "#1a9850"),
    name = "Race/Ethnicity"
  ) +
  labs(
    title    = "Figure 6. Adjusted Predicted Pancreatic Cancer Incidence by Race/Ethnicity",
    subtitle = "Model 2: age group held at 50-64, sex held at Female",
    x        = "Race/Ethnicity",
    y        = "Predicted Age-Adjusted Incidence Rate (per 100,000 persons)",
    # Trailing space keeps the two caption sentences from running together.
    caption  = paste0(
      "Points = predicted values from predict(); error bars = 95% confidence intervals. ",
      "Covariates held at: age group = 50-64, sex = Female."
    )
  ) +
  theme_bw(base_size = 12) +
  theme(
    legend.position = "none",
    plot.title      = element_text(face = "bold", size = 11),
    plot.subtitle   = element_text(size = 9),
    plot.caption    = element_text(size = 8, color = "gray40"),
    axis.text.x     = element_text(size = 10)
  )
```


**Interpretation:** Holding age group at 50--64 and sex at Female, Non-Hispanic
Black strata have a higher model-predicted incidence rate than Non-Hispanic
White strata, and Hispanic strata have the lowest adjusted predicted rate of
the three groups. However, the confidence intervals overlap, consistent with
the non-significant race/ethnicity coefficients in Model 2 (Non-Hispanic Black:
5.57, 95% CI -1.30, 12.44; Hispanic: -3.90, 95% CI -10.77, 2.97). The figure
therefore shows the direction of the hypothesized disparity but does not
provide statistical evidence that it persists after covariate adjustment.
This figure directly addresses the proposal's primary research question. The
relatively wide confidence intervals reflect the small analytical sample
(N = 18 strata) and will be discussed as a key limitation in the Final Report.

---

## Figure 7: Coefficient Forest Plot — Adjusted Model

```{r fig7-forest, fig.height = 4.5,
    fig.cap = "Figure 7. Coefficient forest plot for all predictors in the adjusted model (Model 2). Each point is the estimated rate difference (cases per 100,000) relative to the reference category; error bars are 95% CIs."}
# broom::tidy() returns one row per coefficient with its confidence limits;
# the intercept is dropped because the plot compares predictor effects only.
model2_tidy <- tidy(model2_adj, conf.int = TRUE) |>
  filter(term != "(Intercept)") |>
  mutate(
    # Human-readable label per coefficient, looked up by term name; a term
    # without an entry keeps its raw name.
    # Note: ordered factors use polynomial contrasts (.L = linear, .Q = quadratic)
    term_label = coalesce(
      unname(
        c(
          "race_ethnicityNon-Hispanic Black" = "NH Black vs. NH White",
          "race_ethnicityHispanic"           = "Hispanic vs. NH White",
          "age_group.L"                      = "Age Group (linear trend)",
          "age_group.Q"                      = "Age Group (quadratic trend)",
          "maleMale"                         = "Sex: Male vs. Female"
        )[term]
      ),
      term
    ),
    # Predictor family, used for colour coding; first matching rule wins.
    predictor_type = case_when(
      grepl("race", term) ~ "Race/Ethnicity",
      grepl("age", term)  ~ "Age Group",
      grepl("male", term) ~ "Sex",
      TRUE                ~ "Other"
    )
  )

# Forest plot: one point per coefficient with a horizontal 95% CI bar,
# rows sorted by estimate and coloured by predictor family.
ggplot(
  data    = model2_tidy,
  mapping = aes(
    x     = estimate,
    y     = reorder(term_label, estimate),
    color = predictor_type
  )
) +
  geom_point(size = 3.5) +
  geom_errorbarh(
    mapping   = aes(xmin = conf.low, xmax = conf.high),
    linewidth = 0.9,
    height    = 0.2
  ) +
  # Null line: a coefficient of zero means no difference.
  geom_vline(
    xintercept = 0,
    color      = "gray50",
    linetype   = "dashed",
    linewidth  = 0.7
  ) +
  scale_color_manual(
    name   = "Predictor",
    values = c(
      "Race/Ethnicity" = "#b2182b",
      "Age Group"      = "#2166ac",
      "Sex"            = "#1a9850"
    )
  ) +
  labs(
    x        = "Estimated Coefficient (cases per 100,000 persons)",
    y        = NULL,
    title    = "Figure 7. Coefficient Plot — Adjusted Model (Model 2)",
    subtitle = "Estimated rate difference (per 100,000) vs. reference category for each predictor",
    caption  = paste0(
      "References: Non-Hispanic White (race), Under 50 (age), Female (sex). ",
      "Error bars = 95% CIs. Dashed line = null (no effect). ",
      "Age group uses ordered polynomial contrasts (.L = linear, .Q = quadratic)."
    )
  ) +
  theme_bw(base_size = 12) +
  theme(
    axis.text.y     = element_text(size = 10),
    legend.position = "right",
    plot.caption    = element_text(size = 8, color = "gray40"),
    plot.subtitle   = element_text(size = 9),
    plot.title      = element_text(face = "bold", size = 11)
  )

Interpretation: The forest plot presents all predictor coefficients simultaneously, allowing direct comparison of effect magnitude and precision across race/ethnicity, age, and sex. The age group linear trend coefficient is the largest in magnitude, confirming that advancing age is the dominant predictor of pancreatic cancer incidence in this dataset. The Non-Hispanic Black coefficient is positive, but its 95% CI (-1.30, 12.44) crosses zero, so the estimate is in the direction of the proposal hypothesis without reaching statistical significance after adjustment for age and sex. The Hispanic coefficient is negative, indicating lower incidence than Non-Hispanic White after adjustment, and its 95% CI (-10.77, 2.97) also crosses zero. The Male coefficient is positive, its 95% CI (0.85, 12.06) excludes zero, and it is consistent with the well-documented sex differential in pancreatic cancer. The relatively wide confidence intervals for the race/ethnicity terms reflect the small aggregate sample size and will be discussed as a key limitation in the Final Report.

End of Progress Check-in 2.

EPI 553 – Progress Check-in 2: Regression Results and Visualizations

Demographic Disparities in Pancreatic Cancer Incidence in the United States

Karo Ekor

2026-04-28