Preliminary analysis into the epidemiology of mastiits in coastal NSW dairy herds

This preliminary analysis explores the burden of mastitis in NSW dairy herds, drawing on farm records and milk quality data. The objectives are to quantify the frequency and distribution of clinical mastitis and somatic cell count outcomes, identify seasonal and herd-level patterns, and highlight where mastitis incidence exceeds accepted benchmarks. By providing an evidence-based overview of the problem, the analysis aims to inform future research directions, support farmer decision-making, and establish a foundation for evaluating targeted strategies to improve milk quality, cow longevity, and overall herd health in NSW dairy systems.

Library and data import

Show the code
library(tidyverse); library(janitor); library(table1); library(survival); library(survminer); library(scales)

bmscc <- read_csv(file.path("~/Dropbox/R backup/2026/DairyUP2 Mastitis epidemiology","data in","national bmscc.csv"), 
    col_types = cols(lt_100 = col_number(), 
        `101_250` = col_number(), lt_150 = col_number(), 
        `151_250` = col_number(), lt_200 = col_number(), 
        `201_250` = col_number(), `250_400` = col_number(), 
        gt_400 = col_number())) %>% 
  clean_names %>%
    mutate(across(
    where(~ is.numeric(.x) &&
           !all(is.na(.x)) &&
           suppressWarnings(max(.x, na.rm = TRUE) <= 100) &&
           any(.x > 1, na.rm = TRUE)),
    ~ .x / 100
  )) %>%
  mutate(
    col = 1:15
  )

cols_keep <- c(1,2,3,4,5,7,8,12,14)

bmscc2 <- bmscc %>%
  filter(col %in% cols_keep) %>%
  mutate(
    gt_200 = 1 - lt_200,
    nsw_state = case_when(
      state == "NSW" ~ "NSW",
      T ~ "Other states"
         ),
         region_name = case_when(
           region == "All" & state == "NSW" ~ "NSW - All regions",
           region == "All" ~ state,
           T ~ paste0(state," - ",region)
         ))  # inversion of <200k


exclude_herd_list <- read_csv(file.path("~/Dropbox/R backup/2026/DairyUP2 Mastitis epidemiology","data in/exclude herd list.csv")) %>% clean_names %>%
  pull(herd_name)

archive <- readRDS("~/Dropbox/R backup/Herd work/Herd Tools/multi-herd/multi-herd_processed.rds")

# Post extraction processing
event_names <- unique(unlist(map(archive, ~ names(.x$raw_events))))


event_names <- archive %>%
  map(~ names(.x$raw_events)) %>%
  flatten_chr() %>%
  unique()

events_bound <- event_names %>%
  set_names() %>%
  map(function(ev) {
    archive %>%
      imap_dfr(function(herd_list, herd_nm) {
        df <- herd_list$raw_events[[ev]]
        if (is.null(df)) return(tibble())
        
        df <- df %>%
          mutate(id = as.character(if ("id" %in% names(.)) id else NA_character_))
        
        df %>% mutate(herd = herd_nm, .before = 1)
      })
  })


demo_multi_herd_df <- archive %>%
  map("demo") %>%
  bind_rows()

ai_multi_herd_df <- archive %>%
  map("ai") %>%
  bind_rows()

pregnancies_multi_herd_df <- archive %>%
  map("pregnancies") %>%
  bind_rows()

bulk_milk_multi_herd_df <- archive %>%
  map("bulk_milk") %>%
  keep(~ !is.null(.x) && nrow(.x) > 0) %>%  # drop NULLs and empties
  bind_rows()



mast_herds <- demo_multi_herd_df %>% 
  mutate(exclude = case_when(
    herd_name %in% exclude_herd_list ~ 1,
    T ~ 0
  )) %>%
  filter(exclude == 0) %>%
  select(-exclude) %>%
  tabyl(herd_name,mast) %>%
  as_tibble %>%
  filter(`1` > 30) %>%
  pull(herd_name)

#mast_herds

enrollment_start = as.Date("2010-01-01")
enrollment_end = as.Date("2022-01-01")

cohort <- events_bound$calvings %>% 
  filter(lact == 1,
         calving_date %>% between(enrollment_start,enrollment_end),
         herd_name %in% mast_herds) %>%
  select(herd_name,nationalid,calving_date) %>%
  distinct

#Insert mast 1 date

mast_1_list <- events_bound$mast %>%
  select(herd_name,nationalid,mast_date) %>%
  group_by(herd_name,nationalid) %>%
  summarise(
    mast_1_date = min(mast_date)
  )

mast_recording_window <-  events_bound$mast %>%
  select(herd_name,nationalid,mast_date) %>%
  group_by(herd_name) %>%
  summarise(
    mast_first_record_herd = min(mast_date),
    mast_last_record_herd = max(mast_date)
  )

calving_3_list <- events_bound$calvings %>% 
  filter(lact == 3,
         herd_name %in% mast_herds) %>%
  mutate(calving_3_date = calving_date) %>%
  select(herd_name,nationalid,calving_3_date) %>%
  distinct

calving_latest <- events_bound$calvings %>% 
  filter(herd_name %in% mast_herds) %>%
  group_by(herd_name,nationalid) %>%
  summarise(
    calving_latest_date = max(calving_date)
  ) %>%
  ungroup

removal_list <- events_bound$removals %>%
  select(herd_name,nationalid,removal_date) %>%
  distinct


data <- cohort %>%
  left_join(mast_1_list, by = c("herd_name","nationalid")) %>%
  left_join(calving_3_list, by = c("herd_name","nationalid")) %>%
  left_join(removal_list, by = c("herd_name","nationalid")) %>%
  left_join(calving_latest, by = c("herd_name","nationalid")) %>%
  left_join(mast_recording_window, by = "herd_name") %>%
  mutate(
    lost_cow = case_when(
      is.na(removal_date) & calving_date == calving_latest_date ~ 1,
      T ~ 0),
    end_point = pmin(calving_3_date,removal_date,na.rm=T),
    follow_up_within_windows = case_when(
      calving_date %>% between(mast_first_record_herd,mast_last_record_herd) &
        end_point %>% between(mast_first_record_herd,mast_last_record_herd) ~ 1,
      T ~ 0)
    )

#data %>% tabyl(lost_cow)
#data %>% tabyl(follow_up_within_windows)

data <- data %>%
  filter(lost_cow == 0,
         follow_up_within_windows == 1) %>% 
  select(-lost_cow, -follow_up_within_windows)

Bulk milk data

Show the code
ggplot(bmscc2, aes(x = region_name, y = gt_200, fill = nsw_state)) +
  geom_col() +
  facet_grid(cols = vars(nsw_state), scales = "free_x", space = "free_x") +
  scale_y_continuous(labels = percent_format(accuracy = 1), limits = c(0, 1)) +
  scale_fill_manual(
    values = c("NSW" = "red", "QLD" = "grey60", "QLD/NSW" = "grey60",
               "SA" = "grey60", "TAS" = "grey60", "VIC" = "grey60",
               "VIC/NSW" = "grey60", "WA" = "grey60")
  ) +
  labs(
    title = "",
    x = NULL, y = "Herds (%) with average bulk milk \n SCC more than 200k"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"   # removes the legend
  )

nsw_prevalence <- bmscc2 %>% filter(region_name == "NSW - All regions") %>% pull(gt_200)

tibble(
  state = bmscc2$state,
  prevalence_gt_200 = bmscc2$gt_200,
  prevalence_ratio = nsw_prevalence / prevalence_gt_200,
  prevalence_difference = nsw_prevalence - prevalence_gt_200
)
Show the code
bulk_milk_plot_herds_combined <- ggplot(bulk_milk_multi_herd_df, aes(x = date, y = scc)) +
   geom_rect(
    inherit.aes = FALSE,
    aes(ymin = 0, ymax = 200,
        xmin = min(bulk_milk_multi_herd_df$date),
        xmax = max(bulk_milk_multi_herd_df$date)),
    fill = "green", alpha = 0.005
  ) +
  geom_point(alpha = 0.5, size = 0.1) +                           
  stat_smooth(se = FALSE, span = 0.4) +               
  coord_cartesian(
    ylim = c(min(bulk_milk_multi_herd_df$scc, na.rm = TRUE),
             max(bulk_milk_multi_herd_df$scc, na.rm = TRUE)+10)
  ) +
  scale_y_continuous(breaks = seq(0, 1000, 50)) +
  scale_x_date(date_breaks = "6 month", date_labels = "%b-%y") +
  ggtitle("Bulk milk somatic cell count") +
  labs(y = "x 1000 cells/ml", x = "Date", color = "") +
  theme(
    text = element_text(family = "Times New Roman"),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 11, face = "bold"),
    axis.text.x = element_text(size = 10),
    axis.title.x = element_blank(),
    panel.background = element_rect(fill = "white", colour = "white", size = 0.5),
    panel.grid.major = element_line(size = 0.2, linetype = 'solid', colour = "grey"),
    panel.grid.minor = element_line(size = 0.2, linetype = 'solid', colour = "grey"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  guides(color = "none")

bulk_milk_plot_herds_combined

Season pattern of bulk milk somatic cell counts for herd health clients in NSW combined. Each dot represents a milk pick-up for a herd. Somatic cell counts increase over summer, with some herds seeing values more than 400,000 cells/ml.

Show the code
ggplot(bulk_milk_multi_herd_df %>% filter(herd_name != "Speldon"), aes(x = date, y = scc, color = herd_name)) +
  geom_point(alpha = 0.5, size = 0.1) +                           
  stat_smooth(se = FALSE, span = 0.4) +               
  coord_cartesian(
    ylim = c(min(bulk_milk_multi_herd_df$scc, na.rm = TRUE),
             400)
  ) +
  scale_y_continuous(breaks = seq(0, 1000, 50)) +
  scale_x_date(date_breaks = "6 month", date_labels = "%b-%y") +
  ggtitle("Bulk milk somatic cell count") +
  labs(y = "x 1000 cells/ml", x = "Date", color = "") +
  theme(
    text = element_text(family = "Times New Roman"),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 11, face = "bold"),
    axis.text.x = element_text(size = 10),
    axis.title.x = element_blank(),
    panel.background = element_rect(fill = "white", colour = "white", size = 0.5),
    panel.grid.major = element_line(size = 0.2, linetype = 'solid', colour = "grey"),
    panel.grid.minor = element_line(size = 0.2, linetype = 'solid', colour = "grey"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  guides(color = "none")

How common is clinical mastitis in NSW herds?

Show the code
demo_mast_herd <- demo_multi_herd_df %>%
  filter(herd_name %in% mast_herds) %>%
  select(nationalid,calving_date,mast,mast_2,mast_3,mast_tar,mast_2_tar,mast_3_tar,first_mast,mast_2_date,mast_3_date,dry_off_date,removal_date,end_date,quarter_start,herd_name) %>%
  left_join(mast_recording_window, by = "herd_name") %>%
  mutate(
    year = year(calving_date),
    year_cat = case_when(
      year <= 2015 ~ "2015 or earlier",
      year %>% between(2016,2020) ~ "2016 to 2020",
      year %>% between(2021,2025) ~ "2021 to 2025"
    ),
    follow_up_within_windows = case_when(
      calving_date %>% between(mast_first_record_herd,mast_last_record_herd) &
        end_date %>% between(mast_first_record_herd,mast_last_record_herd) ~ 1,
      T ~ 0)
    )

#demo_mast_herd %>% tabyl(follow_up_within_windows)

demo_mast_herd <- demo_mast_herd %>%
  filter(follow_up_within_windows == 1)

demo_mast <- rbind(
  demo_mast_herd %>% 
    select(nationalid,mast,mast_tar,quarter_start) %>%
    mutate(case = "Cows with at least 1 clinical case"),

  demo_mast_herd %>%
    mutate(mast = mast_2,
           mast_tar = mast_2_tar,
           case = "Cows with at least 2 clinical cases") %>%
    select(nationalid,mast,mast_tar,quarter_start,case),
  
  demo_mast_herd %>% 
    mutate(mast = mast_3,
           mast_tar = mast_3_tar,
           case = "Cows with at least 3 clinical cases") %>%
    select(nationalid,mast,mast_tar,quarter_start,case)
)


KM <- survfit(Surv(mast_tar, mast) ~ case, data = demo_mast)

KM_data <- surv_summary(KM,demo_mast_herd) %>% filter(n.risk > 2)

max_y <- pmax(0.5,KM_data %>% summarise(max_y = 1 - min(surv)))

ggsurvplot_df(KM_data,
           ylim = c(0, max_y),
           title = "",
           ylab = "Percent of cows",
           legend = c(0.25, 0.75),legend.title="",
           legend.labs = c("Cows with at least 1 clinical case",
                           "Cows with at least 2 clinical cases",
                           "Cows with at least 3 clinical cases"),
           #linetype = c(3,2,1),
           linetype = 1,
           palette = c("blue","orange","red"),
           xlim = c(0, 300),pval = F, censor=F, conf.int = F,risk.table = F, 
           surv.plot.height = 5, risk.table.pos="in", fun = "event",
           surv.scale="percent",  break.x.by = 20,break.y.by = 0.05, 
           ggtheme = theme_bw(base_family = "Times New Roman"),
           xlab="Days in milk",risk.table.fontsize=3)

Proportion of cows with at least 1, 2, or 3 cases of mastitis by days in milk.

Show the code
KM <- survfit(Surv(mast_tar, mast) ~ year_cat, data = demo_mast_herd)

KM_data <- surv_summary(KM,demo_mast) %>% filter(n.risk > 2)

max_y <- pmax(0.5,KM_data %>% summarise(max_y = 1 - min(surv)))

ggsurvplot_df(KM_data,
           ylim = c(0, max_y),
           title = paste0("Lactations (n = ",
                             max(KM_data$n.risk),
                             ")"),
           ylab = "Percent of cows",
           legend = c(0.25, 0.75),legend.title="",
           linetype = 1,
           xlim = c(0, 300),pval = F, censor=F, conf.int = F,risk.table = F, 
           surv.plot.height = 5, risk.table.pos="in", fun = "event",
           surv.scale="percent",  break.x.by = 20,break.y.by = 0.05, 
           ggtheme = theme_bw(base_family = "Times New Roman"),
           xlab="Days in milk",risk.table.fontsize=3) + 
  geom_vline(xintercept = 14, linetype="dashed", color = "blue", linewidth=0.5)

Proportion of cows with at least 1 case of mastitis by days in milk, stratified by year of calving group.

Show the code
KM <- survfit(Surv(mast_tar, mast) ~ herd_name, data = demo_mast_herd)

KM_data <- surv_summary(KM,demo_mast) %>% filter(n.risk > 2)

max_y <- pmax(0.5,KM_data %>% summarise(max_y = 1 - min(surv)))

ggsurvplot_df(KM_data,
           ylim = c(0, max_y),
           title = paste0("Mastitis rates from 1-300 DIM. Each line is a herd"),
           ylab = "Percent of cows",
           legend = "none",
           linetype = 1,
           xlim = c(0, 300),pval = F, censor=F, conf.int = F,risk.table = F, 
           surv.plot.height = 5, risk.table.pos="in", fun = "event",
           surv.scale="percent",  break.x.by = 20,break.y.by = 0.05, 
           ggtheme = theme_bw(base_family = "Times New Roman"),
           xlab="Days in milk",risk.table.fontsize=3) + 
  geom_vline(xintercept = 14, linetype="dashed", color = "blue", linewidth=0.5)

Show the code
survival <- KM_data %>% 
  group_by(strata) %>%
  summarise(
    min_survival = min(surv)
  )

herds_to_exclude <- survival %>%
  filter(min_survival > 0.9) %>%
  pull(strata)

Proportion of cows with at least 1 case of mastitis by days in milk, stratified by herd.

Show the code
mastitis_rate_daily <- function(date_select){
  
  lactating_cows <- demo_mast_herd %>% 
    filter(calving_date < date_select) %>% 
    filter(end_date > date_select) %>%
    mutate(dim_on_day = as.numeric(date_select - calving_date))

  at_risk_mastitis <- lactating_cows %>%
      left_join(events_bound[["mast"]] %>% 
                  select(nationalid,herd_name,mast_date) %>%
                  filter(mast_date == date_select), 
                by = c("herd_name","nationalid")) %>%
    mutate(
      mast_on_day = case_when(
        mast_date == date_select ~ 1,
        T ~ 0
      )
    )
    
  # Summary table for 21 day period
  table_out <- at_risk_mastitis %>%
    summarise(
      date = date_select,
      
      at_risk_n = n(),
      at_risk_n_fresh = sum(dim_on_day < 31),
      at_risk_n_gt_30_dim = sum(dim_on_day > 30),
      
      mast_n = sum(mast_on_day==1),
      mast_n_fresh = sum(mast_on_day==1 & dim_on_day < 31),
      mast_n_gt_30_dim = sum(mast_on_day==1 & dim_on_day > 30),
      
      mast_rate = mast_n / at_risk_n,
      mast_rate_fresh = mast_n_fresh / at_risk_n_fresh,
      mast_rate_gt_30_dim = mast_n_gt_30_dim / at_risk_n_gt_30_dim
    )
  
  table_out
  
}


#mastitis_rate_daily(date_select = as.Date("2024-07-22"))

# Create sequence of dates

mastitis_rate_time_end = as.Date("2024-12-01")
mastitis_rate_time_start = mastitis_rate_time_end - years(3)

all_dates <- seq(
  from = mastitis_rate_time_start,
  to   = mastitis_rate_time_end,
  by   = "day"
)


# Run your function for each date and combine into a single data frame
table_out <- map_dfr(.progress = T,
                     all_dates, ~ mastitis_rate_daily(date_select = .x))

monthly_avgs_mast <- table_out %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(
    days_in_month   = n(),
    
    monthly_cow_days        = sum(at_risk_n) %>% round(0),
    monthly_cow_days_fresh  = sum(at_risk_n_fresh) %>% round(0),
    monthly_cow_days_gt_30d = sum(at_risk_n_gt_30_dim) %>% round(0),
    
    at_risk_cows           = mean(at_risk_n, na.rm = TRUE) %>% round(0),
    at_risk_cows_fresh     = mean(at_risk_n_fresh, na.rm = TRUE) %>% round(0),
    at_risk_cows_gt_30d   = mean(at_risk_n_gt_30_dim, na.rm = TRUE) %>% round(0),
    
    mast_n_sum              = sum(mast_n),
    mast_n_fresh_sum        = sum(mast_n_fresh),
    mast_n_gt_30d_sum      = sum(mast_n_gt_30_dim),
    
    mast_rate               = mean(mast_rate,  na.rm = TRUE) * 100 %>% round(1),
    mast_rate_fresh         = mean(mast_rate_fresh,  na.rm = TRUE) * 100 %>% round(1),
    mast_rate_gt_30d       = mean(mast_rate_gt_30_dim,  na.rm = TRUE) * 100 %>% round(1),
    
    monthly_mast_rate_per_100_cow_days = round(mast_n_sum/monthly_cow_days * 100,1),
    monthly_mast_rate_per_100_cow_months = round(mast_n_sum/(monthly_cow_days/30.4) * 100,1),
    
    monthly_mast_rate_fresh_per_100_cow_days = round(mast_n_fresh_sum/monthly_cow_days_fresh * 100,1),
    monthly_mast_rate_fresh_per_100_cow_months = round(mast_n_fresh_sum/(monthly_cow_days_fresh/30.4) * 100,1),
    
    monthly_mast_rate_gt_30d_per_100_cow_days = round(mast_n_gt_30d_sum/monthly_cow_days_gt_30d * 100,1),
    monthly_mast_rate_gt_30d_per_100_cow_months  = round(mast_n_gt_30d_sum/(monthly_cow_days_gt_30d/30.4) * 100,1),
    
    .groups = "drop"
  ) %>%
  
  filter(days_in_month >= 10) %>%
  select(-days_in_month) %>%
  mutate(month = format(month, "%Y-%m")
  )



long <- monthly_avgs_mast %>% 
  pivot_longer(cols = c(at_risk_cows,at_risk_cows_fresh,at_risk_cows_gt_30d,
                        monthly_mast_rate_per_100_cow_months,
                        monthly_mast_rate_fresh_per_100_cow_months,
                        monthly_mast_rate_gt_30d_per_100_cow_months
                        ), 
               names_to = "type", values_to = "count")

stack <- long %>% filter(type=="monthly_mast_rate_per_100_cow_months" |
                           type=="monthly_mast_rate_fresh_per_100_cow_months"|
                           type=="monthly_mast_rate_gt_30d_per_100_cow_months")

stack <- merge(stack,monthly_avgs_mast %>% select(month,
                                                  at_risk_cows,at_risk_cows_fresh,at_risk_cows_gt_30d,
                                                  monthly_mast_rate_per_100_cow_months,
                                                  monthly_mast_rate_fresh_per_100_cow_months,
                                                  monthly_mast_rate_gt_30d_per_100_cow_months,
                                                  ),by="month") %>%
  select(month,type,count)

# max_y = ifelse(
#   max(monthly_avgs_mast$monthly_mast_rate_per_100_cow_months) > 5,15,2)


# Main plot -----

# --- Build labeled data (same mapping you used) ---
rates_long <- stack %>%
  mutate(
    type_label = dplyr::case_when(
      type == "monthly_mast_rate_fresh_per_100_cow_months"   ~ "First month",
      type == "monthly_mast_rate_gt_30d_per_100_cow_months"  ~ "Subsequent months",
      type == "monthly_mast_rate_per_100_cow_months"         ~ "Overall",
      TRUE ~ type
    ),
    type_label = factor(type_label, levels = c("First month", "Subsequent months", "Overall"))
  )

# Order month categories chronologically (treat months as discrete for nice fat bars)
ordered_levels <- rates_long %>%
  distinct(month) %>%
  arrange(as.Date(paste0(month, "-01"))) %>%
  pull(month)

rates_long <- rates_long %>%
  mutate(month = factor(month, levels = ordered_levels))

# Split into overall for bars, and subgroup values for labels
overall_df <- rates_long %>% filter(type_label == "Overall")
subgroups  <- rates_long %>%
  filter(type_label %in% c("First month", "Subsequent months")) %>%
  select(month, type_label, count) %>%
  tidyr::pivot_wider(names_from = type_label, values_from = count)

# Build a label frame: one row per month with a two-line label
# Fresh (First month) ABOVE Subsequent months
top_labels <- overall_df %>%
  left_join(subgroups, by = "month") %>%
  mutate(
    # tweak rounding as you prefer (1 decimal here)
    label_text = paste0(
      sprintf("%.1f", `First month`), "\n"#,
      #sprintf("%.1f", `Subsequent months`)
    )
  )

# Set y-limits with a bit of headroom for labels
max_overall <- max(overall_df$count, na.rm = TRUE)
label_pad   <- max(0.05 * max_overall, 1)      # vertical gap above bar
label_top   <- max_overall + 3 * label_pad     # enough space for two lines

# You can also force a fixed top if you prefer:
max_y <- ceiling((max_overall + 3 * label_pad) / 5) * 5

# Discrete month axis labels: "Jan-25", etc.
month_labeler <- function(m) format(as.Date(paste0(m, "-01")), "%b-%y")

# --- Plot: Overall as bars; subgroup rates as stacked numbers above each bar ---
montly_mastitis_rate_plot <- ggplot() +
  geom_col(
    data = overall_df,
    aes(x = month, y = count, fill = type_label),  # fill just to keep legend style; can drop if you want no legend
    width = 0.8
  ) +
  coord_cartesian(ylim = c(0, label_top)) +
  scale_y_continuous(breaks = seq(0, ceiling(label_top), by = 1)) +
  scale_x_discrete(breaks = function(x) x[seq(1, length(x), by = 6)]) +
  ggtitle("Monthly Mastitis Rate") +
  labs(
    y = "Cases of mastitis per 100 milking cows per month",
    x = "Date",
    fill = ""
  ) +
  guides(fill = "none") +  # hide fill legend (only 'Overall' would appear)
  theme(
    text = element_text(family = "Times New Roman"),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    plot.subtitle = element_text(color = "darkblue", size = 8),
    title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 11, face = "bold"),
    axis.text.x = element_text(size = 10),
    axis.title.x = element_blank(),
    panel.background = element_rect(fill = "white", colour = "white", size = 0.5, linetype = "solid"),
    panel.grid.major = element_line(size = 0.2, linetype = "solid", colour = "grey"),
    panel.grid.minor = element_line(size = 0.2, linetype = "solid", colour = "grey"),
    legend.position = "bottom",
    legend.text = element_text(colour = "black", size = 8, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

#montly_mastitis_rate_plot




# Alernative plot -----


max_y <- ceiling(max(rates_long$count, na.rm = TRUE) / 5) * 5

rates_long <- stack %>%
  mutate(
    month = month,  # keep as string for now
    type_label = dplyr::case_when(
      type == "monthly_mast_rate_fresh_per_100_cow_months"   ~ "First month",
      type == "monthly_mast_rate_gt_30d_per_100_cow_months"  ~ "Subsequent months",
      type == "monthly_mast_rate_per_100_cow_months"         ~ "Overall",
      TRUE ~ type
    )
  ) %>%
  mutate(
    type_label = factor(type_label, levels = c("First month", "Subsequent months", "Overall"))
  )

# 2) Order month categories chronologically
ordered_levels <- rates_long %>%
  distinct(month) %>%
  arrange(as.Date(paste0(month, "-01"))) %>%
  pull(month)

# 3) Split data and convert month to an ordered factor
bars_df <- rates_long %>%
  filter(type_label %in% c("First month", "Subsequent months")) %>%
  mutate(month = factor(month, levels = ordered_levels))

overall_df <- rates_long %>%
  filter(type_label == "Overall") %>%
  mutate(month = factor(month, levels = ordered_levels))

# 4) Nice month labels for discrete axis
month_labeler <- function(m) format(as.Date(paste0(m, "-01")), "%b-%y")

# 5) Y limit
max_y <- ceiling(max(rates_long$count, na.rm = TRUE) / 5) * 5

# 6) Plot: grouped bars (discrete x), overall as line/points
daily_mast_rate <- ggplot() +
  geom_bar(
    data = bars_df,
    aes(x = month, y = count, fill = type_label),
    stat = "identity",
    position = position_dodge(width = 0.9),
    width = 0.8
  ) +
  geom_line(
    data = overall_df,
    aes(x = month, y = count, group = 1),
    linewidth = 0.4
  ) +
  geom_point(
    data = overall_df,
    aes(x = month, y = count),
    size = 1.6
  ) +
  coord_cartesian(ylim = c(0, max_y)) +
  scale_y_continuous(breaks = seq(0, max_y, by = 1)) +
  scale_x_discrete(breaks = function(x) x[seq(1, length(x), by = 6)]) +
  ggtitle("Monthly Mastitis Rates by At-risk Group") +
  labs(
    y = "Cases of mastitis per 100 milking cows",
    x = "Date",
    subtitle = "Bars: at-risk groups (First month, Subsequent months). Line: overall rate.",
    fill = ""
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(
    text = element_text(family = "Times New Roman"),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    plot.subtitle = element_text(color = "darkblue", size = 8),
    title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 11, face = "bold"),
    axis.text.x = element_text(size = 10),
    axis.title.x = element_blank(),
    panel.background = element_rect(fill = "white", colour = "white", size = 0.5, linetype = "solid"),
    panel.grid.major = element_line(size = 0.2, linetype = "solid", colour = "grey"),
    panel.grid.minor = element_line(size = 0.2, linetype = "solid", colour = "grey"),
    legend.position = "bottom",
    legend.text = element_text(colour = "black", size = 8, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

daily_mast_rate

Monthly mastitis rate (cases per 100 cow-months). Data from 5000-6000 cows lactating each day over the study period were used.

Impact of mastitis on longevity

Retrospective cohort study

Show the code
df <- data # replace with your object, e.g., df <- demo

# (Optional) choose an administrative study end if needed
# If a cow has neither removal_date nor calving_3_date, we’ll censor at the latest date seen in the data.
study_end <- max(df %>%
                   select(calving_latest_date, mast_1_date, removal_date, calving_3_date) %>%
                   unlist(use.names = FALSE),
                 na.rm = TRUE) %>% as.Date


tv_df <-
  df %>%
  mutate(
    # define end of follow-up: earliest of removal or 3rd calving; else admin censor
    end_date = coalesce(pmin(removal_date, calving_3_date, na.rm = TRUE), study_end),
    # origin of time: latest calving (your follow-up start)
    start_date = calving_date,
    # ensure start <= end (drop any with missing or inverted dates)
    .keep = "all"
  ) %>%
  filter(!is.na(start_date), !is.na(end_date), end_date >= start_date) %>%
  # build intervals: 1) unexposed from start to min(mast_1_date, end); 2) exposed from mast_1_date to end (if mast_1 happens before end)
  mutate(
    # clamp mastitis date to the follow-up window
    mast_in_window = if_else(!is.na(mast_1_date) & mast_1_date <= end_date & mast_1_date >= start_date,
                             mast_1_date, as_date(NA))
  ) %>%
  # make two potential rows per cow, then prune impossible ones
  transmute(
    herd_name, nationalid, start_date, end_date, removal_date, calving_3_date, mast_in_window,
    int1_start = start_date,
    int1_stop  = coalesce(mast_in_window, end_date),
    int1_mast  = 0L,

    int2_start = mast_in_window,
    int2_stop  = if_else(!is.na(mast_in_window), end_date, as_date(NA)),
    int2_mast  = 1L
  ) %>%
  pivot_longer(
    cols = starts_with("int"),
    names_to = c("int", ".value"),
    names_pattern = "int(\\d+)_(start|stop|mast)"
  ) %>%
  filter(!is.na(start), !is.na(stop), stop > start) %>%  # keep valid intervals
  mutate(
    event = as.integer(!is.na(removal_date) & stop == removal_date),
    # single cause code: 1 = Departed, 2 = Survived to 3rd calving, 0 = censored
    event_code = case_when(
      !is.na(removal_date)   & stop == removal_date   ~ 1L,
      !is.na(calving_3_date) & stop == calving_3_date ~ 2L,
      TRUE                                        ~ 0L
    ),
    event_ms = factor(event_code,
                      levels = c(0, 1, 2),
                      labels = c("Censored", "Departed herd", "Survived to 3rd lactation")),
    # cause-specific indicators for Cox models
    event_depart  = as.integer(event_code == 1L),
    event_third   = as.integer(event_code == 2L),
    # analysis times in days since start of follow-up (counting-process format)
    tstart = as.numeric(start - start_date),
    tstop  = as.numeric(stop - start_date)
  ) %>%
  select(herd_name, nationalid, start_date, start, stop, tstart, tstop, mast = mast,event_code, event,event_ms,event_depart,
         event_third)



# km <- survfit(Surv(tstart, tstop, event) ~ mast, data = tv_df)
# #summary(km)
# 

Tabular method to calculate incidence rate ratio

Show the code
irr_table <- tv_df %>%
  group_by(mast) %>%
  summarise(
    total_days_at_risk = sum(tstop - tstart, na.rm = TRUE),
    n_events = sum(event == 1, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    incidence_rate = n_events / total_days_at_risk
  )

irr_table
mast total_days_at_risk n_events incidence_rate
0 7482370 3689 0.0004930
1 1824659 1576 0.0008637
Show the code
irr = irr_table$incidence_rate[irr_table$mast==1]/irr_table$incidence_rate[irr_table$mast==0]

The incidence rate ratio for culling for cows with mastitis vs non-mastitis cows is 1.75.

Cox regression model

Show the code
fit <- coxph(Surv(tstart, tstop, event) ~ mast + strata(herd_name), data = tv_df)
summary(fit)
Call:
coxph(formula = Surv(tstart, tstop, event) ~ mast + strata(herd_name), 
    data = tv_df)

  n= 17429, number of events= 5265 

        coef exp(coef) se(coef)     z Pr(>|z|)    
mast 0.51162   1.66799  0.03282 15.59   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

     exp(coef) exp(-coef) lower .95 upper .95
mast     1.668     0.5995     1.564     1.779

Concordance= 0.543  (se = 0.003 )
Likelihood ratio test= 230  on 1 df,   p=<2e-16
Wald test            = 243  on 1 df,   p=<2e-16
Score (logrank) test = 246.7  on 1 df,   p=<2e-16
Show the code
# ggsurvplot(km,title = "title",
#               ylab = "Percent of cows departed",
#               xlim = c(0,1000),pval = F, censor=F, conf.int = F,risk.table = F, 
#               surv.plot.height = 5, risk.table.pos="in", fun = "event",
#               surv.scale="percent",  break.x.by = 50,break.y.by = 0.10, 
#               ggtheme = theme_bw(base_family = "Times New Roman"),
#               xlab="Days since first calving",risk.table.fontsize=3)

Cows with a history of mastitis had 1.7 times higher hazards of culling (95% CI: 1.6 to 1.8)

Case control approach

Case = Cow made it to lactation 3. Control = randomly selected cow (‘case cohort’ study)

Show the code
tabular_data <- data %>%
  mutate(
    mast_before_lact_3 = case_when(
      is.na(mast_1_date) ~ 0,
      mast_1_date > calving_3_date ~ 0,
      mast_1_date < calving_3_date ~ 1,
      !is.na(mast_1_date) ~ 1
    ),
    
    made_it_to_lact_3 = case_when(
      removal_date > calving_3_date ~ 1,
      is.na(removal_date) & !is.na(calving_3_date) ~ 1,
      !is.na(removal_date) & is.na(calving_3_date) ~ 0,
      T ~ NA
    ),
    
    censor_date = pmin(calving_3_date,removal_date, na.rm=T),
    
    time_at_risk = as.numeric(censor_date - calving_date)
  )

tabular_data <- tabular_data %>% 
  group_by(made_it_to_lact_3) %>%
  summarise(
    mast_count = sum(mast_before_lact_3),
    days_at_risk = sum(time_at_risk),
    mast_rate = (mast_count / days_at_risk) * 1000
  )

tabular_data
made_it_to_lact_3 mast_count days_at_risk mast_rate
0 1701 2518780 0.6753269
1 2775 6787567 0.4088357
NA 0 682 0.0000000

Cross-product ratio

Show the code
# Cross product ratio
(2266 * 2043187) / (1329 * 5641325)
[1] 0.6175355
Show the code
library(cmprsk)

km <- survfit(Surv(tstart, tstop, event_ms) ~ mast, data = tv_df)
#summary(km)

ggcompetingrisks(km,
      palette = "jco", ggtheme = theme_survminer()) + 
      ggtitle("Title") +
  xlim(0, 1500)