R pakkar og data

library(dplyr)
library(ggplot2)
library(wesanderson)
library(scales)
library(tidyr)
library(lubridate)
library(ggbump)
wes6_1 <- c("#d3d3d3", "#F4B5BD", "#C6CDF7", "#972D15", "#81A88D", "#FDD262")
wes11  <- c("#FDD262", "#F4B5BD", "#CDC08C", "#D8B70A", "#02401B", "#81A88D",
            "#972D15", "#85D4E3", "#C6CDF7", "#7294D4", "#D3DDDC")
wes15  <- c("#FDD262", "#F4B5BD", "#CDC08C", "#D8B70A", "#02401B", "#A2A475",
            "#81A88D", "#972D15", "#E6A0C4", "#C6CDF7", "#D8A499", "#7294D4",
            "#446455", "#C7B19C", "#D3DDDC")
wes19  <- c("#191970", "#F4B5BD", "#9C964A", "#CDC08C", "#FAD77B", "#D8B70A",
            "#02401B", "#A2A475", "#81A88D", "#972D15", "#E6A0C4", "#C6CDF7",
            "#D8A499", "#7294D4", "#446455", "#85D4E3", "#C7B19C", "#ffa700",
            "#555555", "#1e90ff")
PN <- read.csv(
  "/media/vegard/Work_volume_10TB1/2025_PK-dashboard_v2/poppunk/microreact/microreact_all_merged_metadata.csv",
  header = TRUE, sep = ","
)

# Fjern "__autocolour" fra kolonnenavn
colnames(PN) <- gsub("__autocolour", "", colnames(PN))

# Serotype: foretrekk Quellung, fall tilbake til WGS
PN$Serotype <- ifelse(PN$Quellung_serotype != "", PN$Quellung_serotype, PN$WGS_serotype)
PCV13 <- c('4','6B','9V','14','18C','19F','23F','1','3','5','6A','7F','19A')
PCV15 <- c('4','6B','9V','14','18C','19F','23F','1','3','5','6A','7F','19A','22F','33F')
PCV20 <- c('1','3','4','5','6A','6B','7F','8','9V','10A','11A','12F','14','15B',
           '18C','19A','19F','22F','23F','33F')
PCV21 <- c('3','6A','7F','8','9N','10A','11A','12F','15A','15C','16F','17F','19A',
           '20A','22F','23A','23B','24F','31','33F','35B')
PPV23 <- c('1','2','3','4','5','6B','7F','8','9N','9V','10A','11A','12F','14','15B',
           '17F','18C','19F','19A','20','22F','23F','33F')

PN$PCV13 <- ifelse(PN$Serotype %in% PCV13, "YES", "NO")
PN$PCV15 <- ifelse(PN$Serotype %in% PCV15, "YES", "NO")
PN$PCV20 <- ifelse(PN$Serotype %in% PCV20, "YES", "NO")
PN$PCV21 <- ifelse(PN$Serotype %in% PCV21, "YES", "NO")
PN$PPV23 <- ifelse(PN$Serotype %in% PPV23, "YES", "NO")
age_order <- c("0-6", "7-15", "16-44", "45-66", "67-79", "80+")
PN$age_group <- factor(PN$age_group, levels = age_order)

PN <- PN %>%
  mutate(
    date = as.Date(date),
    year = factor(year(date), levels = sort(unique(year(date))))
  ) %>%
  filter(!is.na(year))
top10_GPSC     <- names(sort(table(PN$GPSC),                                        decreasing = TRUE))[1:10]
top10_ST       <- names(sort(table(PN$ST[!is.na(PN$ST)       & PN$ST != ""]),       decreasing = TRUE))[1:10]
top10_Serotype <- names(sort(table(PN$Serotype[!is.na(PN$Serotype) & PN$Serotype != ""]), decreasing = TRUE))[1:10]

PN <- PN %>%
  mutate(
    GPSC_dominant = if_else(GPSC %in% top10_GPSC,                 GPSC,                  "other"),
    ST_dominant   = if_else(as.character(ST) %in% top10_ST,       as.character(ST),      "other"),
    Sero_dominant = if_else(as.character(Serotype) %in% top10_Serotype, as.character(Serotype), "other")
  )

Vaksinedekning

Numerisk oversikt over talet på sekvenserte IPD tilfeller med serotypar dekka av utvalde tilgjengelege vaksinar.

pcv_counts <- PN %>%
  group_by(year, PCV13, PCV15, PCV20, PCV21, PPV23) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(year) %>%
  summarise(
    PCV13       = sum(count[PCV13 == "YES"]),
    PCV15       = sum(count[PCV15 == "YES"]),
    PCV20       = sum(count[PCV20 == "YES"]),
    PCV21       = sum(count[PCV21 == "YES"]),
    PPV23       = sum(count[PPV23 == "YES"]),
    Cases_total = sum(count)
  ) %>%
  pivot_longer(
    cols      = c(PCV13, PCV15, PCV20, PCV21, PPV23, Cases_total),
    names_to  = "CoveredBy",
    values_to = "Count"
  )

ggplot(pcv_counts, aes(x = year, y = Count, fill = CoveredBy, group = CoveredBy)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(aes(label = Count), vjust = 1.6, color = "white",
            position = position_dodge(0.9), size = 4) +
  labs(
    title = "Fig1 - Teoretisk vaksinedekning utvalgte vaksiner",
    x     = "",
    y     = "Antall",
    fill  = "Vaksine"
  ) +
  theme_minimal() +
  scale_fill_manual(values = wes6_1)


GPSC

Oversikt over dei ti vanlegaste GPSC gruppene i perioden fordelt på år.

gpsc_by_year <- PN %>%
  filter(!is.na(year)) %>%
  count(year, GPSC_dominant) %>%
  group_by(year) %>%
  mutate(fraction = n / sum(n))

ggplot(gpsc_by_year, aes(x = factor(year), y = fraction, fill = GPSC_dominant)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_fill_manual(values = wes11) +
  labs(
    title = "Fig2 - Største GPSC grupper (n > 50 totalt) per år",
    x     = "",
    y     = "Fraksjon",
    fill  = "GPSC"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x        = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank()
  )

Topp ti GPSC grupper - rangering over tid (bump-plot)

GPSC_ranks <- PN %>%
  filter(GPSC %in% top10_GPSC) %>%
  mutate(year = as.factor(year)) %>%
  group_by(GPSC, year) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(year) %>%
  mutate(rank = rank(-count, ties.method = "min")) %>%
  ungroup()

ggplot(GPSC_ranks, aes(x = year, y = rank, color = GPSC, group = GPSC)) +
  geom_bump(linewidth = 1.5, smooth = 8) +
  geom_point(size = 3) +
  scale_y_reverse(breaks = 1:10) +
  scale_color_manual(values = wes11) +
  labs(
    title = "Fig3 - GPSC utvikling (topp 10)",
    x     = "",
    y     = "Rank",
    color = "GPSC"
  ) +
  theme_minimal() +
  theme(
    legend.position  = "right",
    axis.text.x      = element_text(angle = 45, hjust = 1)
  )


Serotypar

Oversikt over dei ti vanlegaste serotypane i perioden fordelt på år.

Sero_by_year <- PN %>%
  filter(!is.na(year)) %>%
  group_by(year, Sero_dominant) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(fraction = n / sum(n)) %>%
  ungroup()

ggplot(Sero_by_year, aes(x = factor(year), y = fraction, fill = Sero_dominant)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_fill_manual(values = wes11) +
  labs(
    title = "Fig4 - Serotypar per år",
    x     = "År",
    y     = "Fraksjon",
    fill  = "Vanlegaste serotyper (topp 10)"
  ) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Topp ti serotypar - rangering over tid (bump-plot)

Sero_ranks <- PN %>%
  filter(Serotype %in% top10_Serotype) %>%
  mutate(year = as.factor(year)) %>%
  group_by(Serotype, year) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(year) %>%
  mutate(rank = rank(-count, ties.method = "min")) %>%
  ungroup()

ggplot(Sero_ranks, aes(x = year, y = rank, color = Serotype, group = Serotype)) +
  geom_bump(linewidth = 1.5, smooth = 8) +
  geom_point(size = 3) +
  scale_y_reverse(breaks = 1:10) +
  scale_color_manual(values = wes11) +
  labs(
    title = "Fig5 - Serotype utvikling (topp 10)",
    x     = "",
    y     = "Rank",
    color = "Serotype"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",
    axis.text.x     = element_text(angle = 45, hjust = 1)
  )


Aldersfordeling per serotype (topp 20)

top20_Serotype <- names(sort(table(
  PN$Serotype[!is.na(PN$Serotype) & PN$Serotype != ""]
), decreasing = TRUE))[1:20]

sero_age <- PN %>%
  filter(Serotype %in% top20_Serotype, !is.na(age_group)) %>%
  count(Serotype, age_group) %>%
  group_by(Serotype) %>%
  mutate(fraction = n / sum(n)) %>%
  ungroup() %>%
  mutate(Serotype = factor(Serotype, levels = top20_Serotype))

ggplot(sero_age, aes(x = age_group, y = fraction, fill = age_group)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ Serotype, ncol = 4) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_fill_manual(values = wes6_1) +
  labs(
    title = "Fig6 - Aldersfordeling og serotypar",
    x = "Aldersgruppe",
    y = "Fraksjon"
  ) +
  theme_bw(base_size = 14) +
  theme(
    axis.text.x  = element_text(angle = 45, hjust = 1),
    strip.text   = element_text(face = "bold"),
    panel.grid.major.x = element_blank()
  )

Serotypar per aldersgruppe, per år

age_sero <- PN %>%
  filter(!is.na(age_group), !is.na(Serotype), Serotype != "", !is.na(year)) %>%
  mutate(
    Sero20 = if_else(Serotype %in% top20_Serotype, Serotype, "Andre")
  ) %>%
  count(year, age_group, Sero20) %>%
  mutate(
    Sero20 = factor(Sero20, levels = c(top20_Serotype, "Andre"))
  )

sero20_colors <- setNames(
  c(wes19[seq_along(top20_Serotype)], "#d3d3d3"),
  c(top20_Serotype, "Andre")
)

pA <- ggplot(age_sero, aes(x = age_group, y = n, fill = Sero20)) +
  geom_col(position = "stack") +
  scale_fill_manual(values = sero20_colors, name = "Serotype") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  facet_wrap(~ year, ncol = 1) +
  labs(
    title = "Fig7A) Absolutte tal",
    x     = "Aldersgruppe",
    y     = "Antal tilfelle"
  ) +
  theme_bw(base_size = 14) +
  theme(
    axis.text.x        = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    legend.position    = "none"
  )

age_sero_frac <- age_sero %>%
  group_by(year, age_group) %>%
  mutate(fraction = n / sum(n)) %>%
  ungroup()

pB <- ggplot(age_sero_frac, aes(x = age_group, y = fraction, fill = Sero20)) +
  geom_col(position = "stack") +
  scale_fill_manual(values = sero20_colors, name = "Serotype") +
  scale_y_continuous(labels = percent_format(accuracy = 1),
                     expand = expansion(mult = c(0, 0.02))) +
  facet_wrap(~ year, ncol = 1) +
  labs(
    title = "Fig7B) Relativ fraksjon",
    x     = "Aldersgruppe",
    y     = "Fraksjon"
  ) +
  theme_bw(base_size = 14) +
  theme(
    axis.text.x        = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    legend.position    = "right",
    legend.key.size    = unit(0.4, "cm"),
    legend.text        = element_text(size = 8)
  )

library(patchwork)
pA + pB + plot_layout(widths = c(1, 1))

Penicillin resistens

total_per_year <- PN %>%
  group_by(year) %>%
  summarise(total = n())

mic_per_year <- PN %>%
  filter(!is.na(year)) %>%  # Exclude NA years here
  group_by(year, BEN_MIC, ST, GPSC, Serotype) %>%
  summarise(count = n(), .groups = "drop")

mic_proportions <- mic_per_year %>%
  left_join(total_per_year, by = "year") %>%
  mutate(proportion = count / total,
         ST = as.factor(ST))

high_mic_proportions <- subset(mic_proportions, BEN_MIC >= 2)

Andel ikkje-følsome (MIC > 0.064 mg/l) “Penicillin non-susceptible” [PNS] isolat fordelt på MIC per år

Antal (n) observasjonar printa over kva søyle.

mic_labels <- mic_proportions %>%
  group_by(year, BEN_MIC) %>%
  summarise(
    n_total    = sum(count),
    proportion = sum(count) / first(total),
    .groups    = "drop"
  )

ggplot(mic_proportions, aes(x = factor(BEN_MIC), y = proportion, fill = year)) +
  geom_col() +
  geom_text(data = mic_labels,
            aes(x = factor(BEN_MIC), y = proportion, label = n_total),
            vjust = -0.3, size = 3, color = "black", inherit.aes = FALSE) +
  scale_x_discrete(limits = c("0.12", "0.25", "0.5", "1", "2", "4", "8")) +
  scale_fill_manual(values = wes15) +
  facet_wrap(~year, nrow = 3) +
  labs(
    title = "Fig8 - Andel av PNS pneumokokkisolat fordelt på MIC",
    x     = "Penicillin MIC",
    y     = "Andel av isolat",
    fill  = "År"
  ) +
  theme_bw() +
  ylim(0,0.1) +
  theme(legend.position = "right")

Sekvenstype-fordeling blant penicillin-resistente pneumokokkar (MIC ≥ 2, “PRP”)

ggplot(high_mic_proportions, aes(x = factor(year), y = proportion, fill = ST)) +
  geom_col() +
  scale_fill_manual(values = wes19) +
  labs(
    title = "Fig9 - Penicillinresistente pneumokokker: sekvenstyper",
    x     = "År",
    y     = "Andel av isolat",
    fill  = "ST"
  ) +
  theme_bw() +
  theme(legend.position = "right")

GPSC-fordeling blant PRP-isolat (MIC ≥ 2)

ggplot(high_mic_proportions, aes(x = factor(year), y = proportion, fill = GPSC)) +
  geom_col() +
  scale_fill_manual(values = wes19) +
  labs(
    title = "Fig10 - GPSC-fordeling blant PRP-tilfeller",
    x     = "År",
    y     = "Andel av isolat",
    fill  = "GPSC"
  ) +
  theme_bw() +
  theme(legend.position = "right")

Serotype-fordeling blant PRP-isolat (MIC ≥ 2)

ggplot(high_mic_proportions, aes(x = factor(year), y = proportion, fill = Serotype)) +
  geom_col() +
  scale_fill_manual(values = wes19) +
  labs(
    title = "Fig11 - Serotype-fordeling blant PRP-tilfeller",
    x     = "År",
    y     = "Andel av isolat",
    fill  = "Serotype"
  ) +
  theme_bw() +
  theme(legend.position = "right")