# Load necessary libraries
pacman::p_load(pacman, readr, dplyr, ggplot2, reshape2, scales)

# Import VIW_FNT.csv, decoding its text as UTF-8
my_data <- read_csv(
  file = "VIW_FNT.csv",
  locale = locale(encoding = "UTF-8")
)
## Rows: 156291 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (12): WHOREGION, FLUSEASON, HEMISPHERE, ITZ, COUNTRY_CODE, COUNTRY_AREA...
## dbl  (35): ISO_YEAR, ISO_WEEK, MMWR_YEAR, MMWR_WEEK, SPEC_PROCESSED_NB, SPEC...
## date  (2): ISO_WEEKSTARTDATE, MMWR_WEEKSTARTDATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Influenza subtype count columns analysed below; the names must match the
# dataset's column headers exactly.
influenza_subtypes <- c(
  "AH1N12009", "AH1", "AH3",
  "BVIC_2DEL", "BVIC_3DEL", "BVIC_NODEL", "BVIC_DELUNK", "BYAM"
)

# Fail fast if any expected subtype column is absent, so later steps do not
# break with a less helpful error.
missing_columns <- setdiff(influenza_subtypes, colnames(my_data))
if (length(missing_columns) > 0) {
  # stop() concatenates its arguments itself, so no outer paste() is needed;
  # call. = FALSE keeps the evaluator's internal call (e.g. when the script is
  # run through source()) out of the reported error.
  stop(
    "The following columns are missing from the dataset: ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

# Keep only the rows whose subtype counts add up to more than zero
# (missing counts are ignored when computing the row total)
df_filtered <- filter(
  my_data,
  rowSums(select(my_data, all_of(influenza_subtypes)), na.rm = TRUE) > 0
)

# Reshape to long format (one row per original row and subtype column) and
# give the two identifier columns shorter names
df_melted <- melt(
  df_filtered,
  id.vars = c("WHOREGION", "ISO_YEAR"),
  measure.vars = influenza_subtypes
) %>%
  rename(Region = WHOREGION, Year = ISO_YEAR)

# Sum the counts within every Year/Region combination
df_agg <- aggregate(value ~ Year + Region, data = df_melted, FUN = sum)

# Lookup table: region code -> display name
region_names <- c(
  AFR  = "Africa",
  AMR  = "Americas",
  EMR  = "Eastern Mediterranean",
  EUR  = "Europe",
  SEAR = "South-East Asia",
  WPR  = "Western Pacific"
)

# Recode Region as a factor labelled with the display names; any code that is
# not listed in region_names becomes NA
df_agg[["Region"]] <- factor(
  df_agg[["Region"]],
  levels = names(region_names),
  labels = unname(region_names)
)

# Discard the rows whose region code was not recognised
df_agg <- filter(df_agg, !is.na(Region))

# Plot the yearly totals as lines with points, one panel per region
ggplot(data = df_agg, mapping = aes(x = Year, y = value, colour = Region)) +
  geom_line() +
  geom_point() +
  # Two columns of panels, each with its own y-axis range
  facet_wrap(vars(Region), ncol = 2, scales = "free_y") +
  # Show y-axis values with comma thousands separators
  scale_y_continuous(labels = comma) +
  labs(
    title = "Influenza Cases Over Time by Region",
    x = "Year",
    y = "Number of Cases"
  ) +
  theme_minimal()