# Load necessary libraries
pacman::p_load(pacman, readr, dplyr, ggplot2, gridExtra, tidyr)

# Load the data
GDO_data_wide <- read_csv("GDO_data_wide.csv")
## Rows: 42664 Columns: 177
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (177): Cancer Site, Year, Tumour Type, Tumour Type 2, Tumour Type 3, Tum...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Define columns that need to be numeric
numeric_columns <- c("Incidence", "Population", "Incidence Rate", "Screening percentage",
                     "Two Week Wait percentage", "GP Referral percentage", 
                     "Other Outpatient percentage", "Emergency Presentation percentage", 
                     "Net survival 03m", "Net survival 06m", "Net survival 12m")

# Define a list of invalid codes that represent missing data
invalid_codes <- c(".a", ".j", ".k", ".c", ".n", ".d", ".m", ".i", ".b", ".p", ".e", ".f", ".g", ".h")

# Function to clean the data by replacing invalid codes with NA
clean_data <- function(column) {
  column_cleaned <- ifelse(column %in% invalid_codes, NA, as.numeric(column))
  return(column_cleaned)
}

# Apply cleaning function to each numeric column individually
for (col in numeric_columns) {
  GDO_data_wide[[col]] <- clean_data(GDO_data_wide[[col]])
}
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
# Now apply the cleaning function to each survival column
survival_columns <- c("Net survival 03m", "Net survival 06m", "Net survival 09m", "Net survival 12m", 
                      "Net survival 24m", "Net survival 36m", "Net survival 48m", "Net survival 60m", 
                      "Net survival 72m", "Net survival 84m", "Net survival 96m")

for (col in survival_columns) {
  GDO_data_wide[[col]] <- clean_data(GDO_data_wide[[col]])
}
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
# Custom function to get summary statistics for each survival period column
get_summary_stats <- function(column) {
  summary_stats <- summary(column, na.rm = TRUE)
  missing_values <- sum(is.na(column))
  return(list(summary = summary_stats, missing = missing_values))
}

# Generate the summary statistics for the survival columns
survival_summary_cleaned <- lapply(GDO_data_wide[survival_columns], get_summary_stats)

# Create a table with the cleaned summary statistics
survival_summary_table_cleaned <- data.frame(
  Survival_Period = survival_columns,
  Min = sapply(survival_summary_cleaned, function(x) x$summary[1]),
  `1st_Quartile` = sapply(survival_summary_cleaned, function(x) x$summary[2]),
  Median = sapply(survival_summary_cleaned, function(x) x$summary[3]),
  Mean = sapply(survival_summary_cleaned, function(x) x$summary[4]),
  `3rd_Quartile` = sapply(survival_summary_cleaned, function(x) x$summary[5]),
  Max = sapply(survival_summary_cleaned, function(x) x$summary[6]),
  Missing_Values = sapply(survival_summary_cleaned, function(x) x$missing)
)

# View the cleaned summary table
print(survival_summary_table_cleaned)
##                        Survival_Period Min X1st_Quartile Median     Mean
## Net survival 03m.Min. Net survival 03m 6.6          81.4   97.6 87.06204
## Net survival 06m.Min. Net survival 06m 2.8          70.2   95.5 82.07709
## Net survival 09m.Min. Net survival 09m 2.2          63.4   94.0 79.15205
## Net survival 12m.Min. Net survival 12m 1.7          58.9   92.7 77.21324
## Net survival 24m.Min. Net survival 24m 1.0          51.3   89.7 73.95428
## Net survival 36m.Min. Net survival 36m 0.6          50.8   87.5 73.13646
## Net survival 48m.Min. Net survival 48m 0.7          51.3   86.1 72.75818
## Net survival 60m.Min. Net survival 60m 0.6          51.3   85.1 72.65370
## Net survival 72m.Min. Net survival 72m 0.5          52.3   84.5 72.85910
## Net survival 84m.Min. Net survival 84m 1.4          55.8   84.4 74.11172
## Net survival 96m.Min. Net survival 96m 1.3          54.3   82.6 73.34354
##                       X3rd_Quartile   Max Missing_Values
## Net survival 03m.Min.       100.000 102.6           8028
## Net survival 06m.Min.        99.700 103.4           9117
## Net survival 09m.Min.        99.500 104.9           9661
## Net survival 12m.Min.        99.300 106.1          10143
## Net survival 24m.Min.        98.800 110.8          16748
## Net survival 36m.Min.        98.400 119.6          22168
## Net survival 48m.Min.        98.100 126.0          26981
## Net survival 60m.Min.        97.925 129.4          31284
## Net survival 72m.Min.        97.800 141.4          35422
## Net survival 84m.Min.        97.900 146.5          39396
## Net survival 96m.Min.        97.600 156.5          41279
# Define a general function to pivot the data into long format
pivot_data_longer <- function(data, id_cols, pivot_cols, names_to = "Metric", values_to = "Value") {
  data %>%
    pivot_longer(
      cols = all_of(pivot_cols),
      names_to = names_to,
      values_to = values_to
    )
}

# Convert data to long format for plotting
GDO_data_long <- pivot_data_longer(
  data = GDO_data_wide,
  id_cols = "Cancer Site",
  pivot_cols = c("Screening percentage", "GP Referral percentage", "Two Week Wait percentage", 
                 "Other Outpatient percentage", "Emergency Presentation percentage"),
  names_to = "Metric",
  values_to = "Percentage"
)

# Filter data for Tumour Type == "All"
GDO_data_long_filtered <- GDO_data_wide %>%
  filter(`Tumour Type` == "All") %>%
  select(Year, `Cancer Site`, `Screening percentage`, `GP Referral percentage`, 
         `Two Week Wait percentage`, `Other Outpatient percentage`, `Emergency Presentation percentage`) %>%
  pivot_data_longer(
    data = .,
    id_cols = c("Year", "Cancer Site"),
    pivot_cols = c("Screening percentage", "GP Referral percentage", 
                   "Two Week Wait percentage", "Other Outpatient percentage", 
                   "Emergency Presentation percentage"),
    names_to = "Metric",
    values_to = "Percentage"
  )

# Aggregate data by Cancer Site and Year, then calculate the mean for each metric
GDO_data_avg <- GDO_data_wide %>%
  group_by(`Cancer Site`) %>%
  summarize(
    avg_screening = mean(`Screening percentage`, na.rm = TRUE),
    avg_gp_referral = mean(`GP Referral percentage`, na.rm = TRUE),
    avg_two_week_wait = mean(`Two Week Wait percentage`, na.rm = TRUE),
    avg_other_outpatient = mean(`Other Outpatient percentage`, na.rm = TRUE),
    avg_emergency_presentation = mean(`Emergency Presentation percentage`, na.rm = TRUE)
  ) %>%
  pivot_data_longer(
    data = .,
    id_cols = "Cancer Site",
    pivot_cols = starts_with("avg_"),
    names_to = "Metric",
    values_to = "Average Percentage"
  )

# Plot the averaged data
ggplot(GDO_data_avg, aes(x = reorder(`Cancer Site`, -`Average Percentage`), y = `Average Percentage`, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +  # Side-by-side bars for each metric
  coord_flip() +  # Flip the axes for better readability
  labs(title = "Average Screening, GP Referral, Two Week Wait, Other Outpatient, and Emergency Presentation Percentages by Cancer Site",
       x = "Cancer Site",
       y = "Average Percentage") +
  scale_fill_manual(values = c("avg_screening" = "steelblue", 
                               "avg_gp_referral" = "orange", 
                               "avg_two_week_wait" = "darkgreen",
                               "avg_other_outpatient" = "purple",
                               "avg_emergency_presentation" = "red")) +
  theme_minimal()
## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_bar()`).