# Load necessary libraries
pacman::p_load(pacman, readr, dplyr, ggplot2, gridExtra)

# Load the data
GDO_data_wide <- read_csv("GDO_data_wide.csv")
## Rows: 42664 Columns: 177
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (177): Cancer Site, Year, Tumour Type, Tumour Type 2, Tumour Type 3, Tum...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert columns to appropriate data types
numeric_columns <- c("Incidence", "Population", "Incidence Rate", "Screening percentage",
                     "Two Week Wait percentage", "GP Referral percentage", 
                     "Net survival 03m", "Net survival 06m", "Net survival 12m")

# Convert specified columns to numeric
GDO_data_wide <- GDO_data_wide %>%
  mutate(across(all_of(numeric_columns), as.numeric))
## Warning: There were 9 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(all_of(numeric_columns), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 8 remaining warnings.
# Define survival columns
survival_columns <- c("Net survival 03m", "Net survival 06m", "Net survival 09m", "Net survival 12m", 
                      "Net survival 24m", "Net survival 36m", "Net survival 48m", "Net survival 60m", 
                      "Net survival 72m", "Net survival 84m", "Net survival 96m")

# Define a list of invalid codes that represent missing data
invalid_codes <- c(".a", ".j", ".k", ".c", ".n", ".d", ".m", ".i", ".b", ".p", ".e", ".f", ".g", ".h")

# Function to clean the data by replacing invalid codes with NA
clean_data <- function(column) {
  column_cleaned <- ifelse(column %in% invalid_codes, NA, as.numeric(column))
  return(column_cleaned)
}

# Apply the cleaning function to each survival column
for (col in survival_columns) {
  GDO_data_wide[[col]] <- clean_data(GDO_data_wide[[col]])
}
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
# Custom function to get summary statistics for each survival period column
get_summary_stats <- function(column) {
  summary_stats <- summary(column, na.rm = TRUE)
  missing_values <- sum(is.na(column))
  return(list(summary = summary_stats, missing = missing_values))
}

# Generate the summary statistics for the survival columns
survival_summary_cleaned <- lapply(GDO_data_wide[survival_columns], get_summary_stats)

# Create a table with the cleaned summary statistics
survival_summary_table_cleaned <- data.frame(
  Survival_Period = survival_columns,
  Min = sapply(survival_summary_cleaned, function(x) x$summary[1]),
  `1st_Quartile` = sapply(survival_summary_cleaned, function(x) x$summary[2]),
  Median = sapply(survival_summary_cleaned, function(x) x$summary[3]),
  Mean = sapply(survival_summary_cleaned, function(x) x$summary[4]),
  `3rd_Quartile` = sapply(survival_summary_cleaned, function(x) x$summary[5]),
  Max = sapply(survival_summary_cleaned, function(x) x$summary[6]),
  Missing_Values = sapply(survival_summary_cleaned, function(x) x$missing)
)

# View the cleaned summary table
print(survival_summary_table_cleaned)
##                        Survival_Period Min X1st_Quartile Median     Mean
## Net survival 03m.Min. Net survival 03m 6.6          81.4   97.6 87.06204
## Net survival 06m.Min. Net survival 06m 2.8          70.2   95.5 82.07709
## Net survival 09m.Min. Net survival 09m 2.2          63.4   94.0 79.15205
## Net survival 12m.Min. Net survival 12m 1.7          58.9   92.7 77.21324
## Net survival 24m.Min. Net survival 24m 1.0          51.3   89.7 73.95428
## Net survival 36m.Min. Net survival 36m 0.6          50.8   87.5 73.13646
## Net survival 48m.Min. Net survival 48m 0.7          51.3   86.1 72.75818
## Net survival 60m.Min. Net survival 60m 0.6          51.3   85.1 72.65370
## Net survival 72m.Min. Net survival 72m 0.5          52.3   84.5 72.85910
## Net survival 84m.Min. Net survival 84m 1.4          55.8   84.4 74.11172
## Net survival 96m.Min. Net survival 96m 1.3          54.3   82.6 73.34354
##                       X3rd_Quartile   Max Missing_Values
## Net survival 03m.Min.       100.000 102.6           8028
## Net survival 06m.Min.        99.700 103.4           9117
## Net survival 09m.Min.        99.500 104.9           9661
## Net survival 12m.Min.        99.300 106.1          10143
## Net survival 24m.Min.        98.800 110.8          16748
## Net survival 36m.Min.        98.400 119.6          22168
## Net survival 48m.Min.        98.100 126.0          26981
## Net survival 60m.Min.        97.925 129.4          31284
## Net survival 72m.Min.        97.800 141.4          35422
## Net survival 84m.Min.        97.900 146.5          39396
## Net survival 96m.Min.        97.600 156.5          41279
# Create 'incidence_summary' as a summary table for mean incidence by 'Cancer Site'
incidence_summary <- GDO_data_wide %>%
  group_by(`Cancer Site`) %>%
  summarise(mean_incidence = mean(`Incidence Rate`, na.rm = TRUE))

# Bar plot for average incidence rate by cancer site
ggplot(incidence_summary, aes(x = reorder(`Cancer Site`, -mean_incidence), y = mean_incidence)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Average Incidence Rate by Cancer Site",
       x = "Cancer Site",
       y = "Mean Incidence Rate") +
  theme_minimal()

# Scatter plot for survival vs. incidence rate (example for 12m survival)
p1 <- ggplot(GDO_data_wide, aes(x = `Incidence Rate`, y = `Net survival 12m`)) +
  geom_point(color = "steelblue", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", linetype = "dashed") +
  labs(title = "Scatter Plot: Incidence Rate vs. Net Survival at 12 Months", 
       x = "Incidence Rate", y = "Net Survival 12m") +
  theme_minimal()

# Additional scatter plot for survival vs. population (example for 03m survival)
p2 <- ggplot(GDO_data_wide, aes(x = `Population`, y = `Net survival 03m`)) +
  geom_point(color = "steelblue", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", linetype = "dashed") +
  labs(title = "Scatter Plot: Population vs. Net Survival at 3 Months", 
       x = "Population", y = "Net Survival 03m") +
  theme_minimal() +
  scale_x_continuous(labels = scales::label_number(scale = 1))

grid.arrange(p1, p2, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 14697 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 14697 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 13509 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 13509 rows containing missing values or values outside the scale range
## (`geom_point()`).