# Load necessary libraries
pacman::p_load(pacman, readr, dplyr, ggplot2, gridExtra, scales)
# Read GDO_data_wide.csv (path is relative to the working directory).
# No column types are specified, so readr guesses them from the file.
GDO_data_wide <- readr::read_csv(file = "GDO_data_wide.csv")
## Rows: 42664 Columns: 177
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (177): Cancer Site, Year, Tumour Type, Tumour Type 2, Tumour Type 3, Tum...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Names of the columns that are coerced to numeric below
numeric_columns <- c(
  "Incidence",
  "Population",
  "Incidence Rate",
  "Screening percentage",
  "Two Week Wait percentage",
  "GP Referral percentage",
  "Net survival 03m",
  "Net survival 06m",
  "Net survival 12m"
)

# Run as.numeric() over every listed column. Entries that cannot be parsed
# as a number become NA, and as.numeric() raises a coercion warning for them.
# all_of() makes the call fail loudly if any listed column is absent.
GDO_data_wide <- mutate(
  GDO_data_wide,
  across(all_of(numeric_columns), as.numeric)
)
## Warning: There were 9 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(all_of(numeric_columns), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 8 remaining warnings.
# Net survival column names for the periods 03m through 96m, built from the
# shared "Net survival " prefix and the per-period suffix.
survival_columns <- paste0(
  "Net survival ",
  c("03m", "06m", "09m", "12m", "24m", "36m", "48m", "60m", "72m", "84m", "96m")
)
# Placeholder codes that appear in the data in place of a number and must be
# treated as missing.
invalid_codes <- c(".a", ".j", ".k", ".c", ".n", ".d", ".m", ".i", ".b", ".p", ".e", ".f", ".g", ".h")

# Convert a column to numeric, turning known missing-data codes into NA.
#
# Arguments:
#   column  Vector to clean (typically character; numeric input is passed
#           through as.numeric() unchanged in value).
#   codes   Character vector of placeholder codes to treat as missing.
#           Defaults to the script-level `invalid_codes`.
#
# Returns:
#   A numeric (double) vector the same length as `column`. The result is
#   always numeric, even when every entry is a code or the input is empty.
#
# The codes are blanked out BEFORE as.numeric() runs, so they no longer
# trigger "NAs introduced by coercion". Any warning that still appears now
# points at a value that is neither a number nor a listed code.
clean_data <- function(column, codes = invalid_codes) {
  is_code <- column %in% codes
  column[is_code] <- NA
  as.numeric(column)
}
# Run clean_data() over every survival column and write the cleaned vectors
# back into the same columns of GDO_data_wide.
GDO_data_wide[survival_columns] <- lapply(
  GDO_data_wide[survival_columns],
  clean_data
)
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
## Warning in ifelse(column %in% invalid_codes, NA, as.numeric(column)): NAs
## introduced by coercion
# Build one boxplot per survival column (03m to 96m). The result is a list of
# ggplot objects in the same order as survival_columns.
boxplot_plots <- lapply(survival_columns, function(survival_col) {
  # Strip the shared prefix so the title shows only the period, e.g. "03m"
  period_label <- gsub("Net survival ", "", survival_col)
  plot_title <- paste("Boxplot of Net Survival at", period_label)

  # A constant x value places every row in a single box
  base_plot <- ggplot(
    data = GDO_data_wide,
    mapping = aes(x = factor(1), y = .data[[survival_col]])
  )

  base_plot +
    geom_boxplot(fill = "steelblue", color = "black") +
    # Plain number labels on the y axis (avoids exponential notation)
    scale_y_continuous(labels = scales::label_number(scale = 1)) +
    labs(title = plot_title, y = survival_col) +
    theme_minimal() +
    # The x axis carries no information, so hide its title and tick labels
    theme(axis.title.x = element_blank(), axis.text.x = element_blank())
})
# Draw the first three boxplots side by side (1 row, 3 columns)
grid.arrange(
  grobs = boxplot_plots[seq_len(3)],
  nrow = 1,
  ncol = 3
)
## Warning: Removed 8028 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 9117 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 9661 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Draw the remaining eight boxplots (items 4 to 11) in a grid of
# 4 rows by 2 columns
grid.arrange(
  grobs = boxplot_plots[seq(4, 11)],
  nrow = 4,
  ncol = 2
)
## Warning: Removed 10143 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 16748 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 22168 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 26981 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 31284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 35422 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 39396 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 41279 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Build one histogram per survival column (03m to 96m). The result is a list
# of ggplot objects in the same order as survival_columns.
histogram_plots <- lapply(survival_columns, function(survival_col) {
  # Strip the shared prefix so the title shows only the period, e.g. "03m"
  period_label <- gsub("Net survival ", "", survival_col)
  plot_title <- paste("Histogram of Net Survival at", period_label)

  base_plot <- ggplot(
    data = GDO_data_wide,
    mapping = aes(x = .data[[survival_col]])
  )

  base_plot +
    # Bins are one unit wide on the column's own scale
    geom_histogram(
      binwidth = 1,
      fill = "steelblue",
      color = "black",
      alpha = 0.7
    ) +
    labs(title = plot_title, x = survival_col, y = "Frequency") +
    theme_minimal()
})
# Draw the first three histograms side by side (1 row, 3 columns)
grid.arrange(
  grobs = histogram_plots[seq_len(3)],
  nrow = 1,
  ncol = 3
)
## Warning: Removed 8028 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 9117 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 9661 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Draw the remaining eight histograms (items 4 to 11) in a grid of
# 4 rows by 2 columns
grid.arrange(
  grobs = histogram_plots[seq(4, 11)],
  nrow = 4,
  ncol = 2
)
## Warning: Removed 10143 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 16748 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 22168 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 26981 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 31284 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 35422 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 39396 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 41279 rows containing non-finite outside the scale range
## (`stat_bin()`).
