# Load necessary libraries
pacman::p_load(pacman, readr, ggplot2, tidyverse, summarytools, Amelia, gridExtra, RColorBrewer)

# Read the two provider extracts ----
# File locations are held in variables so they can be changed in one place.
nhs_file_path <- "NHS.csv"
ind_file_path <- "IND.csv"

# NHS extract. Console output recorded with this script reported
# 64328 rows x 27 columns (7 character, 20 numeric), including
# `Region Name`, `Trust Code`, `Year` and `Month#`.
nhs_data <- read_csv(file = nhs_file_path)

# Independent-sector extract. Console output recorded with this script
# reported 44991 rows x 27 columns (7 character, 20 numeric); it carries
# `Company Code` where the NHS extract carries `Trust Code`.
ind_data <- read_csv(file = ind_file_path)
# Plot colours ----
# One fixed colour per provider type (the third entry of a 3-colour
# sequential palette) plus an 8-colour qualitative palette for combined plots.
nhs_colour <- brewer.pal(n = 3, name = "Blues")[[3]]
ind_colour <- brewer.pal(n = 3, name = "Greens")[[3]]
combined_palette <- brewer.pal(n = 8, name = "Set1")

# Histograms of Completed Procedures (by Submission Date) ----
# One histogram per provider type, 10-unit bins, shown next to each other.
p1 <- ggplot(
  data = nhs_data,
  mapping = aes(x = `Completed Procedures (by Submission Date)`)
) +
  labs(
    title = "NHS: Distribution of Completed Procedures (by Submission Date)",
    x = "Completed Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = nhs_colour, binwidth = 10) +
  theme()

p2 <- ggplot(
  data = ind_data,
  mapping = aes(x = `Completed Procedures (by Submission Date)`)
) +
  labs(
    title = "Independent: Distribution of Completed Procedures (by Submission Date)",
    x = "Completed Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = ind_colour, binwidth = 10) +
  theme()

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p1, p2), ncol = 2)

# Histograms of Completed Procedures (by Operation Date) ----
# Same layout as the submission-date histograms: 10-unit bins, one panel
# per provider type.
p3 <- ggplot(
  data = nhs_data,
  mapping = aes(x = `Completed Procedures (by Operation Date)`)
) +
  labs(
    title = "NHS: Distribution of Completed Procedures (by Operation Date)",
    x = "Completed Procedures (by Operation Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = nhs_colour, binwidth = 10) +
  theme()

p4 <- ggplot(
  data = ind_data,
  mapping = aes(x = `Completed Procedures (by Operation Date)`)
) +
  labs(
    title = "Independent: Distribution of Completed Procedures (by Operation Date)",
    x = "Completed Procedures (by Operation Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = ind_colour, binwidth = 10) +
  theme()

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p3, p4), ncol = 2)

# Scatter plots: submission-date counts against operation-date counts ----
# One panel per provider type, each point coloured with that provider's colour.
p5 <- ggplot(
  data = nhs_data,
  mapping = aes(
    x = `Completed Procedures (by Submission Date)`,
    y = `Completed Procedures (by Operation Date)`
  )
) +
  labs(
    title = "NHS: Completed Procedures (Submission Date) vs Operation Date",
    x = "Completed Procedures (by Submission Date)",
    y = "Completed Procedures (by Operation Date)"
  ) +
  geom_point(colour = nhs_colour) +
  theme()

p6 <- ggplot(
  data = ind_data,
  mapping = aes(
    x = `Completed Procedures (by Submission Date)`,
    y = `Completed Procedures (by Operation Date)`
  )
) +
  labs(
    title = "Independent: Completed Procedures (Submission Date) vs Operation Date",
    x = "Completed Procedures (by Submission Date)",
    y = "Completed Procedures (by Operation Date)"
  ) +
  geom_point(colour = ind_colour) +
  theme()

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p5, p6), ncol = 2)

# Institutions with the most rows in each extract ----
# To keep the combined bar plot readable, only the 10 trusts / companies that
# occur most often (by row count) are kept. count() adds the column `n`;
# slice_max() names that column explicitly instead of relying on the
# superseded top_n() guessing it, and, like top_n(), keeps ties.
top_n_trusts <- nhs_data %>%
  count(`Trust Name`, sort = TRUE) %>%
  slice_max(n, n = 10) %>%
  pull(`Trust Name`)

top_n_companies <- ind_data %>%
  count(`Company Name`, sort = TRUE) %>%
  slice_max(n, n = 10) %>%
  pull(`Company Name`)
# Stack the most frequent NHS trusts and Independent companies ----
# Each extract is restricted to its most frequent institutions, its name
# column is renamed to the shared `Institution Name`, and a `Source` label
# records which extract the row came from.
combined_data <- bind_rows(
  nhs_data %>%
    filter(`Trust Name` %in% top_n_trusts) %>%
    rename(`Institution Name` = `Trust Name`) %>%
    mutate(Source = "NHS"),
  ind_data %>%
    filter(`Company Name` %in% top_n_companies) %>%
    rename(`Institution Name` = `Company Name`) %>%
    mutate(Source = "Independent")
)

# Row counts per institution, one facet per region ----
# Bars are filled by Source (NHS / Independent) using the combined palette;
# x-axis labels are rotated 90 degrees because institution names are long.
ggplot(
  data = combined_data,
  mapping = aes(x = `Institution Name`, fill = Source)
) +
  geom_bar(colour = "black", position = "dodge") +
  facet_wrap(~`Region Name`) +
  scale_fill_manual(values = combined_palette) +
  labs(
    title = "Top 10 Institutions Across Regions (NHS & Independent)",
    x = "Institution Name",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Hip procedures (by Submission Date): histograms ----

# NHS, 10-unit bins
p7 <- ggplot(
  data = nhs_data,
  mapping = aes(x = `Hip Procedures (by Submission Date)`)
) +
  labs(
    title = "NHS: Hip Procedures (by Submission Date)",
    x = "Hip Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = nhs_colour, binwidth = 10) +
  theme()

# Independent, 10-unit bins
p8 <- ggplot(
  data = ind_data,
  mapping = aes(x = `Hip Procedures (by Submission Date)`)
) +
  labs(
    title = "Independent: Hip Procedures (by Submission Date)",
    x = "Hip Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = ind_colour, binwidth = 10) +
  theme()

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p7, p8), ncol = 2)

# Hip procedures (by Operation Date): Independent vs NHS boxplot ----
# Each layer gets its own data frame through `data =`, so the column is
# referenced by name inside aes() rather than extracted with `$`. The constant
# x value places each provider's box at its own position on the axis.
ggplot() +
  geom_boxplot(
    data = ind_data,
    mapping = aes(x = "Independent", y = `Hip Procedures (by Operation Date)`),
    fill = ind_colour
  ) +
  geom_boxplot(
    data = nhs_data,
    mapping = aes(x = "NHS", y = `Hip Procedures (by Operation Date)`),
    fill = nhs_colour
  ) +
  theme() +
  labs(
    title = "Hip Procedures (by Operation Date) - Independent vs NHS",
    x = "Provider",
    y = "Hip Procedures"
  )

# Consent Rate (by Operation Date) by region: boxplots ----
# One panel per provider type; region labels are tilted 45 degrees so they
# do not overlap.
p9 <- ggplot(
  data = nhs_data,
  mapping = aes(x = `Region Name`, y = `Consent Rate (by Operation Date)`)
) +
  labs(
    title = "NHS: Consent Rate (by Operation Date) by Region",
    x = "Region",
    y = "Consent Rate (by Operation Date)"
  ) +
  geom_boxplot(colour = "black", fill = nhs_colour) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p10 <- ggplot(
  data = ind_data,
  mapping = aes(x = `Region Name`, y = `Consent Rate (by Operation Date)`)
) +
  labs(
    title = "Independent: Consent Rate (by Operation Date) by Region",
    x = "Region",
    y = "Consent Rate (by Operation Date)"
  ) +
  geom_boxplot(colour = "black", fill = ind_colour) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p9, p10), ncol = 2)

# Knee procedures (by Submission Date): histograms ----

# NHS, 10-unit bins
p7a <- ggplot(
  data = nhs_data,
  mapping = aes(x = `Knee Procedures (by Submission Date)`)
) +
  labs(
    title = "NHS: Knee Procedures (by Submission Date)",
    x = "Knee Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = nhs_colour, binwidth = 10) +
  theme()

# Independent, 10-unit bins
p8a <- ggplot(
  data = ind_data,
  mapping = aes(x = `Knee Procedures (by Submission Date)`)
) +
  labs(
    title = "Independent: Knee Procedures (by Submission Date)",
    x = "Knee Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = ind_colour, binwidth = 10) +
  theme()

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p7a, p8a), ncol = 2)

# Knee procedures (by Operation Date): Independent vs NHS boxplot ----
# Each layer gets its own data frame through `data =`, so the column is
# referenced by name inside aes() rather than extracted with `$`. The constant
# x value places each provider's box at its own position on the axis.
ggplot() +
  geom_boxplot(
    data = ind_data,
    mapping = aes(x = "Independent", y = `Knee Procedures (by Operation Date)`),
    fill = ind_colour
  ) +
  geom_boxplot(
    data = nhs_data,
    mapping = aes(x = "NHS", y = `Knee Procedures (by Operation Date)`),
    fill = nhs_colour
  ) +
  theme() +
  labs(
    title = "Knee Procedures (by Operation Date) - Independent vs NHS",
    x = "Provider",
    y = "Knee Procedures"
  )

# Ankle procedures (by Submission Date): histograms ----

# NHS, 10-unit bins
p7b <- ggplot(nhs_data, aes(x = `Ankle Procedures (by Submission Date)`)) +
  geom_histogram(binwidth = 10, fill = nhs_colour, color = "black") +
  theme() +
  labs(
    title = "NHS: Ankle Procedures (by Submission Date)",
    x = "Ankle Procedures (by Submission Date)",
    y = "Frequency"
  )

# Independent, 10-unit bins.
# The x-axis label previously read "Knee Procedures ..." (copy-paste slip);
# it now matches the plotted Ankle column and the NHS panel.
p8b <- ggplot(ind_data, aes(x = `Ankle Procedures (by Submission Date)`)) +
  geom_histogram(binwidth = 10, fill = ind_colour, color = "black") +
  theme() +
  labs(
    title = "Independent: Ankle Procedures (by Submission Date)",
    x = "Ankle Procedures (by Submission Date)",
    y = "Frequency"
  )

# NHS on the left, Independent on the right
grid.arrange(p7b, p8b, ncol = 2)

# Ankle procedures (by Operation Date): Independent vs NHS boxplot ----
# Each layer gets its own data frame through `data =`, so the column is
# referenced by name inside aes() rather than extracted with `$`. The constant
# x value places each provider's box at its own position on the axis.
ggplot() +
  geom_boxplot(
    data = ind_data,
    mapping = aes(x = "Independent", y = `Ankle Procedures (by Operation Date)`),
    fill = ind_colour
  ) +
  geom_boxplot(
    data = nhs_data,
    mapping = aes(x = "NHS", y = `Ankle Procedures (by Operation Date)`),
    fill = nhs_colour
  ) +
  theme() +
  labs(
    title = "Ankle Procedures (by Operation Date) - Independent vs NHS",
    x = "Provider",
    y = "Ankle Procedures"
  )

# Elbow procedures (by Submission Date): histograms ----

# NHS, 10-unit bins
p7c <- ggplot(
  data = nhs_data,
  mapping = aes(x = `Elbow Procedures (by Submission Date)`)
) +
  labs(
    title = "NHS: Elbow Procedures (by Submission Date)",
    x = "Elbow Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = nhs_colour, binwidth = 10) +
  theme()

# Independent, 10-unit bins
p8c <- ggplot(
  data = ind_data,
  mapping = aes(x = `Elbow Procedures (by Submission Date)`)
) +
  labs(
    title = "Independent: Elbow Procedures (by Submission Date)",
    x = "Elbow Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = ind_colour, binwidth = 10) +
  theme()

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p7c, p8c), ncol = 2)

# Elbow procedures (by Operation Date): Independent vs NHS boxplot ----
# Each layer gets its own data frame through `data =`, so the column is
# referenced by name inside aes() rather than extracted with `$`. The constant
# x value places each provider's box at its own position on the axis.
ggplot() +
  geom_boxplot(
    data = ind_data,
    mapping = aes(x = "Independent", y = `Elbow Procedures (by Operation Date)`),
    fill = ind_colour
  ) +
  geom_boxplot(
    data = nhs_data,
    mapping = aes(x = "NHS", y = `Elbow Procedures (by Operation Date)`),
    fill = nhs_colour
  ) +
  theme() +
  labs(
    title = "Elbow Procedures (by Operation Date) - Independent vs NHS",
    x = "Provider",
    y = "Elbow Procedures"
  )

# Shoulder procedures (by Submission Date): histograms ----

# NHS, 10-unit bins
p7d <- ggplot(
  data = nhs_data,
  mapping = aes(x = `Shoulder Procedures (by Submission Date)`)
) +
  labs(
    title = "NHS: Shoulder Procedures (by Submission Date)",
    x = "Shoulder Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = nhs_colour, binwidth = 10) +
  theme()

# Independent, 10-unit bins
p8d <- ggplot(
  data = ind_data,
  mapping = aes(x = `Shoulder Procedures (by Submission Date)`)
) +
  labs(
    title = "Independent: Shoulder Procedures (by Submission Date)",
    x = "Shoulder Procedures (by Submission Date)",
    y = "Frequency"
  ) +
  geom_histogram(colour = "black", fill = ind_colour, binwidth = 10) +
  theme()

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p7d, p8d), ncol = 2)

# Shoulder procedures (by Operation Date): Independent vs NHS boxplot ----
# Each layer gets its own data frame through `data =`, so the column is
# referenced by name inside aes() rather than extracted with `$`. The constant
# x value places each provider's box at its own position on the axis.
ggplot() +
  geom_boxplot(
    data = ind_data,
    mapping = aes(
      x = "Independent",
      y = `Shoulder Procedures (by Operation Date)`
    ),
    fill = ind_colour
  ) +
  geom_boxplot(
    data = nhs_data,
    mapping = aes(x = "NHS", y = `Shoulder Procedures (by Operation Date)`),
    fill = nhs_colour
  ) +
  theme() +
  labs(
    title = "Shoulder Procedures (by Operation Date) - Independent vs NHS",
    x = "Provider",
    y = "Shoulder Procedures"
  )

# Top 10 institutions by total completed procedures (by submission date) ----
# Totals are summed per institution. na.rm = TRUE keeps one missing value from
# turning an institution's whole total into NA, which would make it drop out of
# the ranking. slice_max() replaces the superseded top_n() and, like it, keeps
# ties; arrange() makes the descending order explicit.

# NHS trusts
nhs_top_institutions <- nhs_data %>%
  group_by(`Trust Name`) %>%
  summarise(
    `Completed Procedures` = sum(
      `Completed Procedures (by Submission Date)`,
      na.rm = TRUE
    )
  ) %>%
  slice_max(`Completed Procedures`, n = 10) %>%
  arrange(desc(`Completed Procedures`))

# Independent companies
ind_top_institutions <- ind_data %>%
  group_by(`Company Name`) %>%
  summarise(
    `Completed Procedures` = sum(
      `Completed Procedures (by Submission Date)`,
      na.rm = TRUE
    )
  ) %>%
  slice_max(`Completed Procedures`, n = 10) %>%
  arrange(desc(`Completed Procedures`))

# Bar charts of the top institutions by completed procedures ----
# Bars are ordered by total via reorder(); y-axis values are printed with
# thousands separators.

# NHS trusts (x labels tilted 60 degrees)
p11 <- ggplot(
  data = nhs_top_institutions,
  mapping = aes(
    x = reorder(`Trust Name`, `Completed Procedures`),
    y = `Completed Procedures`
  )
) +
  labs(
    title = "Top 10 NHS Trusts by Completed Procedures (Submission Date)",
    x = "Trust Name",
    y = "Completed Procedures"
  ) +
  geom_bar(stat = "identity", fill = nhs_colour, colour = "black") +
  scale_y_continuous(labels = scales::comma) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Independent companies (x labels tilted 45 degrees)
p12 <- ggplot(
  data = ind_top_institutions,
  mapping = aes(
    x = reorder(`Company Name`, `Completed Procedures`),
    y = `Completed Procedures`
  )
) +
  labs(
    title = "Top 10 Independent Companies by Completed Procedures (Submission Date)",
    x = "Company Name",
    y = "Completed Procedures"
  ) +
  geom_bar(stat = "identity", fill = ind_colour, colour = "black") +
  scale_y_continuous(labels = scales::comma) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# NHS on the left, Independent on the right
grid.arrange(grobs = list(p11, p12), ncol = 2)

# Stack both extracts for the time-series plots ----
# Every row is tagged with its Source, then given a Date set to the first day
# of the month identified by its Year and Month# columns.
combined_time_data <- bind_rows(
  nhs_data %>% mutate(Source = "NHS"),
  ind_data %>% mutate(Source = "Independent")
) %>%
  mutate(
    Date = as.Date(
      paste(Year, `Month#`, "1", sep = "-"),
      format = "%Y-%m-%d"
    )
  )

# Time series of monthly totals, NHS vs Independent ----

# Build a line chart of `y_column` summed per Date, one line per Source.
#
# Date in combined_time_data is derived from Year and Month# only, while the
# rows also carry trust / company identifiers, so several rows can share one
# Date within a Source. Drawing geom_line() straight through those rows
# produces vertical zig-zags instead of a trend, so the values are summed per
# Date and Source before the line is drawn.
#
# data     Data frame with Date, Source and the column named by `y_column`.
# y_column Name (string) of the numeric column to total.
# title    Plot title.
# y_label  y-axis label.
# Returns a ggplot object.
plot_monthly_total <- function(data, y_column, title, y_label) {
  ggplot(data, aes(x = Date, y = .data[[y_column]], color = Source)) +
    # `linewidth` replaces the `size` aesthetic, which is deprecated for lines
    # since ggplot2 3.4.0.
    stat_summary(fun = sum, geom = "line", linewidth = 1) +
    labs(title = title, x = "Date", y = y_label) +
    scale_color_manual(values = c(NHS = nhs_colour, Independent = ind_colour)) +
    theme(legend.position = "top")
}

# Completed Procedures (by Submission Date)
p13 <- plot_monthly_total(
  combined_time_data,
  y_column = "Completed Procedures (by Submission Date)",
  title = "Time Series: Completed Procedures (by Submission Date)",
  y_label = "Completed Procedures"
)

# Hip Procedures (by Submission Date)
p14 <- plot_monthly_total(
  combined_time_data,
  y_column = "Hip Procedures (by Submission Date)",
  title = "Time Series: Hip Procedures (by Submission Date)",
  y_label = "Hip Procedures"
)

# Knee Procedures (by Submission Date)
p15 <- plot_monthly_total(
  combined_time_data,
  y_column = "Knee Procedures (by Submission Date)",
  title = "Time Series: Knee Procedures (by Submission Date)",
  y_label = "Knee Procedures"
)

print(p13)

print(p14)

print(p15)