# Load required libraries
pacman::p_load(pacman, tidyverse, lubridate, stringr, gridExtra, grid)

# Read the CSV file
df <- read_csv("salinization_2021_2022_en.csv")
## Rows: 60 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): Station, River, Province, The distance between river and station (...
## dbl  (2): 21/03 - 31/03, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the distance column for easier handling
df <- df %>%
  rename(Distance_km = `The distance between river and station (km)`)

# Convert Distance_km to numeric, coercing non-numeric values (e.g., "Nội đồng", "-") to NA
df <- df %>%
  mutate(Distance_km = as.numeric(Distance_km))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Distance_km = as.numeric(Distance_km)`.
## Caused by warning:
## ! NAs introduced by coercion
# Convert time period columns to character to avoid issues during pivoting
df <- df %>%
  mutate(across(contains("-"), as.character))

# Pivot the data to long format and create necessary columns
df_long <- df %>%
  pivot_longer(
    cols = contains("-"),
    names_to = "Time_Period",
    values_to = "Salinity"
  ) %>%
  mutate(
    Start_Date = str_extract(Time_Period, "^\\d{2}/\\d{2}"),  # Extract MM/DD
    Start_Date = paste0(Start_Date, "/", Year),               # Append year
    Start_Date = dmy(Start_Date),                             # Convert to date
    Salinity = as.numeric(Salinity)                           # Convert salinity to numeric
  ) %>%
  filter(!is.na(Salinity))  # Remove rows with missing salinity values
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Start_Date = dmy(Start_Date)`.
## Caused by warning:
## !  36 failed to parse.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Prepare data for seasonal patterns
df_seasonal <- df_long %>%
  mutate(Month = month(Start_Date, label = TRUE)) %>%
  group_by(Month) %>%
  summarise(Avg_Salinity = mean(Salinity, na.rm = TRUE))

df_long <- df_long %>%
  filter(!is.na(Start_Date))

# Prepare data for Plot 1
df_january_river <- df_long %>%
  filter(month(Start_Date) == 1) %>%
  mutate(Year = factor(year(Start_Date), levels = c("2021", "2022"))) %>%
  group_by(River, Year) %>%
  summarise(Avg_Salinity = mean(Salinity, na.rm = TRUE), .groups = "drop")

# Order rivers by overall average salinity
df_river_order <- df_january_river %>%
  group_by(River) %>%
  summarise(Overall_Avg_Salinity = mean(Avg_Salinity, na.rm = TRUE)) %>%
  arrange(desc(Overall_Avg_Salinity))

df_january_river$River <- factor(df_january_river$River, levels = df_river_order$River)

# Define caption for all plots
caption <- "Source: National Centre for Hydro-Meteorological Forecasting"

# Create Plot 1
plot1 <- ggplot(df_january_river, aes(x = River, y = Avg_Salinity, fill = Year)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    title = "Average Salinity by River (January 2021 and 2022)",
    subtitle = "Plot 1",
    x = "River",
    y = "Average Salinity (g/l)", caption = caption,
    fill = "Year"
  ) +
  scale_fill_manual(values = c("2021" = "#1b9e77", "2022" = "#d95f02")) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )


# Plot 2
df_distance <- df_long %>%
  filter(!is.na(Distance_km)) %>%  # Remove rows where Distance_km is NA
  mutate(
    Distance_Group = cut(
      Distance_km,
      breaks = c(0, 20, 40, 60, 80, 100, Inf),
      labels = c("0-20 km", "21-40 km", "41-60 km", "61-80 km", "81-100 km", ">100 km"),
      right = FALSE
    )
  ) %>%
  group_by(Distance_Group) %>%
  summarise(Avg_Salinity = mean(Salinity, na.rm = TRUE))

plot2 <- ggplot(df_distance, aes(x = Distance_Group, y = Avg_Salinity)) +
  geom_bar(stat = "identity", fill = "lightgreen", color = "black") +
  labs(
    title = "Average Salinity by Distance from River Mouth",
    subtitle = "Plot 2",
    x = "Distance from Measuring Station to River Mouth (km)",
    y = "Average Salinity (g/l)",
    caption = caption
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.text.x = element_text(size = 10)
  )

# Prepare data for Plot 3
df_january_distance <- df_long %>%
  filter(month(Start_Date) == 1 & !is.na(Distance_km)) %>%
  mutate(
    Distance_Group = cut(
      Distance_km,
      breaks = c(0, 20, 40, 60, 80, 100, Inf),
      labels = c("0-20 km", "21-40 km", "41-60 km", "61-80 km", "81-100 km", ">100 km"),
      right = FALSE
    ),
    Year = factor(year(Start_Date), levels = c("2021", "2022"))
  ) %>%
  group_by(Distance_Group, Year) %>%
  summarise(Avg_Salinity = mean(Salinity, na.rm = TRUE), .groups = "drop")

# Create Plot 3
plot3 <- ggplot(df_january_distance, aes(x = Distance_Group, y = Avg_Salinity, fill = Year)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    title = "Average Salinity by Distance from River Mouth (Jan 2021 & 2022)",
    subtitle = "Plot 3",
    x = "Distance from Measuring Station to River Mouth (km)",
    y = "Average Salinity (g/l)",
    caption = caption,
    fill = "Year"
  ) +
  scale_fill_manual(values = c("2021" = "#1b9e77", "2022" = "#d95f02")) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.text.x = element_text(angle = 0, hjust = 1, size = 10)
  )

# Create the scatter plot with regression line (plot 4)
plot4 <- ggplot(df_long, aes(x = Distance_km, y = Salinity)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Salinity vs. Distance from Measuring Station to River Mouth (Estuary)",
    subtitle = "Plot 4",
    x = "Distance (km)",
    y = "Salinity (g/l)", caption = caption
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )

# Prepare data with year
df_long <- df_long %>%
  mutate(Year = year(Start_Date))

# Create the faceted scatter plot (5)
plot5 <- ggplot(df_long, aes(x = Distance_km, y = Salinity)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  facet_wrap(~ Year) +
  labs(
    title = "Salinity vs. Distance from Measuring Station to River Mouth (Estuary) by Year",
    subtitle = "Plot 5",
    x = "Distance (km)",
    y = "Salinity (g/l)", caption = caption
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )

#grid.arrange(
 # plot1, plot2, plot3,
  #layout_matrix = rbind(c(1, 1),  # plot1 spans two columns
   #                     c(2, 3)), # plot2 and plot3 side-by-side
  #top = textGrob("Saline Intrusion in the Southern Region, Vietnam (Mekong Delta) in 2021 and in 2022: Bar charts",
   #              gp = gpar(fontsize = 14, fontface = "bold")),
  #bottom = textGrob("Source: National Centre for Hydro-Meteorological Forecasting",
   #                 gp = gpar(fontsize = 10))
#)

#grid.arrange(plot4, plot5,
 #            nrow = 2,
  #           top = textGrob("Saline Intrusion in the Southern Region, Vietnam (Mekong Delta) in 2021 and in 2022: Scatter plots #with regression line", gp = gpar(fontsize = 14, fontface = "bold")),
 #            bottom = textGrob("Source: National Centre for Hydro-Meteorological Forecasting", gp = gpar(fontsize = 10)))

print (plot1)

print (plot2)

print (plot3)

print (plot4)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 57 rows containing missing values or values outside the scale range
## (`geom_point()`).

print (plot5)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 57 rows containing missing values or values outside the scale range
## (`geom_point()`).