# Load required libraries
pacman::p_load(pacman, tidyverse, lubridate, stringr, gridExtra, grid)
# Read the CSV file
df <- read_csv("salinization_2021_2022_en.csv")
## Rows: 60 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): Station, River, Province, The distance between river and station (...
## dbl (2): 21/03 - 31/03, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the distance column for easier handling
df <- df %>%
rename(Distance_km = `The distance between river and station (km)`)
# Convert Distance_km to numeric, coercing non-numeric values (e.g., "Nội đồng", "-") to NA
df <- df %>%
mutate(Distance_km = as.numeric(Distance_km))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Distance_km = as.numeric(Distance_km)`.
## Caused by warning:
## ! NAs introduced by coercion
# Convert time period columns to character to avoid issues during pivoting
df <- df %>%
mutate(across(contains("-"), as.character))
# Pivot the data to long format and create necessary columns
df_long <- df %>%
pivot_longer(
cols = contains("-"),
names_to = "Time_Period",
values_to = "Salinity"
) %>%
mutate(
Start_Date = str_extract(Time_Period, "^\\d{2}/\\d{2}"), # Extract MM/DD
Start_Date = paste0(Start_Date, "/", Year), # Append year
Start_Date = dmy(Start_Date), # Convert to date
Salinity = as.numeric(Salinity) # Convert salinity to numeric
) %>%
filter(!is.na(Salinity)) # Remove rows with missing salinity values
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Start_Date = dmy(Start_Date)`.
## Caused by warning:
## ! 36 failed to parse.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Prepare data for seasonal patterns
df_seasonal <- df_long %>%
mutate(Month = month(Start_Date, label = TRUE)) %>%
group_by(Month) %>%
summarise(Avg_Salinity = mean(Salinity, na.rm = TRUE))
df_long <- df_long %>%
filter(!is.na(Start_Date))
# Prepare data for Plot 1
df_january_river <- df_long %>%
filter(month(Start_Date) == 1) %>%
mutate(Year = factor(year(Start_Date), levels = c("2021", "2022"))) %>%
group_by(River, Year) %>%
summarise(Avg_Salinity = mean(Salinity, na.rm = TRUE), .groups = "drop")
# Order rivers by overall average salinity
df_river_order <- df_january_river %>%
group_by(River) %>%
summarise(Overall_Avg_Salinity = mean(Avg_Salinity, na.rm = TRUE)) %>%
arrange(desc(Overall_Avg_Salinity))
df_january_river$River <- factor(df_january_river$River, levels = df_river_order$River)
# Define caption for all plots
caption <- "Source: National Centre for Hydro-Meteorological Forecasting"
# Create Plot 1
plot1 <- ggplot(df_january_river, aes(x = River, y = Avg_Salinity, fill = Year)) +
geom_bar(stat = "identity", position = "stack") +
labs(
title = "Average Salinity by River (January 2021 and 2022)",
subtitle = "Plot 1",
x = "River",
y = "Average Salinity (g/l)", caption = caption,
fill = "Year"
) +
scale_fill_manual(values = c("2021" = "#1b9e77", "2022" = "#d95f02")) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
)
# Plot 2
df_distance <- df_long %>%
filter(!is.na(Distance_km)) %>% # Remove rows where Distance_km is NA
mutate(
Distance_Group = cut(
Distance_km,
breaks = c(0, 20, 40, 60, 80, 100, Inf),
labels = c("0-20 km", "21-40 km", "41-60 km", "61-80 km", "81-100 km", ">100 km"),
right = FALSE
)
) %>%
group_by(Distance_Group) %>%
summarise(Avg_Salinity = mean(Salinity, na.rm = TRUE))
plot2 <- ggplot(df_distance, aes(x = Distance_Group, y = Avg_Salinity)) +
geom_bar(stat = "identity", fill = "lightgreen", color = "black") +
labs(
title = "Average Salinity by Distance from River Mouth",
subtitle = "Plot 2",
x = "Distance from Measuring Station to River Mouth (km)",
y = "Average Salinity (g/l)",
caption = caption
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.text.x = element_text(size = 10)
)
# Prepare data for Plot 3
df_january_distance <- df_long %>%
filter(month(Start_Date) == 1 & !is.na(Distance_km)) %>%
mutate(
Distance_Group = cut(
Distance_km,
breaks = c(0, 20, 40, 60, 80, 100, Inf),
labels = c("0-20 km", "21-40 km", "41-60 km", "61-80 km", "81-100 km", ">100 km"),
right = FALSE
),
Year = factor(year(Start_Date), levels = c("2021", "2022"))
) %>%
group_by(Distance_Group, Year) %>%
summarise(Avg_Salinity = mean(Salinity, na.rm = TRUE), .groups = "drop")
# Create Plot 3
plot3 <- ggplot(df_january_distance, aes(x = Distance_Group, y = Avg_Salinity, fill = Year)) +
geom_bar(stat = "identity", position = "stack") +
labs(
title = "Average Salinity by Distance from River Mouth (Jan 2021 & 2022)",
subtitle = "Plot 3",
x = "Distance from Measuring Station to River Mouth (km)",
y = "Average Salinity (g/l)",
caption = caption,
fill = "Year"
) +
scale_fill_manual(values = c("2021" = "#1b9e77", "2022" = "#d95f02")) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.text.x = element_text(angle = 0, hjust = 1, size = 10)
)
# Create the scatter plot with regression line (plot 4)
plot4 <- ggplot(df_long, aes(x = Distance_km, y = Salinity)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "red") +
labs(
title = "Salinity vs. Distance from Measuring Station to River Mouth (Estuary)",
subtitle = "Plot 4",
x = "Distance (km)",
y = "Salinity (g/l)", caption = caption
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
)
# Prepare data with year
df_long <- df_long %>%
mutate(Year = year(Start_Date))
# Create the faceted scatter plot (5)
plot5 <- ggplot(df_long, aes(x = Distance_km, y = Salinity)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "red") +
facet_wrap(~ Year) +
labs(
title = "Salinity vs. Distance from Measuring Station to River Mouth (Estuary) by Year",
subtitle = "Plot 5",
x = "Distance (km)",
y = "Salinity (g/l)", caption = caption
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
)
#grid.arrange(
# plot1, plot2, plot3,
#layout_matrix = rbind(c(1, 1), # plot1 spans two columns
# c(2, 3)), # plot2 and plot3 side-by-side
#top = textGrob("Saline Intrusion in the Southern Region, Vietnam (Mekong Delta) in 2021 and in 2022: Bar charts",
# gp = gpar(fontsize = 14, fontface = "bold")),
#bottom = textGrob("Source: National Centre for Hydro-Meteorological Forecasting",
# gp = gpar(fontsize = 10))
#)
#grid.arrange(plot4, plot5,
# nrow = 2,
# top = textGrob("Saline Intrusion in the Southern Region, Vietnam (Mekong Delta) in 2021 and in 2022: Scatter plots #with regression line", gp = gpar(fontsize = 14, fontface = "bold")),
# bottom = textGrob("Source: National Centre for Hydro-Meteorological Forecasting", gp = gpar(fontsize = 10)))
print (plot1)

print (plot2)

print (plot3)

print (plot4)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 57 rows containing missing values or values outside the scale range
## (`geom_point()`).

print (plot5)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 57 rows containing missing values or values outside the scale range
## (`geom_point()`).
