R Final Project - Chloride Report

Author

Zirui Zheng

Published

February 12, 2025

R Spatial final project

Step 1: Import libraries and data path

Show the code
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Show the code
library(ggplot2)

## Reference: Turn off warning method from https://stackoverflow.com/questions/16194212/how-to-suppress-warnings-globally-in-an-r-script
options(warn=-1) ## Turn off the warning when generate graphs

data_dir <- file.path(getwd(), 'chloride_data')

Step 2: Read Chloride file

Show the code
## The csv file has been manually exported from separated sheet.
## Read in the cl concentration & cl limit
cl_df <- read.csv(file.path(data_dir, 'Data-Table 1.csv'))

## Adjust the date format, X is the actual date column name, we replaced with a meaningful name
colnames(cl_df)[1] <- "Chloride_measured_dates"

Step 3: Make a function for Plotting Production Wells

Show the code
## Reference: paste function comes from https://www.r-bloggers.com/2022/08/r-program-to-concatenate-two-strings/
plot_production_well_func <- function(well_number, cl_df, data_dir, year_range, graph_type){
  chloride_limit_col = paste("Well." , well_number, sep = "")
  chloride_concentration_col = paste("Well.", well_number, ".CL", sep = "")
  chloride_loss_file_name = paste("Cl Well ", well_number, "-Table 1.csv", sep = "")
  
  ## Read the data in production well data from cl_df
  well_df <- cl_df %>%
                      select(all_of(chloride_limit_col), all_of(chloride_concentration_col), Chloride_measured_dates)

  ## Read in Chloride losses data
  well_losses_df <- read.csv(file.path(data_dir, chloride_loss_file_name))
  colnames(well_losses_df)[1] <- "Chloride_measured_dates"
  
  
  ## Adjust as.date format
well_df$Chloride_measured_dates <- as.Date(well_df$Chloride_measured_dates, format = "%m/%d/%y") 
well_df <- well_df %>% 
        filter(!is.na(Chloride_measured_dates))

well_losses_df$Chloride_measured_dates <- as.Date(well_losses_df$Chloride_measured_dates, format = "%m/%d/%y") 
well_losses_df <- well_losses_df %>% 
        filter(!is.na(Chloride_measured_dates))

  
  ## Merge columns of cl concentration, cl limit and cl loss into well_df
  well_df <- merge(well_df, well_losses_df, by = c("Chloride_measured_dates", chloride_concentration_col), all = TRUE)

  
  # Clean NAs from the file
  # Reference: Get the method of filtering by passing a string column name from https://stackoverflow.com/questions/71875366/how-do-you-filter-by-passing-a-string-as-a-column-name-in-a-user-defined-functio
  well_df <- well_df %>%
                      filter(!is.na(.[[chloride_limit_col]]) & !is.na(.[[chloride_concentration_col]]))
  
  
  ## Convert the values to numeric to fit the 'scale_y_continuous' function
  well_df[[chloride_concentration_col]] <- as.numeric(well_df[[chloride_concentration_col]])
  
  well_df[[chloride_limit_col]] <- as.numeric(well_df[[chloride_limit_col]])

  
  
  
  
## Plot the data
## Reference: Use scale_x/y_continous from https://medium.com/@bao.character/how-to-use-scale-y-continuous-in-ggplot2-library-in-r-ca233b06fe82#:~:text=The%20scale_y_continuous()%20function%20in,breaks%2C%20labels%2C%20and%20transformations.
## Reference: Use method = "loess" from https://www.datanovia.com/en/blog/how-to-plot-a-smooth-line-using-ggplot2/
## Reference: Use aes_string from https://stackoverflow.com/questions/19826352/pass-character-strings-to-ggplot2-within-a-function
## Reference: Use .data[[]] to retrieve data from df with string variable from https://stackoverflow.com/questions/28897577/what-is-the-difference-between-aes-and-aes-string-ggplot2-in-r

  
  
  ## Adjust different settings and graph title for period of record AND five-year period
  if (graph_type == 'record'){
    span_number <- 0.034
    graph_title <- 'Period of Record: Production Well '
    year_break <- "5 years"
    limit_plot_func <- function(chloride_limit_col) {
      geom_point(aes(y = .data[[chloride_limit_col]],
                     color = "CL Limit"),
             size = 1
             ) 
    }
  }else {
    span_number <- 0.310
    graph_title <- 'Five-Year Period -  Production Well '
    year_break <- "year"
    limit_plot_func <- function(chloride_limit_col) {
      geom_line(aes(y = .data[[chloride_limit_col]], 
                    color = "CL Limit")
             ) 
    }
  }
  
  ## Reference: pmin function to get the minimun values among two columns in a df from:https://stackoverflow.com/questions/21977353/get-the-min-of-two-columns
  ## Reference: scale_color_manual method and its usage from https://stackoverflow.com/questions/74392543/how-to-add-a-legend-to-ggplot-in-r
  
  y_min <- max(0, pmin(well_df[[chloride_concentration_col]], well_df[[chloride_limit_col]]) - 100)
  y_max <- pmax(well_df[[chloride_concentration_col]], well_df[[chloride_limit_col]]) + 100
  y_range <- c(y_min, y_max)

ggplot(well_df, aes(x = Chloride_measured_dates)) +
  limit_plot_func(chloride_limit_col) +
  geom_point(aes(y = .data[[chloride_concentration_col]],
                        color = 'Chloride Concentration'),
             shape = 23,
             fill = 'lightblue'
             ) + 
  geom_smooth(aes(y = LOESS.2023, 
                  color = "LOESS"
                  ),
              method = "loess",
              span = span_number,
              se = F
            ) +  
  scale_y_continuous(limits = y_range) +  # set y range
  scale_x_date(limits = as.Date(year_range),
               breaks = year_break,
               date_labels = "%b-%Y"
               ) +  # set x range
  labs(
    x = "Date", 
    y = "Chloride Concentration (mg/L)", 
    title = paste (graph_title, well_number, sep = ""),
    color = "symbol"
  ) +
  scale_color_manual(values = c("Chloride Concentration" = "blue",
                                "LOESS" = "purple",
                                "CL Limit" = "red"))
}

Step 3.1: Plot Production Well & Five-Year period for Product Well 2

Show the code
plot_production_well_func(2, cl_df, data_dir, c("1978-01-01", "2023-01-01"), "record")
`geom_smooth()` using formula = 'y ~ x'

Show the code
plot_production_well_func(2, cl_df, data_dir, c("2018-01-01", "2023-01-01"), "five_year")
`geom_smooth()` using formula = 'y ~ x'

Step 3.2: Plot Production Well & Five-Year period for Product Well 3

Show the code
plot_production_well_func(3, cl_df, data_dir, c("1977-11-01", "2022-12-31"), "record")
`geom_smooth()` using formula = 'y ~ x'

Show the code
plot_production_well_func(3, cl_df, data_dir, c("2017-11-01", "2022-12-31"), "five_year")
`geom_smooth()` using formula = 'y ~ x'

Step 3.3: Plot Production Well & Five-Year period for Product Well 6

Show the code
plot_production_well_func(6, cl_df, data_dir, c("1977-10-01", "2023-12-31"), "record")
`geom_smooth()` using formula = 'y ~ x'

Show the code
plot_production_well_func(6, cl_df, data_dir, c("2016-10-01", "2023-12-31"), "five_year")
`geom_smooth()` using formula = 'y ~ x'

Step 3.4: Plot Production Well & Five-Year period for Product Well 7

Show the code
plot_production_well_func(7, cl_df, data_dir, c("2000-03-01", "2023-12-31"), "record")
`geom_smooth()` using formula = 'y ~ x'

Show the code
plot_production_well_func(7, cl_df, data_dir, c("2016-09-01", "2023-12-31"), "five_year")
`geom_smooth()` using formula = 'y ~ x'

Step 4.1: Table Kendall’s tau b for Production Well 2

Show the code
## Read mw files for creating the table chart
MW_df <- cl_df %>%
          select(MW.CHL, CHL, CHL.1, CHL.2, CHL.3)
      
MW_df$MW.CHL <- as.Date(MW_df$MW.CHL, format = "%m/%d/%y") 
MW_df <- MW_df %>% 
        filter(!is.na(MW.CHL))

colnames(MW_df) <- c('Date', 'mw4', 'mw20','mw21','mw25')

MW_df$Date <- as.numeric(MW_df$Date)
MW_df$mw4 <- as.numeric(MW_df$mw4)
MW_df$mw20 <- as.numeric(MW_df$mw20)
MW_df$mw21 <- as.numeric(MW_df$mw21)
MW_df$mw25 <- as.numeric(MW_df$mw25)

## Do the cor test.
## Reference: correlation coefficient https://stackoverflow.com/questions/69594298/r-calculate-the-correlation-coefficient
## Reference: cor.test comes from https://www.geeksforgeeks.org/kendall-correlation-testing-in-r-programming/#
## Reference: na.omit to count the non NAs in column from https://stackoverflow.com/questions/15704665/simple-method-of-counting-non-nas-in-column-of-data-string

test_func <- function(x, y){
  result <- cor.test(x, y, method = 'kendall')
  return (c(
    round(result$estimate, 3), 
    round(result$p.value,3), length(na.omit(y))))
}

## Apply the functioin
table_mw <- sapply(MW_df, function(y){
  test_func(MW_df$Date, y)
})

## Didable scientific notation
## Reference: https://stackoverflow.com/questions/25946047/how-to-prevent-scientific-notation-in-r
options(scipen = 999)
rownames(table_mw) <- c("Correlation-coffecient", "Sig (1 sided)", "N")

print(table_mw)
                       Date     mw4   mw20    mw21    mw25
Correlation-coffecient    1   0.165   0.69  -0.301   0.487
Sig (1 sided)             0   0.000   0.00   0.000   0.000
N                       360 322.000 322.00 321.000 347.000