Precipitation Variability vs. Water Quality

Author

Kathleen Kirsch

Pull in Data:

library(RCurl)
library(R.utils)
library(lubridate)
library(data.table)
library(raster)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(chirps)
library(reshape2)
library(ncdf4)
library(terra)
library(rworldmap)
library(ggpubr)
library(foreign)
library(MASS)

#Create path
path.rf <- file.path("/Users/kathleenkirsch/Desktop/Rscript")

#------Pull in WQ Data--------
###load files with wq test dates
DRCWQ = read.csv("https://api.mwater.co/v3/datagrids/df35cdf1fd06431ba787e7a500061dff/download?client=04fa81950005265980b0d25a763daab4&share=&extraFilters=%5B%5D&format=csv")
RwandaWQ = read.csv("https://api.mwater.co/v3/datagrids/4ec9dfd4014647eba7401f3d88cb6a64/download?client=04fa81950005265980b0d25a763daab4&share=&extraFilters=%5B%5D&format=csv")

#save the wq files as csv
write.csv(DRCWQ, "/Users/kathleenkirsch/Desktop/Rscript/DRCWQ.csv", row.names = F)
write.csv(RwandaWQ, "/Users/kathleenkirsch/Desktop/Rscript/RwandaWQ.csv", row.names = F)

##convert the wq data into a spatial object to pull data from raster stacks
# WQ files ####
DRCWQ_data = DRCWQ
DRCWQ_data$date2<- date(DRCWQ_data$Date)
head(DRCWQ_data)

RwandaWQ_data = RwandaWQ
RwandaWQ_data <- na.omit(RwandaWQ) #RwandaWQ has NAs for MPN and blank dates for a number of sites which messes with the date function
RwandaWQ_data$date2<- date(RwandaWQ_data$Date)
head(RwandaWQ_data)

##DRC WQ
DRCWQ_data$SiteID<-DRCWQ_data$Water.Site.ID.Bacteriological
DRCWQ_data<-DRCWQ_data[,c(9,8,2,4,6:7)] #select the relevant variables
names(DRCWQ_data)[2]<-"date"
coordinates(DRCWQ_data) <- ~ Longitude + Latitude 
crs(DRCWQ_data) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"

##Rwanda WQ
RwandaWQ_data$SiteID<-RwandaWQ_data$Water.Point.ID
names(RwandaWQ_data)
RwandaWQ_data<-RwandaWQ_data[,c(12,11,2,4,9:10)]
names(RwandaWQ_data)[2] <- "date"
coordinates(RwandaWQ_data) <- ~ Longitude + Latitude 
crs(RwandaWQ_data) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"

###simple sample plots on a map
#afr.cnt<-subset(getMap(), continent=="Africa")
drc<-getData("GADM", country = "Democratic Republic of the Congo", level = 1)
rwanda <- getData("GADM", country = "Rwanda", level = 1) #shapefile data is available from https://gadm.org/data.html


#----Pull in Meterological Data----------
#### SETTINGS ####
#---------------CHIRPS DRC---------------
# +dates of interest ####
#start_date <- floor_date(min(as.Date(DRCWQ_data$date)), "month") # "2022-03-01" - first date of the WQ data
#end_date <- as.Date(max(DRCWQ_data$date)) # "2023-05-01" - last date
lst.dates <- seq(as.Date("2022-01-01"),as.Date("2023-11-29"),by="days")

###CHIRPS rainfall product
length(DRCWQ_data$SiteID)

chirps_nc<-"/Users/kathleenkirsch/Desktop/Rscript/403cb493-f891-488f-86c8-b36cbdee0af1.nc" #read the downloaded spatially and temporally subset CHIRPS v2p datafile from ClimateSERV; https://climateserv.servirglobal.net/

chirps_nc_stk<-raster::stack(chirps_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
chirps_rf_wq<-raster::extract(chirps_nc_stk,DRCWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
chirps_rf_wq_df<-as.data.frame(chirps_rf_wq) #create dataframe from the extracted data
chirps_rf_wq_df2<-as.data.frame(t(chirps_rf_wq_df))  #transpose the df 
rownames(chirps_rf_wq_df2)<-lst.dates #rename the rows with the date vector created for the timeseries raster stack
chirps_rf_wq_df2<-rownames_to_column(chirps_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(chirps_rf_wq_df2)[2:189]<-DRCWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
chirps_rf_wq_df3<-reshape2::melt(chirps_rf_wq_df2,variable.name = "SiteID", value.name = "chirps_rf") #reshape the dataframe to from a wide table format to a long format
chirps_rf_wq_df3$chp30d<-frollsum(chirps_rf_wq_df3$chirps_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
chirps_rf_wq_df3$chp14d<-frollsum(chirps_rf_wq_df3$chirps_rf,14,fill = NA, align = "right")
chirps_rf_wq_df3$chp7d<-frollsum(chirps_rf_wq_df3$chirps_rf,7,fill = NA, align = "right")
chirps_rf_wq_df3$chp5d<-frollsum(chirps_rf_wq_df3$chirps_rf,5,fill = NA, align = "right")
chirps_rf_wq_df3$chp10d<-frollsum(chirps_rf_wq_df3$chirps_rf,10,fill = NA, align = "right")

##merge rainfall and WQ data
DRCWQ$date<-DRCWQ_data$date
DRCWQ$SiteID<-DRCWQ_data$SiteID
chirps_at_wq_DRC<-merge(DRCWQ,chirps_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates

View (chirps_at_wq_DRC) #KMKcheck

#outputc should look like this;

# SiteID       date            WSP Colony  Latitude Longitude chirps_rf     rf30d
# 1 236260568 2022-04-13 FOMI Tshitenge      0 -6.127290  23.67332  0.000000 140.51398
# 2 236260568 2022-04-13 FOMI Tshitenge      0 -6.127290  23.67332  0.000000 140.51398
# 3 236260568 2022-12-01 FOMI Tshitenge     65 -6.127290  23.67332  8.582757 229.46067
# 4 236260568 2022-12-01 FOMI Tshitenge     65 -6.127290  23.67332  8.582757 229.46067
# 5 236261208 2022-05-20          SADEL     13 -6.198532  23.65095  0.000000  98.08299
# 6 236261215 2022-05-20          SADEL     48 -6.192828  23.65280  0.000000  98.08299

#convert the "Colony" variable into a numerical field as some of the values have commas as thousands separator
chirps_at_wq_DRC$Colony <- as.numeric(gsub(",","",chirps_at_wq_DRC$Colony))

#write the output to a csv file
write.csv(chirps_at_wq_DRC, paste0(path.rf, "/output/chirps_at_wq_DRC.csv"),row.names = F)

#-----------CHIRPS RWANDA------------
# +dates of interest ####
#start_date <- floor_date(min(as.Date(RwandaWQ_data$date)), "month") # "summer 2023 start" - first date of the WQ data
#end_date <- as.Date(max(RwandaWQ_data$date)) # "ongoing" - last date

lst.dates <- seq(as.Date("2022-01-01"),as.Date("2023-11-29"),by="days")

###CHIRPS rainfall product
##check length of SiteIDs and update colnames:
length(RwandaWQ_data$SiteID)

chirps_nc<-"/Users/kathleenkirsch/Desktop/Rscript/2e6ac690-b564-42ca-b50d-8d0c31df8f0a.nc" #read the downloaded spatially and temporally subset CHIRPS v2p datafile from ClimateSERV; https://climateserv.servirglobal.net/

chirps_nc_stk<-raster::stack(chirps_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
chirps_rf_wq<-raster::extract(chirps_nc_stk,RwandaWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
chirps_rf_wq_df<-as.data.frame(chirps_rf_wq) #create dataframe from the extracted data
chirps_rf_wq_df2<-as.data.frame(t(chirps_rf_wq_df))  #transpose the df 
rownames(chirps_rf_wq_df2)<-lst.dates #rename the rows with the date vector created for the timeseries raster stack
chirps_rf_wq_df2<-rownames_to_column(chirps_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(chirps_rf_wq_df2)[2:1118]<-RwandaWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
chirps_rf_wq_df3<-reshape2::melt(chirps_rf_wq_df2,variable.name = "SiteID", value.name = "chirps_rf") #reshape the dataframe to from a wide table format to a long format
chirps_rf_wq_df3$chp30d<-frollsum(chirps_rf_wq_df3$chirps_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
chirps_rf_wq_df3$chp14d<-frollsum(chirps_rf_wq_df3$chirps_rf,14,fill = NA, align = "right")
chirps_rf_wq_df3$chp7d<-frollsum(chirps_rf_wq_df3$chirps_rf,7,fill = NA, align = "right")
chirps_rf_wq_df3$chp5d<-frollsum(chirps_rf_wq_df3$chirps_rf,5,fill = NA, align = "right")
chirps_rf_wq_df3$chp10d<-frollsum(chirps_rf_wq_df3$chirps_rf,10,fill = NA, align = "right")
tail(chirps_rf_wq_df3)
head(chirps_rf_wq_df3)
##merge rainfall and WQ data
RwandaWQ = na.omit(RwandaWQ)
RwandaWQ$date<-RwandaWQ_data$date

RwandaWQ$SiteID<-RwandaWQ_data$SiteID
View(RwandaWQ)
chirps_at_wq_Rwanda<-merge(RwandaWQ,chirps_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
head(chirps_at_wq_Rwanda)
View (chirps_at_wq_Rwanda) #KMKcheck

#convert the "MPN" variable into a numerical field as some of the values have commas as thousands separator
chirps_at_wq_Rwanda$MPN <- as.numeric(gsub(",","",chirps_at_wq_Rwanda$MPN))

#write the output to a csv file
write.csv(chirps_at_wq_Rwanda, paste0(path.rf, "/output/chirps_at_wq_Rwanda.csv"),row.names = F)

#-----------IMERG DRC------------------
###IMERG- late

lst.dates.img <- seq(as.Date("2022-01-01"),as.Date("2023-12-20"),by="days")

imerg_nc<-"/Users/kathleenkirsch/Desktop/Rscript/26ef7276-2bee-4e59-9f32-822aa32e9cd9.nc" #read the downloaded spatially and temporally subset imerg datafile
###NOTE: I pulled climateserv

imerg_nc_stk<-raster::stack(imerg_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
crs(imerg_nc_stk) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
imerg_rf_wq<-raster::extract(imerg_nc_stk,DRCWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
imerg_rf_wq_df<-as.data.frame(imerg_rf_wq) #create dataframe from the extracted data
imerg_rf_wq_df2<-as.data.frame(t(imerg_rf_wq_df))  #transpose the df 
rownames(imerg_rf_wq_df2)<-lst.dates.img #rename the rows with the date vector created for the timeseries raster stack
imerg_rf_wq_df2<-rownames_to_column(imerg_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(imerg_rf_wq_df2)[2:189]<-DRCWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
imerg_rf_wq_df3<-reshape2::melt(imerg_rf_wq_df2,variable.name = "SiteID", value.name = "imerg_rf") #reshape the dataframe to from a wide table format to a long format
imerg_rf_wq_df3$img30d<-frollsum(imerg_rf_wq_df3$imerg_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
imerg_rf_wq_df3$img14d<-frollsum(imerg_rf_wq_df3$imerg_rf,14,fill = NA, align = "right")
imerg_rf_wq_df3$img10d<-frollsum(imerg_rf_wq_df3$imerg_rf,10,fill = NA, align = "right")
imerg_rf_wq_df3$img7d<-frollsum(imerg_rf_wq_df3$imerg_rf,7,fill = NA, align = "right")
imerg_rf_wq_df3$img5d<-frollsum(imerg_rf_wq_df3$imerg_rf,5,fill = NA, align = "right")


##merge rainfall and WQ data
# DRCWQ$date<-DRCWQ_data$date
# DRCWQ$SiteID<-DRCWQ_data$SiteID

imerg_at_wq_DRC<-merge(DRCWQ,imerg_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates

View (imerg_at_wq_DRC) #KMKcheck

#outputc should look like this;

# SiteID       date            WSP Colony  Latitude Longitude     imerg_rf    rf30d     rf14d      rf10d
# 1 236260568 2022-04-13 FOMI Tshitenge      0 -6.127290  23.67332  0.008066831 114.2119  57.84766 42.2861472
# 2 236260568 2022-04-13 FOMI Tshitenge      0 -6.127290  23.67332  0.008066831 114.2119  57.84766 42.2861472
# 3 236260568 2022-12-01 FOMI Tshitenge     65 -6.127290  23.67332 28.959114075 249.6487 117.45070 94.3415748
# 4 236260568 2022-12-01 FOMI Tshitenge     65 -6.127290  23.67332 28.959114075 249.6487 117.45070 94.3415748
# 5 236261208 2022-05-20          SADEL     13 -6.198532  23.65095  0.007829102 162.7100  22.92562  0.8062411
# 6 236261215 2022-05-20          SADEL     48 -6.192828  23.65280  0.007829102 162.7100  22.92562  0.8062411
# rf7d         rf5d
# 1 33.084086833 16.332286296
# 2 33.084086833 16.332286296
# 3 94.216252863 56.426456034
# 4 94.216252863 56.426456034
# 5  0.007829102  0.007829102
# 6  0.007829102  0.007829102

#convert the "Colony" variable into a numerical field as some of the values have commas as thousands separator
imerg_at_wq_DRC$Colony <- as.numeric(gsub(",","",imerg_at_wq_DRC$Colony))

#write the output to a csv file
write.csv(imerg_at_wq_DRC, paste0(path.rf, "/output/imerg_at_wq_DRC.csv"),row.names = F)

#-----------IMERG RWANDA--------------
lst.dates.img <- seq(as.Date("2022-01-01"),as.Date("2023-12-20"),by="days")

imerg_nc<-"/Users/kathleenkirsch/Desktop/Rscript/8ca328ef-c421-4bc7-92c8-b76fefdaf1ae.nc" #read the downloaded spatially and temporally subset imerg datafile
###NOTE: I pulled climateserv

imerg_nc_stk<-raster::stack(imerg_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
crs(imerg_nc_stk) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
imerg_rf_wq<-raster::extract(imerg_nc_stk,RwandaWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
imerg_rf_wq_df<-as.data.frame(imerg_rf_wq) #create dataframe from the extracted data
imerg_rf_wq_df2<-as.data.frame(t(imerg_rf_wq_df))  #transpose the df 
rownames(imerg_rf_wq_df2)<-lst.dates.img #rename the rows with the date vector created for the timeseries raster stack
imerg_rf_wq_df2<-rownames_to_column(imerg_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(imerg_rf_wq_df2)[2:1118]<-RwandaWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
imerg_rf_wq_df3<-reshape2::melt(imerg_rf_wq_df2,variable.name = "SiteID", value.name = "imerg_rf") #reshape the dataframe to from a wide table format to a long format
imerg_rf_wq_df3$img30d<-frollsum(imerg_rf_wq_df3$imerg_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
imerg_rf_wq_df3$img14d<-frollsum(imerg_rf_wq_df3$imerg_rf,14,fill = NA, align = "right")
imerg_rf_wq_df3$img10d<-frollsum(imerg_rf_wq_df3$imerg_rf,10,fill = NA, align = "right")
imerg_rf_wq_df3$img7d<-frollsum(imerg_rf_wq_df3$imerg_rf,7,fill = NA, align = "right")
imerg_rf_wq_df3$img5d<-frollsum(imerg_rf_wq_df3$imerg_rf,5,fill = NA, align = "right")


##merge rainfall and WQ data
# RwandaWQ$date<-RwandaWQ_data$date
# RwandaWQ$SiteID<-RwandaWQ_data$SiteID

imerg_at_wq_Rwanda<-merge(RwandaWQ,imerg_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates

View (imerg_at_wq_Rwanda) #KMKcheck

#convert the "MPN" variable into a numerical field as some of the values have commas as thousands separator
imerg_at_wq_Rwanda$MPN <- as.numeric(gsub(",","",imerg_at_wq_Rwanda$MPN))

#write the output to a csv file
write.csv(imerg_at_wq_Rwanda, paste0(path.rf, "/output/imerg_at_wq_Rwanda.csv"),row.names = F)


#---------LIS DRC---------
###LIS- runoff
###Note: recommended site I pulled from also climateserv

lst.dates.lis <- seq(as.Date("2022-01-01"),as.Date("2023-12-24"),by="days")

lis_nc<-"/Users/kathleenkirsch/Desktop/Rscript/LIS-runoff-daily-01012022-24122023-corrected.nc" #read the downloaded spatially and temporally subset lis datafile downloaded from climateserv

lis_nc_stk<-raster::stack(lis_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
crs(lis_nc_stk) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
lis_rf_wq<-raster::extract(lis_nc_stk,DRCWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
lis_rf_wq_df<-as.data.frame(lis_rf_wq) #create dataframe from the extracted data
lis_rf_wq_df2<-as.data.frame(t(lis_rf_wq_df))  #transpose the df 
rownames(lis_rf_wq_df2)<-lst.dates.lis #rename the rows with the date vector created for the timeseries raster stack
lis_rf_wq_df2<-rownames_to_column(lis_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(lis_rf_wq_df2)[2:189]<-DRCWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
lis_rf_wq_df3<-reshape2::melt(lis_rf_wq_df2,variable.name = "SiteID", value.name = "lis_runoff") #reshape the dataframe to from a wide table format to a long format
lis_rf_wq_df3$lis30d<-frollsum(lis_rf_wq_df3$lis_runoff,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
lis_rf_wq_df3$lis14d<-frollsum(lis_rf_wq_df3$lis_runoff,14,fill = NA, align = "right")
lis_rf_wq_df3$lis0d<-frollsum(lis_rf_wq_df3$lis_runoff,10,fill = NA, align = "right")
lis_rf_wq_df3$lis7d<-frollsum(lis_rf_wq_df3$lis_runoff,7,fill = NA, align = "right")
lis_rf_wq_df3$lis5d<-frollsum(lis_rf_wq_df3$lis_runoff,5,fill = NA, align = "right")


##merge rainfall and WQ data
# DRCWQ$date<-DRCWQ_data$date
# DRCWQ$SiteID<-DRCWQ_data$SiteID
lis_at_wq_DRC<-merge(DRCWQ,lis_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates

View (lis_at_wq_DRC) #KMKcheck

#outputc should look like this;

# SiteID       date            WSP Colony  Latitude Longitude       lis_rf    rf30d     rf14d        rf10d
# 1 236260568 2022-04-13 FOMI Tshitenge      0 -6.127290  23.67332 9.531376e-07 15.96225  5.935595  4.913927514
# 2 236260568 2022-04-13 FOMI Tshitenge      0 -6.127290  23.67332 9.531376e-07 15.96225  5.935595  4.913927514
# 3 236260568 2022-12-01 FOMI Tshitenge     65 -6.127290  23.67332 9.570781e+00 60.24816 32.720138 27.717481035
# 4 236260568 2022-12-01 FOMI Tshitenge     65 -6.127290  23.67332 9.570781e+00 60.24816 32.720138 27.717481035
# 5 236261208 2022-05-20          SADEL     13 -6.198532  23.65095 1.400630e-07 49.43797  7.206170  0.009684263
# 6 236261215 2022-05-20          SADEL     48 -6.192828  23.65280 1.400630e-07 49.43797  7.206170  0.009684263
# rf7d         rf5d
# 1 4.344562e+00 2.335235e+00
# 2 4.344562e+00 2.335235e+00
# 3 2.770782e+01 1.694874e+01
# 4 2.770782e+01 1.694874e+01
# 5 1.400755e-07 1.400755e-07
# 6 1.400755e-07 1.400755e-07

#convert the "Colony" variable into a numerical field as some of the values have commas as thousands separator
lis_at_wq_DRC$Colony <- as.numeric(gsub(",","",lis_at_wq_DRC$Colony))

#write the output to a csv file
write.csv(lis_at_wq_DRC, paste0(path.rf, "/output/lis_at_wq_DRC.csv"),row.names = F)

#------LIS RWANDA-----------------

lst.dates.lis <- seq(as.Date("2022-01-01"),as.Date("2023-12-24"),by="days")

lis_nc<-"/Users/kathleenkirsch/Desktop/Rscript/LIS-runoff-daily-01012022-24122023-corrected.nc" #read the downloaded spatially and temporally subset lis datafile downloaded from climateserv

lis_nc_stk<-raster::stack(lis_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
crs(lis_nc_stk) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
lis_rf_wq<-raster::extract(lis_nc_stk,RwandaWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
lis_rf_wq_df<-as.data.frame(lis_rf_wq) #create dataframe from the extracted data
lis_rf_wq_df2<-as.data.frame(t(lis_rf_wq_df))  #transpose the df 
rownames(lis_rf_wq_df2)<-lst.dates.lis #rename the rows with the date vector created for the timeseries raster stack
lis_rf_wq_df2<-rownames_to_column(lis_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(lis_rf_wq_df2)[2:1118]<-RwandaWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
lis_rf_wq_df3<-reshape2::melt(lis_rf_wq_df2,variable.name = "SiteID", value.name = "lis_runoff") #reshape the dataframe to from a wide table format to a long format
lis_rf_wq_df3$lis30d<-frollsum(lis_rf_wq_df3$lis_runoff,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
lis_rf_wq_df3$lis14d<-frollsum(lis_rf_wq_df3$lis_runoff,14,fill = NA, align = "right")
lis_rf_wq_df3$lis0d<-frollsum(lis_rf_wq_df3$lis_runoff,10,fill = NA, align = "right")
lis_rf_wq_df3$lis7d<-frollsum(lis_rf_wq_df3$lis_runoff,7,fill = NA, align = "right")
lis_rf_wq_df3$lis5d<-frollsum(lis_rf_wq_df3$lis_runoff,5,fill = NA, align = "right")

##merge rainfall and WQ data
# RwandaWQ$date<-RwandaWQ_data$date
# RwandaWQ$SiteID<-RwandaWQ_data$SiteID
lis_at_wq_Rwanda<-merge(RwandaWQ,lis_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates

View (lis_at_wq_Rwanda) #KMKcheck


#convert the "MPN" variable into a numerical field as some of the values have commas as thousands separator
lis_at_wq_Rwanda$MPN <- as.numeric(gsub(",","",lis_at_wq_Rwanda$MPN))

#write the output to a csv file
write.csv(lis_at_wq_Rwanda, paste0(path.rf, "/output/lis_at_wq_Rwanda.csv"),row.names = F)


#------- TAMSAT DRC----------------
###TAMSAT rainfall product
lst.dates.tamsat <- seq(as.Date("2022-01-01"),as.Date("2023-12-20"),by="days")

tamsat_nc<-"/Users/kathleenkirsch/Desktop/Rscript/tamsat_01012022-20122023_daily.nc" #read the downloaded spatially and temporally subset tamsatap datafile downloaded 
#if you lose pull from https://iridl.ldeo.columbia.edu/SOURCES/.Reading/.Meteorology/.TAMSAT/.v3p1/.daily/.rfe/dataselection.html
#note: talk to Denis -- this file type needs preprocessing (ask how to use Climate Data Operators?)

tamsat_nc_stk<-raster::stack(tamsat_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
tamsat_rf_wq<-raster::extract(tamsat_nc_stk,DRCWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
tamsat_rf_wq_df<-as.data.frame(tamsat_rf_wq) #create dataframe from the extracted data
tamsat_rf_wq_df2<-as.data.frame(t(tamsat_rf_wq_df))  #transpose the df 
rownames(tamsat_rf_wq_df2)<-lst.dates.tamsat #rename the rows with the date vector created for the timeseries raster stack
tamsat_rf_wq_df2<-rownames_to_column(tamsat_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(tamsat_rf_wq_df2)[2:189]<-DRCWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
tamsat_rf_wq_df3<-reshape2::melt(tamsat_rf_wq_df2,variable.name = "SiteID", value.name = "tamsat_rf") #reshape the dataframe to from a wide table format to a long format
tamsat_rf_wq_df3$tam30d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
tamsat_rf_wq_df3$tam4d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,14,fill = NA, align = "right")
tamsat_rf_wq_df3$tam10d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,10,fill = NA, align = "right")
tamsat_rf_wq_df3$tamf7d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,7,fill = NA, align = "right")
tamsat_rf_wq_df3$tamf5d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,5,fill = NA, align = "right")


##merge rainfall and WQ data
# DRCWQ$date<-DRCWQ_data$date
# DRCWQ$SiteID<-DRCWQ_data$SiteID

tamsat_at_wq_DRC<-merge(DRCWQ,tamsat_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates

View (tamsat_at_wq_DRC) #KMKcheck

#outputc should look like this;

# SiteID       date            WSP Colony  Latitude Longitude tamsat_rf tam30d tam4d tam10d tamf7d tamf5d
# 1 236260568 2022-04-13 FOMI Tshitenge      0 -6.127290  23.67332         0  141.6  61.6   36.6   36.6   18.3
# 2 236260568 2022-04-13 FOMI Tshitenge      0 -6.127290  23.67332         0  141.6  61.6   36.6   36.6   18.3
# 3 236260568 2022-12-01 FOMI Tshitenge     65 -6.127290  23.67332         0  145.1  61.1   50.5   50.5   31.4
# 4 236260568 2022-12-01 FOMI Tshitenge     65 -6.127290  23.67332         0  145.1  61.1   50.5   50.5   31.4
# 5 236261208 2022-05-20          SADEL     13 -6.198532  23.65095         0   94.4  20.1    0.0    0.0    0.0
# 6 236261215 2022-05-20          SADEL     48 -6.192828  23.65280         0   94.4  20.1    0.0    0.0    0.0

#convert the "Colony" variable into a numerical field as some of the values have commas as thousands separator
tamsat_at_wq_DRC$Colony <- as.numeric(gsub(",","",tamsat_at_wq_DRC$Colony))

#write the output to a csv file
write.csv(tamsat_at_wq_DRC, paste0(path.rf, "/output/tamsat_at_wq_DRC.csv"),row.names = F)

#-------TAMSAT RWANDA---------
lst.dates.tamsat <- seq(as.Date("2022-01-01"),as.Date("2023-12-20"),by="days")

tamsat_nc<-"/Users/kathleenkirsch/Desktop/Rscript/tamsat_01012022-20122023_daily.nc" #read the downloaded spatially and temporally subset tamsatap datafile downloaded 
#if you lose pull from https://iridl.ldeo.columbia.edu/SOURCES/.Reading/.Meteorology/.TAMSAT/.v3p1/.daily/.rfe/dataselection.html
#note: talk to Denis -- this file type needs preprocessing (ask how to use Climate Data Operators?)

tamsat_nc_stk<-raster::stack(tamsat_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
tamsat_rf_wq<-raster::extract(tamsat_nc_stk,RwandaWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
tamsat_rf_wq_df<-as.data.frame(tamsat_rf_wq) #create dataframe from the extracted data
tamsat_rf_wq_df2<-as.data.frame(t(tamsat_rf_wq_df))  #transpose the df 
rownames(tamsat_rf_wq_df2)<-lst.dates.tamsat #rename the rows with the date vector created for the timeseries raster stack
tamsat_rf_wq_df2<-rownames_to_column(tamsat_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(tamsat_rf_wq_df2)[2:1118]<-RwandaWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
tamsat_rf_wq_df3<-reshape2::melt(tamsat_rf_wq_df2,variable.name = "SiteID", value.name = "tamsat_rf") #reshape the dataframe to from a wide table format to a long format
tamsat_rf_wq_df3$tam30d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
tamsat_rf_wq_df3$tam4d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,14,fill = NA, align = "right")
tamsat_rf_wq_df3$tam10d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,10,fill = NA, align = "right")
tamsat_rf_wq_df3$tamf7d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,7,fill = NA, align = "right")
tamsat_rf_wq_df3$tamf5d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,5,fill = NA, align = "right")


##merge rainfall and WQ data
# RwandaWQ$date<-RwandaWQ_data$date
# RwandaWQ$SiteID<-RwandaWQ_data$SiteID

tamsat_at_wq_Rwanda<-merge(RwandaWQ,tamsat_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates

View (tamsat_at_wq_Rwanda) #KMKcheck


#convert the "MPN" variable into a numerical field as some of the values have commas as thousands separator
tamsat_at_wq_Rwanda$MPN <- as.numeric(gsub(",","",tamsat_at_wq_Rwanda$MPN))

#write the output to a csv file
write.csv(tamsat_at_wq_Rwanda, paste0(path.rf, "/output/tamsat_at_wq_Rwanda.csv"),row.names = F)


#------MERGE DFs--------

rf_at_wq_DRC1 <- merge(chirps_at_wq_DRC,tamsat_at_wq_DRC,by=c("SiteID","date","WSP","Colony","Latitude","Longitude"))
rf_at_wq_DRC2 <- merge(imerg_at_wq_DRC,lis_at_wq_DRC,by=c("SiteID","date","WSP","Colony","Latitude","Longitude"))
rf_at_wq_DRC_all <- merge(rf_at_wq_DRC1,rf_at_wq_DRC2,by=c("SiteID","date","WSP","Colony","Latitude","Longitude"))

##export merged output
write.csv(rf_at_wq_DRC_all, "/Users/kathleenkirsch/Desktop/Rscript/output/rf_at_wq_DRC_all.csv",row.names = F)

rf_at_wq_Rwanda1 <- merge(chirps_at_wq_Rwanda,tamsat_at_wq_Rwanda,by=c("SiteID","date","School","MPN","Latitude","Longitude"))
rf_at_wq_Rwanda2 <- merge(imerg_at_wq_Rwanda,lis_at_wq_Rwanda,by=c("SiteID","date","School","MPN","Latitude","Longitude"))
rf_at_wq_Rwanda_all <- merge(rf_at_wq_Rwanda1,rf_at_wq_Rwanda2,by=c("SiteID","date","School","MPN","Latitude","Longitude"))

##export merged output
write.csv(rf_at_wq_Rwanda_all, "/Users/kathleenkirsch/Desktop/Rscript/output/rf_at_wq_Rwanda_all.csv",row.names = F)

Statistics:

```{# rf_at_wq_DRC_all <- read.csv(“/Users/kathleenkirsch/Desktop/Rscript/output/rf_at_wq_DRC_all.csv”) #can also (re)read the merged data file using this code}

names(rf_at_wq_DRC_all) #names of the dataframe so we know which variables we are working with for the modeling

[1] “SiteID” “date” “WSP” “Colony” “Latitude” “Longitude” “chirps_rf” “chp30d”

[9] “chp14d” “chp7d” “chp5d” “chp10d” “tamsat_rf” “tam30d” “tam4d” “tam10d”

[17] “tamf7d” “tamf5d” “imerg_rf” “img30d” “img14d” “img10d” “img7d” “img5d”

[25] “lis_runoff” “lis30d” “lis14d” “lis0d” “lis7d” “lis5d”

rf_at_wq_Rwanda_all <- read.csv(“/Users/kathleenkirsch/Desktop/Rscript/output/rf_at_wq_Rwanda_all.csv”) #can also (re)read the merged data file using this code

names(rf_at_wq_Rwanda_all) #names of the dataframe so we know which variables we are working with for the modeling

#——–Analysis———- #sample_drc_model <- lm(Colony~chp30d, data=rf_at_wq_DRC_all) #fit a simple linear model #summary(sample_drc_model)

#sample_drc_fe_model <- lm(Colony~chp30d+factor(SiteID)-1, data=rf_at_wq_DRC_all) #fit a simple linear model with “SiteID” as the fixed effect variable #summary(sample_drc_fe_model)

#sample_Rwanda_model <- lm(MPN~chp30d, data=rf_at_wq_Rwanda_all) #fit a simple linear model #summary(sample_Rwanda_model)

#sample_Rwanda_fe_model <- lm(MPN~chp30d+factor(SiteID)-1, data=rf_at_wq_Rwanda_all) #fit a simple linear model with “SiteID” as the fixed effect variable #summary(sample_Rwanda_fe_model)

#regression of rainfall versus water quality #check correlation coefficient to assess linearity coefficientDRC <- cor.test(rf_at_wq_DRC_all$Colony, rf_at_wq_DRC_all$imerg_rf) coefficientDRC$estimate

coefficientRwanda <- cor.test(rf_at_wq_Rwanda_all$MPN, rf_at_wq_Rwanda_all$imerg_rf) coefficientRwanda$estimate

#linear regression model lmmodelDRC <- lm(Colony ~ imerg_rf, data = rf_at_wq_DRC_all)

lmmodelRwanda <- lm(MPN ~ imerg_rf, data = rf_at_wq_Rwanda_all)

#Check residual standard error summary(lmmodelDRC) summary(lmmodelRwanda)

#Check R2 summary(lmmodelDRC)$r.squared summary(lmmodelRwanda)$r.squared

###scatterplot basic #ggscatter(rf_at_wq_DRC_all, x = “Colony”, y = “imerg_rf”, add = “reg.line”) + # stat_cor(label.x = 3, label.y = 34) + # stat_regline_equation(label.x = 3, label.y = 32)

#ggscatter(rf_at_wq_Rwanda_all, x = “MPN”, y = “imerg_rf”, add = “reg.line”) + # stat_cor(label.x = 3, label.y = 34) + # stat_regline_equation(label.x = 3, label.y = 32)

#—-Scatterplots

#scatterplot: CHIRPS DRC ggscatter(rf_at_wq_DRC_all, x = “chirps_rf”, y = “Colony”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “CHIRPS Water Quality Plot: DRC”, xlab = “Rainfall (mm)”, ylab = “Thermotolerant coliforms (CFU/100ml)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) ) + stat_cor( aes(label = paste(..rr.label.., ..p.label.., sep = “_,”)), label.x = 3 )

#scatterplot: IMERG DRC ggscatter(rf_at_wq_DRC_all, x = “imerg_rf”, y = “Colony”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “IMERG Water Quality Plot: DRC”, xlab = “Rainfall (mm)”, ylab = “Thermotolerant coliforms (CFU/100ml)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) ) + stat_cor( aes(label = paste(..rr.label.., ..p.label.., sep = “_,”)), label.x = 3 )

#scatterplot: LIS DRC ggscatter(rf_at_wq_DRC_all, x = “lis_runoff”, y = “Colony”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “LIS Water Quality Plot: DRC”, xlab = “Runoff (mm)”, ylab = “Thermotolerant coliforms (CFU/100ml)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) ) + stat_cor( aes(label = paste(..rr.label.., ..p.label.., sep = “_,”)), label.x = 3 )

#scatterplot: TAMSAT DRC ggscatter(rf_at_wq_DRC_all, x = “tamsat_rf”, y = “Colony”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “TAMSAT Water Quality Plot: DRC”, xlab = “Rainfall (mm)”, ylab = “Thermotolerant coliforms (CFU/100ml)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) ) + stat_cor( aes(label = paste(..rr.label.., ..p.label.., sep = “_,”)), label.x = 3 )

#scatterplot: CHIRPS Rwanda ggscatter(rf_at_wq_Rwanda_all, x = “chirps_rf”, y = “MPN”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “CHIRPS Water Quality Plot: Rwanda”, xlab = “Rainfall (mm)”, ylab = “E.coli (MPN)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) )

#scatterplot: IMERG Rwanda ggscatter(rf_at_wq_Rwanda_all, x = “imerg_rf”, y = “MPN”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “IMERG Water Quality Plot: Rwanda”, xlab = “Rainfall (mm)”, ylab = “E.coli (MPN)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) )

#scatterplot: LIS Rwanda ggscatter(rf_at_wq_Rwanda_all, x = “lis_runoff”, y = “MPN”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “LIS Water Quality Plot: Rwanda”, xlab = “Runoff (mm)”, ylab = “E.coli (MPN)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) )

#scatterplot: TAMSAT Rwanda ggscatter(rf_at_wq_Rwanda_all, x = “tamsat_rf”, y = “MPN”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “TAMSAT Water Quality Plot: Rwanda”, xlab = “Rainfall (mm)”, ylab = “E.coli (MPN)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) )

#—-Negative Binomial Regression m1DRC <- glm.nb(Colony ~ imerg_rf, data=rf_at_wq_DRC_all) summary(m1DRC)

m2DRC <- glm.nb(Colony ~ imerg_rf + SiteID, data=rf_at_wq_DRC_all) #Trying to account here for multiple samples at same pt fixed effects, but this returns error due to “NA/NaN/Inf in ‘x’” but there are no NA – check with Evan. summary(m2DRC)

m1Rwanda <- glm.nb(MPN ~ imerg_rf, data=rf_at_wq_Rwanda_all) #Trying to account here for multiple samples at same pt fixed effects, but this returns error due to X. I believe this is either due to high AIC or theta, but adding control: control = glm.control(maxit = 500), init.theta = 1.0 did not correct this. So it may be an integer issue? Check with Evan. summary(m1Rwanda)

m2Rwanda <- glm.nb(MPN ~ imerg_rf + SiteID, data=rf_at_wq_Rwanda_all) summary(m2Rwanda)

```

Mapping