Precipitation Variability vs. Water Quality
Pull in Data:
library(RCurl)
library(R.utils)
library(lubridate)
library(data.table)
library(raster)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(chirps)
library(reshape2)
library(ncdf4)
library(terra)
library(rworldmap)
library(ggpubr)
library(foreign)
library(MASS)
#Create path
path.rf <- file.path("/Users/kathleenkirsch/Desktop/Rscript")
#------Pull in WQ Data--------
###load files with wq test dates
DRCWQ = read.csv("https://api.mwater.co/v3/datagrids/df35cdf1fd06431ba787e7a500061dff/download?client=04fa81950005265980b0d25a763daab4&share=&extraFilters=%5B%5D&format=csv")
RwandaWQ = read.csv("https://api.mwater.co/v3/datagrids/4ec9dfd4014647eba7401f3d88cb6a64/download?client=04fa81950005265980b0d25a763daab4&share=&extraFilters=%5B%5D&format=csv")
#save the wq files as csv
write.csv(DRCWQ, "/Users/kathleenkirsch/Desktop/Rscript/DRCWQ.csv", row.names = F)
write.csv(RwandaWQ, "/Users/kathleenkirsch/Desktop/Rscript/RwandaWQ.csv", row.names = F)
##convert the wq data into a spatial object to pull data from raster stacks
# WQ files ####
DRCWQ_data = DRCWQ
DRCWQ_data$date2<- date(DRCWQ_data$Date)
head(DRCWQ_data)
RwandaWQ_data = RwandaWQ
RwandaWQ_data <- na.omit(RwandaWQ) #RwandaWQ has NAs for MPN and blank dates for a number of sites which messes with the date function
RwandaWQ_data$date2<- date(RwandaWQ_data$Date)
head(RwandaWQ_data)
##DRC WQ
DRCWQ_data$SiteID<-DRCWQ_data$Water.Site.ID.Bacteriological
DRCWQ_data<-DRCWQ_data[,c(9,8,2,4,6:7)] #select the relevant variables
names(DRCWQ_data)[2]<-"date"
coordinates(DRCWQ_data) <- ~ Longitude + Latitude
crs(DRCWQ_data) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
##Rwanda WQ
RwandaWQ_data$SiteID<-RwandaWQ_data$Water.Point.ID
names(RwandaWQ_data)
RwandaWQ_data<-RwandaWQ_data[,c(12,11,2,4,9:10)]
names(RwandaWQ_data)[2] <- "date"
coordinates(RwandaWQ_data) <- ~ Longitude + Latitude
crs(RwandaWQ_data) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
###simple sample plots on a map
#afr.cnt<-subset(getMap(), continent=="Africa")
drc<-getData("GADM", country = "Democratic Republic of the Congo", level = 1)
rwanda <- getData("GADM", country = "Rwanda", level = 1) #shapefile data is available from https://gadm.org/data.html
#----Pull in Meterological Data----------
#### SETTINGS ####
#---------------CHIRPS DRC---------------
# +dates of interest ####
#start_date <- floor_date(min(as.Date(DRCWQ_data$date)), "month") # "2022-03-01" - first date of the WQ data
#end_date <- as.Date(max(DRCWQ_data$date)) # "2023-05-01" - last date
lst.dates <- seq(as.Date("2022-01-01"),as.Date("2023-11-29"),by="days")
###CHIRPS rainfall product
length(DRCWQ_data$SiteID)
chirps_nc<-"/Users/kathleenkirsch/Desktop/Rscript/403cb493-f891-488f-86c8-b36cbdee0af1.nc" #read the downloaded spatially and temporally subset CHIRPS v2p datafile from ClimateSERV; https://climateserv.servirglobal.net/
chirps_nc_stk<-raster::stack(chirps_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
chirps_rf_wq<-raster::extract(chirps_nc_stk,DRCWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
chirps_rf_wq_df<-as.data.frame(chirps_rf_wq) #create dataframe from the extracted data
chirps_rf_wq_df2<-as.data.frame(t(chirps_rf_wq_df)) #transpose the df
rownames(chirps_rf_wq_df2)<-lst.dates #rename the rows with the date vector created for the timeseries raster stack
chirps_rf_wq_df2<-rownames_to_column(chirps_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(chirps_rf_wq_df2)[2:189]<-DRCWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
chirps_rf_wq_df3<-reshape2::melt(chirps_rf_wq_df2,variable.name = "SiteID", value.name = "chirps_rf") #reshape the dataframe to from a wide table format to a long format
chirps_rf_wq_df3$chp30d<-frollsum(chirps_rf_wq_df3$chirps_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
chirps_rf_wq_df3$chp14d<-frollsum(chirps_rf_wq_df3$chirps_rf,14,fill = NA, align = "right")
chirps_rf_wq_df3$chp7d<-frollsum(chirps_rf_wq_df3$chirps_rf,7,fill = NA, align = "right")
chirps_rf_wq_df3$chp5d<-frollsum(chirps_rf_wq_df3$chirps_rf,5,fill = NA, align = "right")
chirps_rf_wq_df3$chp10d<-frollsum(chirps_rf_wq_df3$chirps_rf,10,fill = NA, align = "right")
##merge rainfall and WQ data
DRCWQ$date<-DRCWQ_data$date
DRCWQ$SiteID<-DRCWQ_data$SiteID
chirps_at_wq_DRC<-merge(DRCWQ,chirps_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
View (chirps_at_wq_DRC) #KMKcheck
#outputc should look like this;
# SiteID date WSP Colony Latitude Longitude chirps_rf rf30d
# 1 236260568 2022-04-13 FOMI Tshitenge 0 -6.127290 23.67332 0.000000 140.51398
# 2 236260568 2022-04-13 FOMI Tshitenge 0 -6.127290 23.67332 0.000000 140.51398
# 3 236260568 2022-12-01 FOMI Tshitenge 65 -6.127290 23.67332 8.582757 229.46067
# 4 236260568 2022-12-01 FOMI Tshitenge 65 -6.127290 23.67332 8.582757 229.46067
# 5 236261208 2022-05-20 SADEL 13 -6.198532 23.65095 0.000000 98.08299
# 6 236261215 2022-05-20 SADEL 48 -6.192828 23.65280 0.000000 98.08299
#convert the "Colony" variable into a numerical field as some of the values have commas as thousands separator
chirps_at_wq_DRC$Colony <- as.numeric(gsub(",","",chirps_at_wq_DRC$Colony))
#write the output to a csv file
write.csv(chirps_at_wq_DRC, paste0(path.rf, "/output/chirps_at_wq_DRC.csv"),row.names = F)
#-----------CHIRPS RWANDA------------
# +dates of interest ####
#start_date <- floor_date(min(as.Date(RwandaWQ_data$date)), "month") # "summer 2023 start" - first date of the WQ data
#end_date <- as.Date(max(RwandaWQ_data$date)) # "ongoing" - last date
lst.dates <- seq(as.Date("2022-01-01"),as.Date("2023-11-29"),by="days")
###CHIRPS rainfall product
##check length of SiteIDs and update colnames:
length(RwandaWQ_data$SiteID)
chirps_nc<-"/Users/kathleenkirsch/Desktop/Rscript/2e6ac690-b564-42ca-b50d-8d0c31df8f0a.nc" #read the downloaded spatially and temporally subset CHIRPS v2p datafile from ClimateSERV; https://climateserv.servirglobal.net/
chirps_nc_stk<-raster::stack(chirps_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
chirps_rf_wq<-raster::extract(chirps_nc_stk,RwandaWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
chirps_rf_wq_df<-as.data.frame(chirps_rf_wq) #create dataframe from the extracted data
chirps_rf_wq_df2<-as.data.frame(t(chirps_rf_wq_df)) #transpose the df
rownames(chirps_rf_wq_df2)<-lst.dates #rename the rows with the date vector created for the timeseries raster stack
chirps_rf_wq_df2<-rownames_to_column(chirps_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(chirps_rf_wq_df2)[2:1118]<-RwandaWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
chirps_rf_wq_df3<-reshape2::melt(chirps_rf_wq_df2,variable.name = "SiteID", value.name = "chirps_rf") #reshape the dataframe to from a wide table format to a long format
chirps_rf_wq_df3$chp30d<-frollsum(chirps_rf_wq_df3$chirps_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
chirps_rf_wq_df3$chp14d<-frollsum(chirps_rf_wq_df3$chirps_rf,14,fill = NA, align = "right")
chirps_rf_wq_df3$chp7d<-frollsum(chirps_rf_wq_df3$chirps_rf,7,fill = NA, align = "right")
chirps_rf_wq_df3$chp5d<-frollsum(chirps_rf_wq_df3$chirps_rf,5,fill = NA, align = "right")
chirps_rf_wq_df3$chp10d<-frollsum(chirps_rf_wq_df3$chirps_rf,10,fill = NA, align = "right")
tail(chirps_rf_wq_df3)
head(chirps_rf_wq_df3)
##merge rainfall and WQ data
RwandaWQ = na.omit(RwandaWQ)
RwandaWQ$date<-RwandaWQ_data$date
RwandaWQ$SiteID<-RwandaWQ_data$SiteID
View(RwandaWQ)
chirps_at_wq_Rwanda<-merge(RwandaWQ,chirps_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
head(chirps_at_wq_Rwanda)
View (chirps_at_wq_Rwanda) #KMKcheck
#convert the "MPN" variable into a numerical field as some of the values have commas as thousands separator
chirps_at_wq_Rwanda$MPN <- as.numeric(gsub(",","",chirps_at_wq_Rwanda$MPN))
#write the output to a csv file
write.csv(chirps_at_wq_Rwanda, paste0(path.rf, "/output/chirps_at_wq_Rwanda.csv"),row.names = F)
#-----------IMERG DRC------------------
###IMERG- late
lst.dates.img <- seq(as.Date("2022-01-01"),as.Date("2023-12-20"),by="days")
imerg_nc<-"/Users/kathleenkirsch/Desktop/Rscript/26ef7276-2bee-4e59-9f32-822aa32e9cd9.nc" #read the downloaded spatially and temporally subset imerg datafile
###NOTE: I pulled climateserv
imerg_nc_stk<-raster::stack(imerg_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
crs(imerg_nc_stk) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
imerg_rf_wq<-raster::extract(imerg_nc_stk,DRCWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
imerg_rf_wq_df<-as.data.frame(imerg_rf_wq) #create dataframe from the extracted data
imerg_rf_wq_df2<-as.data.frame(t(imerg_rf_wq_df)) #transpose the df
rownames(imerg_rf_wq_df2)<-lst.dates.img #rename the rows with the date vector created for the timeseries raster stack
imerg_rf_wq_df2<-rownames_to_column(imerg_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(imerg_rf_wq_df2)[2:189]<-DRCWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
imerg_rf_wq_df3<-reshape2::melt(imerg_rf_wq_df2,variable.name = "SiteID", value.name = "imerg_rf") #reshape the dataframe to from a wide table format to a long format
imerg_rf_wq_df3$img30d<-frollsum(imerg_rf_wq_df3$imerg_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
imerg_rf_wq_df3$img14d<-frollsum(imerg_rf_wq_df3$imerg_rf,14,fill = NA, align = "right")
imerg_rf_wq_df3$img10d<-frollsum(imerg_rf_wq_df3$imerg_rf,10,fill = NA, align = "right")
imerg_rf_wq_df3$img7d<-frollsum(imerg_rf_wq_df3$imerg_rf,7,fill = NA, align = "right")
imerg_rf_wq_df3$img5d<-frollsum(imerg_rf_wq_df3$imerg_rf,5,fill = NA, align = "right")
##merge rainfall and WQ data
# DRCWQ$date<-DRCWQ_data$date
# DRCWQ$SiteID<-DRCWQ_data$SiteID
imerg_at_wq_DRC<-merge(DRCWQ,imerg_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
View (imerg_at_wq_DRC) #KMKcheck
#outputc should look like this;
# SiteID date WSP Colony Latitude Longitude imerg_rf rf30d rf14d rf10d
# 1 236260568 2022-04-13 FOMI Tshitenge 0 -6.127290 23.67332 0.008066831 114.2119 57.84766 42.2861472
# 2 236260568 2022-04-13 FOMI Tshitenge 0 -6.127290 23.67332 0.008066831 114.2119 57.84766 42.2861472
# 3 236260568 2022-12-01 FOMI Tshitenge 65 -6.127290 23.67332 28.959114075 249.6487 117.45070 94.3415748
# 4 236260568 2022-12-01 FOMI Tshitenge 65 -6.127290 23.67332 28.959114075 249.6487 117.45070 94.3415748
# 5 236261208 2022-05-20 SADEL 13 -6.198532 23.65095 0.007829102 162.7100 22.92562 0.8062411
# 6 236261215 2022-05-20 SADEL 48 -6.192828 23.65280 0.007829102 162.7100 22.92562 0.8062411
# rf7d rf5d
# 1 33.084086833 16.332286296
# 2 33.084086833 16.332286296
# 3 94.216252863 56.426456034
# 4 94.216252863 56.426456034
# 5 0.007829102 0.007829102
# 6 0.007829102 0.007829102
#convert the "Colony" variable into a numerical field as some of the values have commas as thousands separator
imerg_at_wq_DRC$Colony <- as.numeric(gsub(",","",imerg_at_wq_DRC$Colony))
#write the output to a csv file
write.csv(imerg_at_wq_DRC, paste0(path.rf, "/output/imerg_at_wq_DRC.csv"),row.names = F)
#-----------IMERG RWANDA--------------
lst.dates.img <- seq(as.Date("2022-01-01"),as.Date("2023-12-20"),by="days")
imerg_nc<-"/Users/kathleenkirsch/Desktop/Rscript/8ca328ef-c421-4bc7-92c8-b76fefdaf1ae.nc" #read the downloaded spatially and temporally subset imerg datafile
###NOTE: I pulled climateserv
imerg_nc_stk<-raster::stack(imerg_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
crs(imerg_nc_stk) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
imerg_rf_wq<-raster::extract(imerg_nc_stk,RwandaWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
imerg_rf_wq_df<-as.data.frame(imerg_rf_wq) #create dataframe from the extracted data
imerg_rf_wq_df2<-as.data.frame(t(imerg_rf_wq_df)) #transpose the df
rownames(imerg_rf_wq_df2)<-lst.dates.img #rename the rows with the date vector created for the timeseries raster stack
imerg_rf_wq_df2<-rownames_to_column(imerg_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(imerg_rf_wq_df2)[2:1118]<-RwandaWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
imerg_rf_wq_df3<-reshape2::melt(imerg_rf_wq_df2,variable.name = "SiteID", value.name = "imerg_rf") #reshape the dataframe to from a wide table format to a long format
imerg_rf_wq_df3$img30d<-frollsum(imerg_rf_wq_df3$imerg_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
imerg_rf_wq_df3$img14d<-frollsum(imerg_rf_wq_df3$imerg_rf,14,fill = NA, align = "right")
imerg_rf_wq_df3$img10d<-frollsum(imerg_rf_wq_df3$imerg_rf,10,fill = NA, align = "right")
imerg_rf_wq_df3$img7d<-frollsum(imerg_rf_wq_df3$imerg_rf,7,fill = NA, align = "right")
imerg_rf_wq_df3$img5d<-frollsum(imerg_rf_wq_df3$imerg_rf,5,fill = NA, align = "right")
##merge rainfall and WQ data
# RwandaWQ$date<-RwandaWQ_data$date
# RwandaWQ$SiteID<-RwandaWQ_data$SiteID
imerg_at_wq_Rwanda<-merge(RwandaWQ,imerg_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
View (imerg_at_wq_Rwanda) #KMKcheck
#convert the "MPN" variable into a numerical field as some of the values have commas as thousands separator
imerg_at_wq_Rwanda$MPN <- as.numeric(gsub(",","",imerg_at_wq_Rwanda$MPN))
#write the output to a csv file
write.csv(imerg_at_wq_Rwanda, paste0(path.rf, "/output/imerg_at_wq_Rwanda.csv"),row.names = F)
#---------LIS DRC---------
###LIS- runoff
###Note: recommended site I pulled from also climateserv
lst.dates.lis <- seq(as.Date("2022-01-01"),as.Date("2023-12-24"),by="days")
lis_nc<-"/Users/kathleenkirsch/Desktop/Rscript/LIS-runoff-daily-01012022-24122023-corrected.nc" #read the downloaded spatially and temporally subset lis datafile downloaded from climateserv
lis_nc_stk<-raster::stack(lis_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
crs(lis_nc_stk) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
lis_rf_wq<-raster::extract(lis_nc_stk,DRCWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
lis_rf_wq_df<-as.data.frame(lis_rf_wq) #create dataframe from the extracted data
lis_rf_wq_df2<-as.data.frame(t(lis_rf_wq_df)) #transpose the df
rownames(lis_rf_wq_df2)<-lst.dates.lis #rename the rows with the date vector created for the timeseries raster stack
lis_rf_wq_df2<-rownames_to_column(lis_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(lis_rf_wq_df2)[2:189]<-DRCWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
lis_rf_wq_df3<-reshape2::melt(lis_rf_wq_df2,variable.name = "SiteID", value.name = "lis_runoff") #reshape the dataframe to from a wide table format to a long format
lis_rf_wq_df3$lis30d<-frollsum(lis_rf_wq_df3$lis_runoff,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
lis_rf_wq_df3$lis14d<-frollsum(lis_rf_wq_df3$lis_runoff,14,fill = NA, align = "right")
lis_rf_wq_df3$lis0d<-frollsum(lis_rf_wq_df3$lis_runoff,10,fill = NA, align = "right")
lis_rf_wq_df3$lis7d<-frollsum(lis_rf_wq_df3$lis_runoff,7,fill = NA, align = "right")
lis_rf_wq_df3$lis5d<-frollsum(lis_rf_wq_df3$lis_runoff,5,fill = NA, align = "right")
##merge rainfall and WQ data
# DRCWQ$date<-DRCWQ_data$date
# DRCWQ$SiteID<-DRCWQ_data$SiteID
lis_at_wq_DRC<-merge(DRCWQ,lis_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
View (lis_at_wq_DRC) #KMKcheck
#outputc should look like this;
# SiteID date WSP Colony Latitude Longitude lis_rf rf30d rf14d rf10d
# 1 236260568 2022-04-13 FOMI Tshitenge 0 -6.127290 23.67332 9.531376e-07 15.96225 5.935595 4.913927514
# 2 236260568 2022-04-13 FOMI Tshitenge 0 -6.127290 23.67332 9.531376e-07 15.96225 5.935595 4.913927514
# 3 236260568 2022-12-01 FOMI Tshitenge 65 -6.127290 23.67332 9.570781e+00 60.24816 32.720138 27.717481035
# 4 236260568 2022-12-01 FOMI Tshitenge 65 -6.127290 23.67332 9.570781e+00 60.24816 32.720138 27.717481035
# 5 236261208 2022-05-20 SADEL 13 -6.198532 23.65095 1.400630e-07 49.43797 7.206170 0.009684263
# 6 236261215 2022-05-20 SADEL 48 -6.192828 23.65280 1.400630e-07 49.43797 7.206170 0.009684263
# rf7d rf5d
# 1 4.344562e+00 2.335235e+00
# 2 4.344562e+00 2.335235e+00
# 3 2.770782e+01 1.694874e+01
# 4 2.770782e+01 1.694874e+01
# 5 1.400755e-07 1.400755e-07
# 6 1.400755e-07 1.400755e-07
#convert the "Colony" variable into a numerical field as some of the values have commas as thousands separator
lis_at_wq_DRC$Colony <- as.numeric(gsub(",","",lis_at_wq_DRC$Colony))
#write the output to a csv file
write.csv(lis_at_wq_DRC, paste0(path.rf, "/output/lis_at_wq_DRC.csv"),row.names = F)
#------LIS RWANDA-----------------
lst.dates.lis <- seq(as.Date("2022-01-01"),as.Date("2023-12-24"),by="days")
lis_nc<-"/Users/kathleenkirsch/Desktop/Rscript/LIS-runoff-daily-01012022-24122023-corrected.nc" #read the downloaded spatially and temporally subset lis datafile downloaded from climateserv
lis_nc_stk<-raster::stack(lis_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
crs(lis_nc_stk) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
lis_rf_wq<-raster::extract(lis_nc_stk,RwandaWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
lis_rf_wq_df<-as.data.frame(lis_rf_wq) #create dataframe from the extracted data
lis_rf_wq_df2<-as.data.frame(t(lis_rf_wq_df)) #transpose the df
rownames(lis_rf_wq_df2)<-lst.dates.lis #rename the rows with the date vector created for the timeseries raster stack
lis_rf_wq_df2<-rownames_to_column(lis_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(lis_rf_wq_df2)[2:1118]<-RwandaWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
lis_rf_wq_df3<-reshape2::melt(lis_rf_wq_df2,variable.name = "SiteID", value.name = "lis_runoff") #reshape the dataframe to from a wide table format to a long format
lis_rf_wq_df3$lis30d<-frollsum(lis_rf_wq_df3$lis_runoff,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
lis_rf_wq_df3$lis14d<-frollsum(lis_rf_wq_df3$lis_runoff,14,fill = NA, align = "right")
lis_rf_wq_df3$lis0d<-frollsum(lis_rf_wq_df3$lis_runoff,10,fill = NA, align = "right")
lis_rf_wq_df3$lis7d<-frollsum(lis_rf_wq_df3$lis_runoff,7,fill = NA, align = "right")
lis_rf_wq_df3$lis5d<-frollsum(lis_rf_wq_df3$lis_runoff,5,fill = NA, align = "right")
##merge rainfall and WQ data
# RwandaWQ$date<-RwandaWQ_data$date
# RwandaWQ$SiteID<-RwandaWQ_data$SiteID
lis_at_wq_Rwanda<-merge(RwandaWQ,lis_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
View (lis_at_wq_Rwanda) #KMKcheck
#convert the "MPN" variable into a numerical field as some of the values have commas as thousands separator
lis_at_wq_Rwanda$MPN <- as.numeric(gsub(",","",lis_at_wq_Rwanda$MPN))
#write the output to a csv file
write.csv(lis_at_wq_Rwanda, paste0(path.rf, "/output/lis_at_wq_Rwanda.csv"),row.names = F)
#------- TAMSAT DRC----------------
###TAMSAT rainfall product
lst.dates.tamsat <- seq(as.Date("2022-01-01"),as.Date("2023-12-20"),by="days")
tamsat_nc<-"/Users/kathleenkirsch/Desktop/Rscript/tamsat_01012022-20122023_daily.nc" #read the downloaded spatially and temporally subset tamsatap datafile downloaded
#if you lose pull from https://iridl.ldeo.columbia.edu/SOURCES/.Reading/.Meteorology/.TAMSAT/.v3p1/.daily/.rfe/dataselection.html
#note: talk to Denis -- this file type needs preprocessing (ask how to use Climate Data Operators?)
tamsat_nc_stk<-raster::stack(tamsat_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
tamsat_rf_wq<-raster::extract(tamsat_nc_stk,DRCWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
tamsat_rf_wq_df<-as.data.frame(tamsat_rf_wq) #create dataframe from the extracted data
tamsat_rf_wq_df2<-as.data.frame(t(tamsat_rf_wq_df)) #transpose the df
rownames(tamsat_rf_wq_df2)<-lst.dates.tamsat #rename the rows with the date vector created for the timeseries raster stack
tamsat_rf_wq_df2<-rownames_to_column(tamsat_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(tamsat_rf_wq_df2)[2:189]<-DRCWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
tamsat_rf_wq_df3<-reshape2::melt(tamsat_rf_wq_df2,variable.name = "SiteID", value.name = "tamsat_rf") #reshape the dataframe to from a wide table format to a long format
tamsat_rf_wq_df3$tam30d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
tamsat_rf_wq_df3$tam4d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,14,fill = NA, align = "right")
tamsat_rf_wq_df3$tam10d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,10,fill = NA, align = "right")
tamsat_rf_wq_df3$tamf7d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,7,fill = NA, align = "right")
tamsat_rf_wq_df3$tamf5d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,5,fill = NA, align = "right")
##merge rainfall and WQ data
# DRCWQ$date<-DRCWQ_data$date
# DRCWQ$SiteID<-DRCWQ_data$SiteID
tamsat_at_wq_DRC<-merge(DRCWQ,tamsat_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
View (tamsat_at_wq_DRC) #KMKcheck
#outputc should look like this;
# SiteID date WSP Colony Latitude Longitude tamsat_rf tam30d tam4d tam10d tamf7d tamf5d
# 1 236260568 2022-04-13 FOMI Tshitenge 0 -6.127290 23.67332 0 141.6 61.6 36.6 36.6 18.3
# 2 236260568 2022-04-13 FOMI Tshitenge 0 -6.127290 23.67332 0 141.6 61.6 36.6 36.6 18.3
# 3 236260568 2022-12-01 FOMI Tshitenge 65 -6.127290 23.67332 0 145.1 61.1 50.5 50.5 31.4
# 4 236260568 2022-12-01 FOMI Tshitenge 65 -6.127290 23.67332 0 145.1 61.1 50.5 50.5 31.4
# 5 236261208 2022-05-20 SADEL 13 -6.198532 23.65095 0 94.4 20.1 0.0 0.0 0.0
# 6 236261215 2022-05-20 SADEL 48 -6.192828 23.65280 0 94.4 20.1 0.0 0.0 0.0
#convert the "Colony" variable into a numerical field as some of the values have commas as thousands separator
tamsat_at_wq_DRC$Colony <- as.numeric(gsub(",","",tamsat_at_wq_DRC$Colony))
#write the output to a csv file
write.csv(tamsat_at_wq_DRC, paste0(path.rf, "/output/tamsat_at_wq_DRC.csv"),row.names = F)
#-------TAMSAT RWANDA---------
lst.dates.tamsat <- seq(as.Date("2022-01-01"),as.Date("2023-12-20"),by="days")
tamsat_nc<-"/Users/kathleenkirsch/Desktop/Rscript/tamsat_01012022-20122023_daily.nc" #read the downloaded spatially and temporally subset tamsatap datafile downloaded
#if you lose pull from https://iridl.ldeo.columbia.edu/SOURCES/.Reading/.Meteorology/.TAMSAT/.v3p1/.daily/.rfe/dataselection.html
#note: talk to Denis -- this file type needs preprocessing (ask how to use Climate Data Operators?)
tamsat_nc_stk<-raster::stack(tamsat_nc) #stack the netcdf file into a raster stack - should contain all the dates in the subset
tamsat_rf_wq<-raster::extract(tamsat_nc_stk,RwandaWQ_data,fun=mean,na.rm=T) #extract rainfall data at WQ sample locations
tamsat_rf_wq_df<-as.data.frame(tamsat_rf_wq) #create dataframe from the extracted data
tamsat_rf_wq_df2<-as.data.frame(t(tamsat_rf_wq_df)) #transpose the df
rownames(tamsat_rf_wq_df2)<-lst.dates.tamsat #rename the rows with the date vector created for the timeseries raster stack
tamsat_rf_wq_df2<-rownames_to_column(tamsat_rf_wq_df2,var="date") #assign a variable name for the date column
colnames(tamsat_rf_wq_df2)[2:1118]<-RwandaWQ_data$SiteID #assign SiteID names to all other columns containing the extracted rainfall data at sites/locations
tamsat_rf_wq_df3<-reshape2::melt(tamsat_rf_wq_df2,variable.name = "SiteID", value.name = "tamsat_rf") #reshape the dataframe to from a wide table format to a long format
tamsat_rf_wq_df3$tam30d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,30,fill = NA, align = "right") #calculate a rolling sum of rainfall 30 days prior to the wq sample collection date
tamsat_rf_wq_df3$tam4d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,14,fill = NA, align = "right")
tamsat_rf_wq_df3$tam10d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,10,fill = NA, align = "right")
tamsat_rf_wq_df3$tamf7d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,7,fill = NA, align = "right")
tamsat_rf_wq_df3$tamf5d<-frollsum(tamsat_rf_wq_df3$tamsat_rf,5,fill = NA, align = "right")
##merge rainfall and WQ data
# RwandaWQ$date<-RwandaWQ_data$date
# RwandaWQ$SiteID<-RwandaWQ_data$SiteID
tamsat_at_wq_Rwanda<-merge(RwandaWQ,tamsat_rf_wq_df3,by=c("SiteID","date")) #merge by both SiteIDs and dates
View (tamsat_at_wq_Rwanda) #KMKcheck
#convert the "MPN" variable into a numerical field as some of the values have commas as thousands separator
tamsat_at_wq_Rwanda$MPN <- as.numeric(gsub(",","",tamsat_at_wq_Rwanda$MPN))
#write the output to a csv file
write.csv(tamsat_at_wq_Rwanda, paste0(path.rf, "/output/tamsat_at_wq_Rwanda.csv"),row.names = F)
#------MERGE DFs--------
rf_at_wq_DRC1 <- merge(chirps_at_wq_DRC,tamsat_at_wq_DRC,by=c("SiteID","date","WSP","Colony","Latitude","Longitude"))
rf_at_wq_DRC2 <- merge(imerg_at_wq_DRC,lis_at_wq_DRC,by=c("SiteID","date","WSP","Colony","Latitude","Longitude"))
rf_at_wq_DRC_all <- merge(rf_at_wq_DRC1,rf_at_wq_DRC2,by=c("SiteID","date","WSP","Colony","Latitude","Longitude"))
##export merged output
write.csv(rf_at_wq_DRC_all, "/Users/kathleenkirsch/Desktop/Rscript/output/rf_at_wq_DRC_all.csv",row.names = F)
rf_at_wq_Rwanda1 <- merge(chirps_at_wq_Rwanda,tamsat_at_wq_Rwanda,by=c("SiteID","date","School","MPN","Latitude","Longitude"))
rf_at_wq_Rwanda2 <- merge(imerg_at_wq_Rwanda,lis_at_wq_Rwanda,by=c("SiteID","date","School","MPN","Latitude","Longitude"))
rf_at_wq_Rwanda_all <- merge(rf_at_wq_Rwanda1,rf_at_wq_Rwanda2,by=c("SiteID","date","School","MPN","Latitude","Longitude"))
##export merged output
write.csv(rf_at_wq_Rwanda_all, "/Users/kathleenkirsch/Desktop/Rscript/output/rf_at_wq_Rwanda_all.csv",row.names = F)
Statistics:
```{# rf_at_wq_DRC_all <- read.csv(“/Users/kathleenkirsch/Desktop/Rscript/output/rf_at_wq_DRC_all.csv”) #can also (re)read the merged data file using this code}
names(rf_at_wq_DRC_all) #names of the dataframe so we know which variables we are working with for the modeling
[1] “SiteID” “date” “WSP” “Colony” “Latitude” “Longitude” “chirps_rf” “chp30d”
[9] “chp14d” “chp7d” “chp5d” “chp10d” “tamsat_rf” “tam30d” “tam4d” “tam10d”
[17] “tamf7d” “tamf5d” “imerg_rf” “img30d” “img14d” “img10d” “img7d” “img5d”
[25] “lis_runoff” “lis30d” “lis14d” “lis0d” “lis7d” “lis5d”
rf_at_wq_Rwanda_all <- read.csv(“/Users/kathleenkirsch/Desktop/Rscript/output/rf_at_wq_Rwanda_all.csv”) #can also (re)read the merged data file using this code
names(rf_at_wq_Rwanda_all) #names of the dataframe so we know which variables we are working with for the modeling
#——–Analysis———- #sample_drc_model <- lm(Colony~chp30d, data=rf_at_wq_DRC_all) #fit a simple linear model #summary(sample_drc_model)
#sample_drc_fe_model <- lm(Colony~chp30d+factor(SiteID)-1, data=rf_at_wq_DRC_all) #fit a simple linear model with “SiteID” as the fixed effect variable #summary(sample_drc_fe_model)
#sample_Rwanda_model <- lm(MPN~chp30d, data=rf_at_wq_Rwanda_all) #fit a simple linear model #summary(sample_Rwanda_model)
#sample_Rwanda_fe_model <- lm(MPN~chp30d+factor(SiteID)-1, data=rf_at_wq_Rwanda_all) #fit a simple linear model with “SiteID” as the fixed effect variable #summary(sample_Rwanda_fe_model)
#regression of rainfall versus water quality #check correlation coefficient to assess linearity coefficientDRC <- cor.test(rf_at_wq_DRC_all\(Colony, rf_at_wq_DRC_all\)imerg_rf) coefficientDRC$estimate
coefficientRwanda <- cor.test(rf_at_wq_Rwanda_all\(MPN, rf_at_wq_Rwanda_all\)imerg_rf) coefficientRwanda$estimate
#linear regression model lmmodelDRC <- lm(Colony ~ imerg_rf, data = rf_at_wq_DRC_all)
lmmodelRwanda <- lm(MPN ~ imerg_rf, data = rf_at_wq_Rwanda_all)
#Check residual standard error summary(lmmodelDRC) summary(lmmodelRwanda)
#Check R2 summary(lmmodelDRC)\(r.squared summary(lmmodelRwanda)\)r.squared
###scatterplot basic #ggscatter(rf_at_wq_DRC_all, x = “Colony”, y = “imerg_rf”, add = “reg.line”) + # stat_cor(label.x = 3, label.y = 34) + # stat_regline_equation(label.x = 3, label.y = 32)
#ggscatter(rf_at_wq_Rwanda_all, x = “MPN”, y = “imerg_rf”, add = “reg.line”) + # stat_cor(label.x = 3, label.y = 34) + # stat_regline_equation(label.x = 3, label.y = 32)
#—-Scatterplots
#scatterplot: CHIRPS DRC ggscatter(rf_at_wq_DRC_all, x = “chirps_rf”, y = “Colony”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “CHIRPS Water Quality Plot: DRC”, xlab = “Rainfall (mm)”, ylab = “Thermotolerant coliforms (CFU/100ml)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) ) + stat_cor( aes(label = paste(..rr.label.., ..p.label.., sep = “,”)), label.x = 3 )
#scatterplot: IMERG DRC ggscatter(rf_at_wq_DRC_all, x = “imerg_rf”, y = “Colony”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “IMERG Water Quality Plot: DRC”, xlab = “Rainfall (mm)”, ylab = “Thermotolerant coliforms (CFU/100ml)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) ) + stat_cor( aes(label = paste(..rr.label.., ..p.label.., sep = “,”)), label.x = 3 )
#scatterplot: LIS DRC ggscatter(rf_at_wq_DRC_all, x = “lis_runoff”, y = “Colony”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “LIS Water Quality Plot: DRC”, xlab = “Runoff (mm)”, ylab = “Thermotolerant coliforms (CFU/100ml)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) ) + stat_cor( aes(label = paste(..rr.label.., ..p.label.., sep = “,”)), label.x = 3 )
#scatterplot: TAMSAT DRC ggscatter(rf_at_wq_DRC_all, x = “tamsat_rf”, y = “Colony”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “TAMSAT Water Quality Plot: DRC”, xlab = “Rainfall (mm)”, ylab = “Thermotolerant coliforms (CFU/100ml)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) ) + stat_cor( aes(label = paste(..rr.label.., ..p.label.., sep = “,”)), label.x = 3 )
#scatterplot: CHIRPS Rwanda ggscatter(rf_at_wq_Rwanda_all, x = “chirps_rf”, y = “MPN”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “CHIRPS Water Quality Plot: Rwanda”, xlab = “Rainfall (mm)”, ylab = “E.coli (MPN)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) )
#scatterplot: IMERG Rwanda ggscatter(rf_at_wq_Rwanda_all, x = “imerg_rf”, y = “MPN”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “IMERG Water Quality Plot: Rwanda”, xlab = “Rainfall (mm)”, ylab = “E.coli (MPN)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) )
#scatterplot: LIS Rwanda ggscatter(rf_at_wq_Rwanda_all, x = “lis_runoff”, y = “MPN”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “LIS Water Quality Plot: Rwanda”, xlab = “Runoff (mm)”, ylab = “E.coli (MPN)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) )
#scatterplot: TAMSAT Rwanda ggscatter(rf_at_wq_Rwanda_all, x = “tamsat_rf”, y = “MPN”, color = “black”, shape = 21, size = 3, # Points color, shape and size title = “TAMSAT Water Quality Plot: Rwanda”, xlab = “Rainfall (mm)”, ylab = “E.coli (MPN)”, add = “reg.line”, # Add regressin line add.params = list(color = “blue”, fill = “lightgray”), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor cor.coeff.args = list(method = “pearson”, label.x = 3, label.sep = “”) )
#—-Negative Binomial Regression m1DRC <- glm.nb(Colony ~ imerg_rf, data=rf_at_wq_DRC_all) summary(m1DRC)
m2DRC <- glm.nb(Colony ~ imerg_rf + SiteID, data=rf_at_wq_DRC_all) #Trying to account here for multiple samples at same pt fixed effects, but this returns error due to “NA/NaN/Inf in ‘x’” but there are no NA – check with Evan. summary(m2DRC)
m1Rwanda <- glm.nb(MPN ~ imerg_rf, data=rf_at_wq_Rwanda_all) #Trying to account here for multiple samples at same pt fixed effects, but this returns error due to X. I believe this is either due to high AIC or theta, but adding control: control = glm.control(maxit = 500), init.theta = 1.0 did not correct this. So it may be an integer issue? Check with Evan. summary(m1Rwanda)
m2Rwanda <- glm.nb(MPN ~ imerg_rf + SiteID, data=rf_at_wq_Rwanda_all) summary(m2Rwanda)
```
Mapping