Synopsis:

This is an attempt to explore a part of the Mystic River Watershed Association (MyRWA: myrwa.org) database. I am particularly interested in looking at the historical data gleaned from https://github.com/Arcticgrayling/MyRWA/tree/master/projects/WQ7981 and entered into the data base. I want to see how this data compares to current data from the sites sampled in this report. I am using techinques learned from Roger Peng and his Exploratory Data Analysis with R class and book: https://www.coursera.org/learn/exploratory-data-analysis and https://leanpub.com/exdata

Cleaning the Data

I was give an Excel(.xlsx) file from Andy Hycerna at Myrwa which includes the sites in question from the MyRWA database. It is large and the R xlsx reader had trouble handling it. I used Excel to convert the data to .csv format, being careful to change the date format in Excel to give the complete data and time for each site, which loads easily into R.

rm(list=ls())   # Clean: remove all variables in Enviornment
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
#library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)


wd <- "~/Documents/MyRWA/projects/ExploreWQ7981/"
#scriptsLoc <- paste(wd,"scripts/analyze/", sep = "")
dataLoc <- paste(wd,"data/", sep = "")
#resultsLoc <- paste(wd,"results/", sep = "")
setwd(wd)

site.data <- paste(dataLoc,"siteData.csv", sep = "")  

dataRaw <- read.csv(site.data)

#list of database names of locations sampled in report
reportLocations <- c("AJ01","ABRN01","ABR070","HAB001","ABR049",
                     "ABR036","ABR028","ABR017","ABR006","ABR001",
                     "UPL001","MYR071","MIB001","MWRA070","MWRA172",
                     "MWRA061","MYR39","MAR036","MWRA059","HAB0015",
                     "HAB002","SWBB","HOB103","SHBTRIB","CUBLEX","HBT01",
                     "HBT02","HBT03","HBT05","HBT06","HBT07","HBT08")

#Show characteristics of the data set:

dim(dataRaw)
## [1] 168346      9
str(dataRaw)
## 'data.frame':    168346 obs. of  9 variables:
##  $ ID              : int  358 359 360 361 362 363 364 365 366 367 ...
##  $ CharacteristicID: Factor w/ 65 levels "ALK","ARSENIC",..: 19 19 19 19 19 19 19 19 19 19 ...
##  $ ResultValue     : num  40 650 10 710 440 940 40 950 870 1000 ...
##  $ Units           : Factor w/ 14 levels "","%","CFU/100ml",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Qualifier       : Factor w/ 3 levels "","<",">": 1 1 2 1 1 1 1 1 1 1 ...
##  $ LocationID      : Factor w/ 963 levels "ABR001","ABR002",..: 825 45 480 11 4 863 634 15 554 554 ...
##  $ ProjectID       : Factor w/ 7 levels "BASE","BHWQM",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Datetime        : Factor w/ 14146 levels "1/10/1995 10:31",..: 8917 8918 8919 8920 8921 8922 8923 8924 8924 8924 ...
##  $ SampleDepthType : Factor w/ 3 levels "","B","S": 1 1 1 1 1 1 1 1 1 1 ...
head(dataRaw)
##    ID CharacteristicID ResultValue     Units Qualifier LocationID
## 1 358            FCOLI          40 CFU/100ml               UPL001
## 2 359            FCOLI         650 CFU/100ml               ALB006
## 3 360            FCOLI          10 CFU/100ml         <     MAR036
## 4 361            FCOLI         710 CFU/100ml               ABR028
## 5 362            FCOLI         440 CFU/100ml               ABR006
## 6 363            FCOLI         940 CFU/100ml               WIB001
##   ProjectID        Datetime SampleDepthType
## 1      BASE 7/12/2000 06:03                
## 2      BASE 7/12/2000 06:25                
## 3      BASE 7/12/2000 06:40                
## 4      BASE 7/12/2000 06:44                
## 5      BASE 7/12/2000 06:48                
## 6      BASE 7/12/2000 06:58
tail(dataRaw)
##            ID CharacteristicID ResultValue     Units Qualifier LocationID
## 168341 121325            ECOLI     10.9000 MPN/100ml              WEPBOBC
## 168342 121325             TURB      3.4900       NTU              WEPBOBC
## 168343 121326       TEMP_WATER     25.8000     deg C           MYRBOBDOCK
## 168344 121326           SPCOND   1709.9000     uS/cm           MYRBOBDOCK
## 168345 121326               DO      9.2100      mg/l           MYRBOBDOCK
## 168346 121326           DO_SAT    113.5622         %           MYRBOBDOCK
##        ProjectID        Datetime SampleDepthType
## 168341   RECFLAG 7/21/2015 09:20                
## 168342   RECFLAG 7/21/2015 09:20                
## 168343   RECFLAG 7/22/2015 07:15                
## 168344   RECFLAG 7/22/2015 07:15                
## 168345   RECFLAG 7/22/2015 07:15                
## 168346   RECFLAG 7/22/2015 07:15

We can see the data set is fairly large.

Lets narrow the data down to sites that were in the report and the columns that we are interested in.

## get just location from report

d <- filter(dataRaw, dataRaw$LocationID %in% reportLocations)
#d <- subset(d, d$LocationID %in% reportLocations)
# get just useful columns
d <- select(d, one_of("CharacteristicID","ResultValue","Units","LocationID","Datetime"))
d <- filter(d,!is.na(ResultValue))
dim(d)
## [1] 34285     5
head(d)
##   CharacteristicID ResultValue     Units LocationID        Datetime
## 1            FCOLI          40 CFU/100ml     UPL001 7/12/2000 06:03
## 2            FCOLI          10 CFU/100ml     MAR036 7/12/2000 06:40
## 3            FCOLI         710 CFU/100ml     ABR028 7/12/2000 06:44
## 4            FCOLI         440 CFU/100ml     ABR006 7/12/2000 06:48
## 5            FCOLI          40 CFU/100ml     MYR071 7/12/2000 07:07
## 6            FCOLI         950 CFU/100ml     ABR049 7/12/2000 07:10
str(d)
## 'data.frame':    34285 obs. of  5 variables:
##  $ CharacteristicID: Factor w/ 65 levels "ALK","ARSENIC",..: 19 19 19 19 19 19 19 19 19 19 ...
##  $ ResultValue     : num  40 10 710 440 40 950 870 1000 40 1100 ...
##  $ Units           : Factor w/ 14 levels "","%","CFU/100ml",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ LocationID      : Factor w/ 963 levels "ABR001","ABR002",..: 825 480 11 4 634 15 554 554 825 11 ...
##  $ Datetime        : Factor w/ 14146 levels "1/10/1995 10:31",..: 8917 8919 8920 8921 8923 8924 8924 8924 12372 12373 ...
#refactor: (from hadley,http://grokbase.com/t/r/r-help/0765n5rmsf/r-refactor-all-factors-in-a-data-frame)
cat <- sapply(d, is.factor)
d[cat] <- lapply(d[cat], factor)       

## see where there are enough values to explore further
table(d$LocationID, d$CharacteristicID)
##          
##           ALK ARSENIC BARIUM BOD5 CADMIUM CHLORIDE CHLORINE CHROMIUM COD
##   ABR001    4       2      3    4       3        3        0        2   4
##   ABR006    7       2      3    7       2        4        0        2   5
##   ABR017    7       3      4    7       3        3        0        3   5
##   ABR028    7       3      4    7       3        3        0        3   5
##   ABR036    7       3      4    7       3        3        0        3   6
##   ABR049    7       3      4    7       3        3        0        4   7
##   ABR070    7       3      4    7       3        3        0        4   7
##   ABRN01    7       4      4    7       3        3        0        4   7
##   AJ01      5       3      3    5       2        2        0        3   5
##   CUBLEX    3       0      3    3       2        2        0        2   3
##   HAB001    7       3      4    7       3        3        0        4   7
##   HAB0015   7       4      4    7       3        3        0        4   7
##   HAB002    7       4      4    7       3        3        0        4   7
##   HBT01     6       4      4    6       3        3        0        4   6
##   HBT02     6       4      4    6       3        3        0        4   6
##   HBT03     3       3      3    3       2        2        0        3   3
##   HBT05     7       4      4    7       3        3        0        4   6
##   HBT06     7       4      4    7       3        3        0        4   6
##   HBT07     3       3      3    3       2        2        0        3   3
##   HBT08     7       4      4    7       3        3        0        4   6
##   HOB103    2       2      2    2       1        1        0        2   2
##   MAR036    5       2      3    4       2        4        0        2   4
##   MIB001    7       3      4    6       3        5        1        3   6
##   MWRA059   5       2      3    5       2        4        0        2   3
##   MWRA061   5       2      3    4       2        4        0        2   4
##   MWRA070   6       3      4    5       3        5        0        3   5
##   MWRA172   2       0      1    2       2        2        0        1   2
##   MYR071    7       3      4    6       3        5        0        3   6
##   MYR39     1       1      1    1       1        1        0        1   1
##   SHBTRIB   3       2      3    3       2        2        0        2   3
##   SWBB      7       3      4    7       3        3        0        3   5
##   UPL001    7       3      4    6       3        5        0        3   6
##          
##           COPPER DEPTH  DO DO_SAT ECOLI ENT FCOLI HARDNESS IRON LEAD
##   ABR001       3     0  10     10     6   1     6        1    3    3
##   ABR006       3     0 212    201   183   2    38        1    3    3
##   ABR017       4     0  10     10     2   1     7        1    4    4
##   ABR028       4     0 213    208   182   1    38        1    4    4
##   ABR036       4     0  14     14     5   1    10        1    4    4
##   ABR049       4     0 202    200   177   1    39        1    4    4
##   ABR070       4     0   9      9     1   0     9        1    4    4
##   ABRN01       4     0  10     10     1   0    10        1    4    4
##   AJ01         3     0   4      4     0   0     4        0    3    3
##   CUBLEX       3     0   4      4     2   0     4        1    3    3
##   HAB001       4     0  12     12     4   0    10        1    4    4
##   HAB0015      4     0   7      7     0   0     7        1    4    4
##   HAB002       4     0   8      8     0   0     7        1    4    4
##   HBT01        4     0   6      6     0   0     5        1    4    4
##   HBT02        4     0   6      5     0   0     5        1    4    4
##   HBT03        3     0   3      3     0   0     3        0    3    3
##   HBT05        4     0   7      7     0   0     6        1    4    4
##   HBT06        4     0   7      7     0   0     6        1    4    4
##   HBT07        3     0   4      4     0   0     4        0    3    3
##   HBT08        4     0   7      7     0   0     6        1    4    4
##   HOB103       2     0   2      2     1   0     2        0    2    2
##   MAR036       3     0 194    189   180   1    33        1    3    3
##   MIB001       4     0 208    206   182   0    39        1    4    4
##   MWRA059      3   426 786    787   263 526   297        1    3    3
##   MWRA061      3     0   5      5     0   0     5        1    3    3
##   MWRA070      4   123 698    688   353 601   335        1    4    4
##   MWRA172      2     0 633    631   335 426   175        1    2    1
##   MYR071       4     0 211    201   185   2    35        1    4    4
##   MYR39        1     0   2      2     1   0     2        0    1    1
##   SHBTRIB      3     0   4      4     1   0     3        1    3    3
##   SWBB         4     0  10     10     5   0     7        1    4    4
##   UPL001       4     0 200    190   173   0    35        1    4    4
##          
##           MANGANESE MERCURY NH3 NICKEL NO2 NO23 NO3  PH SALINITY SECCHI
##   ABR001          1       1   4      1   0    0   4   5        7      0
##   ABR006          1       2  54      2   7  196  14  80       15      0
##   ABR017          1       2   7      2   0    0   7   8        2      0
##   ABR028          1       2  56      2   7  193  14  89       13      0
##   ABR036          1       2   8      2   0    0   7   8        6      0
##   ABR049          1       2  52      2   7  188  14  84       15      0
##   ABR070          1       2   7      2   0    0   7   8        2      0
##   ABRN01          1       2   7      2   0    0   7   7        3      0
##   AJ01            1       2   5      2   0    0   5   5        0      0
##   CUBLEX          0       1   3      1   0    0   3   4        2      0
##   HAB001          1       2   7      2   0    0   7   8        4      0
##   HAB0015         1       2   7      2   0    0   7   7        0      0
##   HAB002          1       2   7      2   0    0   7   7        1      0
##   HBT01           1       2   6      2   0    0   6   6        0      0
##   HBT02           1       2   6      2   0    0   6   6        0      0
##   HBT03           1       2   3      2   0    0   3   3        0      0
##   HBT05           1       2   7      2   0    0   7   7        0      0
##   HBT06           1       2   7      2   0    0   7   7        0      0
##   HBT07           1       2   3      2   0    0   3   3        0      0
##   HBT08           1       2   7      2   0    0   7   7        0      0
##   HOB103          0       1   2      1   0    0   2   2        0      0
##   MAR036          0       1  52      1   6  191  11  78        8      0
##   MIB001          1       2  55      2   7  194  14  86       14      0
##   MWRA059         0       1   5      1   0    0   5 715      791    426
##   MWRA061         0       1   5      1   0    0   5   5        0      0
##   MWRA070         1       2   6      2   0    0   6 614      678    123
##   MWRA172         1       1   2      1   0    0   2 573      608      0
##   MYR071          1       2  54      2   7  198  14  68       11      0
##   MYR39           0       0   1      0   0    0   1   2        1      0
##   SHBTRIB         0       1   3      1   0    0   3   3        1      0
##   SWBB            1       2   9      2   0    0   7   7        2      0
##   UPL001          1       2  55      2   7  192  14  83        8      0
##          
##           SELENIUM SILVER SODIUM SPCOND SULFATE SURF_ANIONIC TCOLI
##   ABR001         1      1      3      7       2            0     4
##   ABR006         2      2      2    211       2            0     7
##   ABR017         2      2      3      3       2            0     7
##   ABR028         2      2      3    207       2            0     7
##   ABR036         2      2      3      7       2            1     7
##   ABR049         2      2      3    204       2            0     7
##   ABR070         2      2      3      2       2            0     7
##   ABRN01         2      2      3      3       2            0     7
##   AJ01           2      2      2      0       2            0     5
##   CUBLEX         1      1      2      2       1            0     3
##   HAB001         2      2      3      4       2            0     7
##   HAB0015        2      2      3      0       2            0     7
##   HAB002         2      2      3      1       2            0     6
##   HBT01          2      2      3      0       2            0     5
##   HBT02          2      2      3      0       2            0     5
##   HBT03          2      2      2      0       2            0     3
##   HBT05          2      2      3      0       2            0     6
##   HBT06          2      2      3      0       2            0     6
##   HBT07          2      2      2      0       2            0     4
##   HBT08          2      2      3      0       2            0     6
##   HOB103         1      1      1      1       1            0     2
##   MAR036         1      1      2    200       1            0     5
##   MIB001         2      2      3    210       2            1     7
##   MWRA059        1      1      2    725       1            0     5
##   MWRA061        1      1      2      0       1            0     5
##   MWRA070        2      2      3    632       2            0     6
##   MWRA172        1      1      2    631       1            0     2
##   MYR071         2      2      3    209       2            0     7
##   MYR39          0      0      1      1       1            0     1
##   SHBTRIB        1      1      2      1       1            0     3
##   SWBB           2      2      3      4       2            2     7
##   UPL001         2      2      3    200       2            0     7
##          
##           TEMP_WATER TKN  TN  TP  TS TSS TURB VANADIUM ZINC
##   ABR001          10   4   0   4   4   4    0        1    3
##   ABR006         207   7   1 210   7 209   12        2    3
##   ABR017          10   7   0   7   7   7    0        2    4
##   ABR028         214   7   1 207   7 210   12        2    4
##   ABR036          14   7   0   7   7   7    0        2    4
##   ABR049         211   7   0 202   7 205   12        2    4
##   ABR070           9   7   0   7   7   7    0        2    4
##   ABRN01          10   7   0   7   7   7    0        2    4
##   AJ01             4   5   0   5   5   5    0        2    3
##   CUBLEX           5   3   0   3   3   3    0        1    3
##   HAB001          12   7   0   7   7   7    0        2    4
##   HAB0015          7   7   0   7   7   7    0        2    4
##   HAB002           8   7   0   7   7   7    0        2    4
##   HBT01            7   6   0   6   6   6    0        2    4
##   HBT02            5   6   0   6   6   6    0        2    4
##   HBT03            3   3   0   3   3   3    0        2    3
##   HBT05            7   7   0   7   7   7    0        2    4
##   HBT06            7   7   0   7   7   7    0        2    4
##   HBT07            4   3   0   3   3   3    0        2    3
##   HBT08            7   7   0   7   7   7    0        2    4
##   HOB103           2   2   0   2   2   2    0        1    2
##   MAR036         200   5   1 203   5 203   13        1    3
##   MIB001         214   7   0 209   7 209   11        2    4
##   MWRA059        797   5   0   5   5   5  594        1    3
##   MWRA061          5   5   0   5   5   5    0        1    3
##   MWRA070        705   6   0   6   6   6  277        2    4
##   MWRA172        640   2   0   2   2   2  160        0    2
##   MYR071         202   7   1 212   7 213   11        2    4
##   MYR39            2   1   0   1   1   1    0        0    1
##   SHBTRIB          4   3   0   3   3   3    0        1    3
##   SWBB            10   7   0   7   7   7    0        2    4
##   UPL001         194   7   1 206   7 208   12        2    4

From the table we can see which sites have a large number of values for certain Characteristics.

FCOLI is of interest and site WMRA070 has a fairly large number of values.

## subset for site and FCOLI results
dFcoli <- filter(d, d$CharacteristicID == "FCOLI" & d$LocationID == "MWRA070")

## add a year column to look at data by year


dFcoli <- 
      mutate(dFcoli,
             year =  year(strptime(dFcoli$Datetime, "%m/%d/%Y %H:%M"))
        )
#dFcoli$year <- factor(dFcoli$year)

What does our data spread look like?

summary(dFcoli$ResultValue)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       5     435    1080    6071    3600  313000

Lets see what FCOLI at site WMRA070 looks like over time on a time series plot.

dplot <- dFcoli %>% group_by(year) %>% summarise(avg=mean(ResultValue))

g <- ggplot(dplot, aes(year, avg))
g + geom_point() + geom_smooth(method= "lm")

Alternate plot with log transform on the yearly avg

dFcoli["ResultValue"] <- log(dFcoli["ResultValue"])
dplot <- dFcoli %>% group_by(year) %>% summarise(avg=mean(ResultValue))
g <- ggplot(dplot, aes(year, avg))
g + geom_point() + geom_smooth(method = "lm")

With these graphs we can see an indication that Fcoli is going down over time at this site. The log transform did not appear to help us any.

Lets taek a step back and look at Fcoli for all the sites in the report, not just site WMRA070

reportSitesFcoli <- dplyr::filter(d, d$CharacteristicID == "FCOLI" & d$LocationID %in% reportLocations)

## add a year column to look at data by year

reportSitesFcoli <- 
      mutate(reportSitesFcoli,
             year =  year(strptime(reportSitesFcoli$Datetime, "%m/%d/%Y %H:%M"))
        )
#reportSitesFcoli <- dplyr::filter(!is.na(reportSitesFcoli))

summary(reportSitesFcoli$ResultValue)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      1.9     95.0    400.0   4360.0   1400.0 800000.0
allsites <- reportSitesFcoli %>% group_by(year) %>% summarise(avg=mean(ResultValue))

The time series plot of Fcoli in all sites in the report looks like this:

##plot with linear model line to see direction of results over time
allg <- ggplot(allsites, aes(year, avg))
allg + geom_point() + geom_smooth(method = "lm")

Lets take another step back and look at Fcoli for all sites in this data.

## subset for site and FCOLI results
allFcoli <- filter(dataRaw, dataRaw$CharacteristicID == "FCOLI")

## add a year column to look at data by year

allFcoli <- 
      mutate(allFcoli,
             year =  year(strptime(allFcoli$Datetime, "%m/%d/%Y %H:%M"))
        )

summary(allFcoli$ResultValue)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       1      15      95    4202     501 4100000       2
alldata <- allFcoli %>% group_by(year) %>% summarise(avg=mean(ResultValue))


##plot with linear model line to see direction of results over time
allg <- ggplot(alldata, aes(year, avg))
allg + geom_point() + geom_smooth(method = "lm")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

All plots show a downward slowing line with the linear model. This would indicate a decrease in the amount of Fcoli in the river system over time. The last plot is not as clear as to the slope of the line and more investigation into what what was happening at some of the sites included in the data may be needed.