This document describes the process I used to assemble and merge lab and field data from Phosphorus grab sampling in 2015.

The first steps (parts 1-5) load and shape the lab and field data records into a table that is easy to input into the General Conversion template. The remaining steps (part 6) add missing data, flags, comments and other details directly into the General Conversion template. The result is a single Processed document that can be uploaded to the database.

library(dplyr)
library(ggplot2)
library(reshape2)
library(tidyr)
library(lubridate)
library(pander)
library(xtable)

1 Loading EPA lab data

Beginning in April, each month has two (rarely three) events. For each event there is either one lab report that combines all results, or two separate reports one for phos/Ophos data, the other for Chlorophyll.

First, the raw data files are read from Dropbox, and merged into one data frame.

setwd("\\\\psf/Home/Dropbox/")
#setwd("c:/Users/Katrina/Dropbox/")

# Assemble data into one dataframe
options(stringsAsFactors=FALSE)

Apr1 <- read.csv("Phosphorus/raw_grab/20150414/lab/PN15040023CHLA.csv", strip.white=TRUE)
Apr2 <- read.csv("Phosphorus/raw_grab/20150428/lab/PN15040050.csv", strip.white=TRUE)
May1 <- read.csv("Phosphorus/raw_grab/20150512/lab/PN15050020CHLA.csv", strip.white=TRUE)
May2 <- read.csv("Phosphorus/raw_grab/20150526/lab/PN15050041CHLA.csv", strip.white=TRUE)
Jun1 <- read.csv("Phosphorus/raw_grab/20150609/lab/PN15060012CHLA.csv", strip.white=TRUE)
Jun2 <- read.csv("Phosphorus/raw_grab/20150623/lab/15060040_CHLA.csv", strip.white=TRUE)
Jul1_phos <- read.csv("Phosphorus/raw_grab/20150707/lab/15070011.csv", strip.white=TRUE)
Jul1_chl <- read.csv("Phosphorus/raw_grab/20150707/lab/PN15070012CHLA.csv", strip.white=TRUE)
Jul2_phos <- read.csv("Phosphorus/raw_grab/20150721/lab/15070026.csv", strip.white=TRUE)
Jul2_chl <- read.csv("Phosphorus/raw_grab/20150721/lab/PN15070027_MRL_CHLA.csv", strip.white=TRUE)
Aug1_phos <- read.csv("Phosphorus/raw_grab/20150804/lab/15080002.csv", strip.white=TRUE)
Aug1_chl <- read.csv("Phosphorus/raw_grab/20150804/lab/PN15080004_CHLA.csv", strip.white=TRUE)
Aug2_phos <- read.csv("Phosphorus/raw_grab/20150818/lab/15080034.csv", strip.white=TRUE)
Aug2_chl <- read.csv("Phosphorus/raw_grab/20150818/lab/PN15080035_CHLA.csv", strip.white=TRUE)
Sep1_phos <- read.csv("Phosphorus/raw_grab/20150901/lab/15090004.csv", strip.white=TRUE)
Sep1_chl <- read.csv("Phosphorus/raw_grab/20150901/lab/PN15090005CHLA.csv", strip.white=TRUE)
Sep2_phos <- read.csv("Phosphorus/raw_grab/20150915/lab/15090019.csv", strip.white=TRUE)
Sep2_chl <- read.csv("Phosphorus/raw_grab/20150915/lab/PN15090020.csv", strip.white=TRUE)
Sep3_phos <- read.csv("Phosphorus/raw_grab/20150929/lab/15090037.csv", strip.white=TRUE)
Sep3_chl <- read.csv("Phosphorus/raw_grab/20150929/lab/PN15090038_CHLA.csv", strip.white=TRUE)
Oct1_phos <- read.csv("Phosphorus/raw_grab/20151013/lab/15100018.csv", strip.white=TRUE)
Oct1_chl <- read.csv("Phosphorus/raw_grab/20151013/lab/PN15100019CHLA.csv", strip.white=TRUE)
Oct2 <- read.csv("Phosphorus/raw_grab/20151027/lab/15100040.csv", strip.white=TRUE)
Nov1 <- read.csv("Phosphorus/raw_grab/20151110/lab/15110020.csv", strip.white=TRUE)

phos <- rbind(Apr1, Apr2, May1, May2, Jun1, Jun2, Jul1_phos, Jul1_chl, Jul2_phos, Jul2_chl, Aug1_phos,
              Aug1_chl, Aug2_phos, Aug2_chl, Sep1_phos, Sep1_chl, Sep2_phos, Sep2_chl, Sep3_phos,
              Sep3_chl, Oct1_phos, Oct1_chl, Oct2, Nov1)

write.csv(phos, file = "Phosphorus/raw_grab/Phos_lab_Apr_Nov_2015.csv")

2 Cleaning up Sample names and Column headers in EPA lab data

Some sample names in the raw results need correcting. Sites were mis-identified in early surveys (ex. MAR006 should be MAR 003), and there is inconsisency in the labels for duplicates. Some of the code here corrects those errors and others.

Also column names are changed to agree with Field data sheet and Conversion Templates.

phos <- phos %>%
  select(SAMPLE_NUMBER, COLLECTION_DATE, ANALYSIS_CODE, RESULT = COMBINATION_RESULT, ANALYSIS_UNIT)
# phos <- rbind_all(list(phos, chl))

#ALIASES:   
phos$ANALYSIS_CODE <- as.character(phos$ANALYSIS_CODE)
phos$ANALYSIS_CODE[phos$ANALYSIS_CODE == "CLPHYL"] <- "CHLA"
phos$ANALYSIS_CODE <- as.factor(phos$ANALYSIS_CODE)

#CHANGE MAR006 to MAR003; early samples had wrong name; correct some other sample names
phos$SAMPLE_NUMBER <- as.character(phos$SAMPLE_NUMBER)
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER == "MAR006"] <- "MAR003"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER == "MYR071-B"] <- "MYR071"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER == "MYR036"] <- "MAR036"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER == "ULPCTR"] <- "UPLCTR"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER == "ULPUPL"] <- "UPLUPL"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER == "MAR071"] <- "MYR071"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER %in% c("CHL_DUP","CHL-DUP", "Chl-dup", "CHL-dup", "CHL_Dup", "CHL-Dup")] <- "Chl_dup"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER %in% c("TP-DUP", "TP-Dup", "TP-dup")] <- "TP_dup"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER %in% c("OPHOS-DUP 1", "OPHOS-Dup 1", "OPHOS-dup 1","OPHOS-DUP1", "OPhos_dup1")] <- "Ophos_dup1"
phos$SAMPLE_NUMBER[phos$SAMPLE_NUMBER %in% c("OPHOS-DUP 2", "OPHOS-Dup 2", "OPHOS-dup 2","OPHOS-DUP2", "OPhos_dup2")] <- "Ophos_dup2"
phos$SAMPLE_NUMBER <- as.factor(phos$SAMPLE_NUMBER)

phos$COLLECTION_DATE <- as.Date(phos$COLLECTION_DATE, "%m/%d/%y")

3 Loading field data

setwd("\\\\psf/Home/Dropbox/")
#setwd("c:/Users/Katrina/Dropbox/")

Apr1F <- read.csv("Phosphorus/raw_grab/20150414/field/Field_data_20150414.csv", strip.white=TRUE)
Apr2F <- read.csv("Phosphorus/raw_grab/20150428/field/Field_data_20150428.csv", strip.white=TRUE)
May1F <- read.csv("Phosphorus/raw_grab/20150512/field/Field_data_20150512.csv", strip.white=TRUE)
May2F <- read.csv("Phosphorus/raw_grab/20150526/field/Field_data_20150526.csv", strip.white=TRUE)
Jun1F <- read.csv("Phosphorus/raw_grab/20150609/field/Field_data_20150609.csv", strip.white=TRUE)
Jun2F <- read.csv("Phosphorus/raw_grab/20150623/field/Field_data_20150623.csv", strip.white=TRUE)
Jul1F <- read.csv("Phosphorus/raw_grab/20150707/field/Field_data_20150707.csv", strip.white=TRUE)
Jul2F <- read.csv("Phosphorus/raw_grab/20150721/field/Field_data_20150721.csv", strip.white=TRUE)
Aug1F <- read.csv("Phosphorus/raw_grab/20150804/field/Field_data_20150804.csv", strip.white=TRUE)
Aug2F <- read.csv("Phosphorus/raw_grab/20150818/field/Field_data_20150818.csv", strip.white=TRUE)
Sep1F <- read.csv("Phosphorus/raw_grab/20150901/field/Field_data_20150901.csv", strip.white=TRUE)
Sep2F <- read.csv("Phosphorus/raw_grab/20150915/field/Field_data_20150915.csv", strip.white=TRUE)
Sep3F <- read.csv("Phosphorus/raw_grab/20150929/field/Field_data_20150929.csv", strip.white=TRUE)
Oct1F <- read.csv("Phosphorus/raw_grab/20151013/field/Field_data_20151013.csv", strip.white=TRUE)
Oct2F <- read.csv("Phosphorus/raw_grab/20151027/field/Field_data_20151027.csv", strip.white=TRUE)
Nov1F <- read.csv("Phosphorus/raw_grab/20151110/field/Field_data_20151110.csv", strip.white=TRUE)

field <- rbind(Apr1F, Apr2F, May1F, May2F, Jun1F, Jun2F, Jul1F, Jul2F, Aug1F,
              Aug2F, Sep1F, Sep2F, Sep3F,Oct1F, Oct2F, Nov1F)

field$SampleID[field$SampleID %in% c("CHL_dup")] <- "Chl_dup"
field$SampleID[field$SampleID == "MAR006"] <- "MAR003"
field$SampleID[field$SampleID == "MAR006"] <- "MAR003"
field$SampleID[field$SampleID == "MYR071-B"] <- "MYR071"
field$SampleID[field$SampleID == "MYR071_B"] <- "MYR071"
field$SampleID[field$SampleID == "MYR036"] <- "MAR036"
field$SampleID[field$SampleID == "ULPCTR"] <- "UPLCTR"
field$SampleID[field$SampleID == "ULPUPL"] <- "UPLUPL"
field$SampleID[field$SampleID == "MAR071"] <- "MYR071"
field$SampleID[field$SampleID == "Orthophos_dup2"] <- "Ophos_dup2"

field$Date <- as.Date(field$Date, "%m/%d/%Y")

write.csv(field, file = "Phosphorus/raw_grab/Phos_field_Apr_Nov_2015.csv")

4 Combining lab and field data

Transform lab data to wide format, and then join to field data.

setwd("\\\\psf/Home/Dropbox/")
#setwd("c:/Users/Katrina/Dropbox/")

phos_spread <-  phos %>%
  filter(ANALYSIS_CODE %in% c("TP", "OPHOS", "CHLA")) %>%
  mutate(Date = COLLECTION_DATE, SampleID = SAMPLE_NUMBER)%>%
  spread(ANALYSIS_CODE, RESULT, fill = NA)

phos_spread$Date <- as.Date(phos_spread$Date, "%m/%d/%y")

merged <-  right_join(phos_spread, field)
## Joining by: c("Date", "SampleID")
## Warning in right_join_impl(x, y, by$x, by$y): joining factor and character
## vector, coercing into character vector
write.csv(merged, file = "Phosphorus/raw_grab/Merged_Apr_Nov_2015.csv")

5 Exporting assembled data

The assembled data set is written to a .csv file, which can be downloaded here:
It is ready to be copied into the General conversion template, where some additional work is required.

6 Additional work done in the Conversion template

6.1 Consolidating Field Dup samples and final station name corrections

Field duplicate sample (TP_dup, CHL_dup, Turbidity_dup, and OPhos_dup) need to be cleaned up and consolidated, such that all TP_dups have turbidity values, and any dups at the same site on a given day get recorded in one row. In addition a few StationIDs did not agree with LocationIDs (two different fields in field sheets; one corrected in code, the other not). These were changed manually. See the Main sheet of the Processed file for the details of changes made.

6.2 Adding flags and comments

A relatively small number of samples have flags (<25) and a handful have additional comments. These are also most easily added manually in the conversion template. The comments and flags can be found in the table of all results exported above: Phosphorus/raw_grab/Phos_lab_Apr_Nov_2015.csv.

6.3 Correcting units and change NDs

First, values for parameters reported in ug/L (TP, PO4) were divided by 1000 to get value in mg/L.

Second, ND values were changed to qualified values (ex. “<.005”), according to parameter, using corrected units.

6.4 Adding necessary locations, flags, and methods to the database

Flags introduced in step 6.2 (J, B) were added to the database Flags table with their complete EPA descriptions. Methods for EPA lab were added. And several locations were added UPLCTR, WEPCTR, UPLUPL, WIPCTR, SPPCTR.

6.3 Manually changing method for temperature

Within a given survey, some temperature is taken with manual thermometer and others with sonde. Assigned a single method. Change others manually in EXPORT_NUMERIC tab.

7 Lab Results: Graphs and Tables

The following graphs and tables are generated from the lab results (phos) above.

7.1 TP and CHLA

#remove sample numbers not needed for graphs
phosGraph <- phos %>%
  filter(ANALYSIS_CODE =="TP" |ANALYSIS_CODE == "CHLA" ) %>%
  filter(!(SAMPLE_NUMBER %in% c("CHL_DUP","CHL_LAB_DUP","TP-DUP", "LABDUP", "CHL-DUP", "TP-blank-new",
                               "TP-blank-old", "TP-Dup",  "HOB002-2", "ABR-C0MP-BLANK",
                               "ABR-COMP-1", "ABR-COMP-2", "ABR-COMP-3", "Chl-dup", "CHL-dup", "CHL_Dup", "CHL-Dup", "COMP-LAB-DUP", "ABR006-COMP", "TP-blank", "CHL-LAB-DUP", "COMP-LABDUP-2", "CHL-L-DUP",
                               "CPMP-LABDUP2", "CPMP-LABDUP3", "CPMP-LABDUP1", "COMP-LABDUP-1")) & RESULT != "ND")

phosGraph$RESULT <- as.numeric(as.character(phosGraph$RESULT))
phosGraph$COLLECTION_DATE <- as.Date(phosGraph$COLLECTION_DATE, "%m/%d/%y")
  
g <- ggplot(phosGraph, aes(COLLECTION_DATE, RESULT, color = ANALYSIS_CODE, 
                           group=interaction(ANALYSIS_CODE, SAMPLE_NUMBER), 
                           fill= ANALYSIS_CODE)) 

g + geom_line(size = 1) + geom_point() + facet_wrap(~ SAMPLE_NUMBER, ncol = 3) + 
  scale_color_manual(values = c("darkolivegreen4",  "violetred3")) +
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(c(0,100))
## geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).

TP results:

#For tables, organize results to display by site and date.
TP_to_date <- tbl_df(phos) %>%
  filter(ANALYSIS_CODE == "TP") 
OPHOS_to_date <- tbl_df(phos) %>%
  filter(ANALYSIS_CODE == "OPHOS")
CHL_to_date <- tbl_df(phos) %>%
  filter(ANALYSIS_CODE == "CHLA") 

PHOS_to_date <- tbl_df(phos) %>%
  filter(ANALYSIS_CODE %in% c("TP", "OPHOS", "CHLA"))

TP_to_date <- dcast(TP_to_date, SAMPLE_NUMBER ~ COLLECTION_DATE, value.var = "RESULT")
OPHOS_to_date <- dcast(OPHOS_to_date, SAMPLE_NUMBER ~ COLLECTION_DATE, value.var = "RESULT")
CHL_to_date <- dcast(CHL_to_date, SAMPLE_NUMBER ~ COLLECTION_DATE, value.var = "RESULT")

#write.csv(TP_to_date, file = "Phosphorus/raw_grab/TP_to_date.csv")

tab <- xtable(TP_to_date, align = rep("c", dim(TP_to_date)[2] + 1))
print(tab, type = "html")
SAMPLE_NUMBER 2015-04-14 2015-04-28 2015-05-12 2015-05-26 2015-06-09 2015-06-23 2015-07-07 2015-07-21 2015-08-04 2015-08-18 2015-09-01 2015-09-15 2015-09-29 2015-10-13 2015-10-22 2015-10-27 2015-10-29 2015-10-30 2015-11-05 2015-11-10
1 ABR-C0MP-BLANK 15.2
2 ABR-COMP-1 85
3 ABR-COMP-2 41
4 ABR-COMP-3 20
5 ABR006 22 27 21 29 28 44 33 26 9.5 30 19 22 22 20 13.1 17
6 ABR006-COMP 14.9
7 ALB006 69 50 37 41 51 66 58.7 63 50.6 63 53 39 55 53 35.2 36
8 COMP-LABDUP-1 83
9 COMP-LABDUP-2 41
10 COMP-SPLIT-BLANK ND ND
11 CPMP-LABDUP1 15.5
12 CPMP-LABDUP2 13.6
13 CPMP-LABDUP3 14.1
14 HOB002 12 16 8.1 18 11.0 22 16 34 554 44 36 24 37 350 33.3 25
15 HOB002-2 110
16 MAR003 25 28 14 28 37 34 26 46 37.2 29 28 54 40 36 15.7 43
17 MAR036 29 25 55 55 54 36 40 23.3 24 20 33 42 63 23.6 30
18 MIB0045 34 41 35 25 36 41 26.2 17 15.7 14 12 52 17 160 23.5 25
19 MYR071 14 14 11.0 22 20 13 15 13.9 14 11 11 10 13 12.8 13
20 MYR33 26 24 17 28 27 40 50 62 42.1 39 38 62 40 27 16.8 40
21 MYR43 24 24 24 28 33 26 54 21.5 32 36 41 35 35 17.5 22
22 SPPCTR 23 27 21 20 24 18 16.8 12 ND 13 15 14 17 23 18.4 23
23 TP-blank ND
24 TP-blank-new ND
25 TP-blank-old ND
26 TP_dup 27 7.3 8.4 35 21 57 57 25.4 46 21 23 8.7 160 18.6 37
27 UPLCTR 15 13 6.2 8.2 17 14 11 9.1 ND 9.6 7.2 8.7 9.2 16 10 14
28 UPLUPL 26 26 34 28 24 25 18.3 25
29 WEPCTR 15 17.0 6.9 7.4 21 16 17 21 31.7 27.3 31 19 28 30 21.8 29.3
#

Orthophosphate results:

tab2 <- xtable(OPHOS_to_date, align = rep("c", dim(OPHOS_to_date)[2] + 1))
print(tab2, type = "html")
SAMPLE_NUMBER 2015-04-28 2015-05-26 2015-06-23 2015-07-21 2015-08-18 2015-09-15 2015-10-13
1 ABR006 ND ND ND ND ND ND ND
2 ALB006 ND 7.8 8.1 15.2 18 ND 5
3 HOB002 ND 5.15 ND ND ND ND
4 HOB002-2 31
5 MAR003 ND ND ND ND ND ND ND
6 MAR036 ND ND ND ND ND ND ND
7 MIB0045 5.2 ND 6.1 ND ND ND 66
8 MYR071 ND ND ND ND ND ND ND
9 MYR33 ND ND ND ND ND ND ND
10 MYR43 ND ND ND ND ND ND
11 Ophos_dup1 5.0 ND ND ND ND ND ND
12 Ophos_dup2 ND ND ND ND 7.83 ND ND
13 SPPCTR ND ND ND ND ND ND ND
14 UPLCTR ND ND ND ND ND ND ND
15 UPLUPL ND ND ND
16 WEPCTR ND ND ND ND ND ND ND

Chlorophyll results:

tab3 <- xtable(CHL_to_date, align = rep("c", dim(CHL_to_date)[2] + 1))
print(tab3, type = "html")
SAMPLE_NUMBER 2015-04-14 2015-04-28 2015-05-12 2015-05-26 2015-06-09 2015-06-23 2015-07-07 2015-07-21 2015-08-04 2015-08-18 2015-09-01 2015-09-15 2015-09-29 2015-10-13
1 CHL-L-DUP 16.59
2 CHL-LAB-DUP 5.27
3 Chl_dup 16.07 11.20 9.32 12.34 20.02 17.17 15.37 5.34 3.04 15.97 19.85 21.4 24.74
4 HOPCTR 8.34 6.56
5 LABDUP 5.06
6 LOLCTR 7.65 8.57
7 MAR003 2.47 21.69 34.04 55.91 21.22 26.45 15.1 18.9 12.61 12.38 17.06 47.48 21.68 20.53
8 MYR33 4.09 22.17 36.94 39.46 20.62 23.65 21.6 32.99 11.52 17.98 24.81 52.41 18.15 14.07
9 MYR43 3.68 13.36 28.09 30.73 11.18 12.07 16.73 30.97 19.68 20.59 18.41 13.45 24.01 15.84
10 SPPCTR 14.12 35.63 9.42 10.17 8.71 11.94 14.1 7.71 5.51 3.28 19.77 7.42 5.9 17.82
11 UPLCTR 2.13 9.42 6.57 16.31 12.47 8.34 8.52 7.56 5.86 8.76 12.38 7.61 6.56 13.84
12 UPLUPL 1.76 4.55 32.96 16.48 28.99 16.38 17.52 15.68
13 WEPCTR 5.06 14.13 6.60 5.31 5.42 8.88 13.81 15.02 16.71 11.55 12.56 26.11 25.46
14 WIPCTR 12.32