This is an attempt to QA WQN data to accomplish 4 steps in an automated fashion:
library(readxl)
library(janitor)
library(plyr)
library(lubridate)
library(writexl)
library(tidyverse)
library(tidylog)
wqn_df <- read_excel('PANTN_WY19_Q1_data.xlsx', sheet = 'Sheet1') %>%
janitor::clean_names() %>%
mutate(
final_amount = as.numeric(final_amount))
This is what the dataframe looks like after import. There are 6,524 rows and 19 columns.
| collector | wqn | station_name | usgs_gauge | sampling_reason_code | standard_analysis_or_suite | date_collected | time_collected | sequence_number | quality_assurance_type_desc | test_code | test_short_desc | test_long_desc | reading_indicator_code | final_amount | final_amount_units_abbrev | quality_code | standard_comment_desc | free_form_comment |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SRBC | WQN0201 | SUSQUEHANNA RVR | 01576000 | 1 | 012 | 2018-10-09 08:30:00 | 08:30 | 287 | NULL | 00410 | ALKALINITY | ALKALINITY AS CaCO3 @ pH 4.5 | NULL | 49.200 | mg/L | NULL | NULL | NULL |
| SRBC | WQN0201 | SUSQUEHANNA RVR | 01576000 | 1 | 012 | 2018-10-09 08:30:00 | 08:30 | 287 | NULL | 01105A | ALUMINUM T | ALUMINUM, TOTAL (WATER & WASTE) BY ICP | NULL | 1160.000 | ug/L | NULL | Currently not certified for drinking water | NULL |
| SRBC | WQN0201 | SUSQUEHANNA RVR | 01576000 | 1 | 012 | 2018-10-09 08:30:00 | 08:30 | 287 | NULL | 00608A | AMMONIA-N D | AMMONIA DISSOLVED AS NITROGEN | NULL | 0.021 | mg/L | NULL | NULL | Dissolved result > 10% Relative Difference of total result. |
# reshape to wide
wqn_df_wide <-
wqn_df %>%
# create unique column for sample ID
unite(unique, collector, wqn, sampling_reason_code, sequence_number,
quality_assurance_type_desc, date_collected) %>%
#create unique column for analyte
unite(analyte, test_code, test_short_desc, final_amount_units_abbrev) %>%
#select columns to spread
select(unique, analyte, final_amount) %>%
#make wide
spread(analyte, final_amount) %>%
# separate unique identifier back into original columns
separate(unique, c('collector', 'wqn', 'sampling_reason_code', 'sequence_number',
'quality_assurance_type_desc', 'date_collected'), sep = "_") %>%
#clean col names
janitor::clean_names() %>%
mutate(
# number of NON NA field params
field_num = rowSums(!is.na(cbind(
f00061_stream_flow_cfs, f00094_specific_con_umhos_cm, f00300_oxygen_fld_mg_l,
f00301_do_percent_field_percent, f00405_p_h_fld_p_h_units, f82078_turbidity_fi_ntu))),
# number of NA field params
field_na = rowSums(is.na(cbind(
f00061_stream_flow_cfs, f00094_specific_con_umhos_cm, f00300_oxygen_fld_mg_l,
f00301_do_percent_field_percent, f00405_p_h_fld_p_h_units, f82078_turbidity_fi_ntu))),
# number of NON NA lab params
lab_num = rowSums(!is.na(cbind(
x00095_spc_25_0_c_umhos_cm, x00403_p_h_p_h_units, x00403t_p_h_temp_c,
x00410_alkalinity_mg_l, x00530_t_susp_solid_mg_l, x00600a_nitrogentot_mg_l,
x00602a_dissolved_n_mg_l, x00608a_ammonia_n_d_mg_l, x00610a_ammonia_n_t_mg_l,
x00630a_no3_no2_n_mg_l, x00631a_no3_no2_n_d_mg_l, x00665a_phosphorus_t_mg_l,
x00666a_phosphorus_d_mg_l, x00671a_p_ortho_diss_mg_l, x00680_t_org_carbon_mg_l,
x00900_hardness_t_mg_l, x00916a_calcium_t_mg_l, x00927a_magnesium_t_mg_l,
x00929a_sodium_t_mg_l, x00937a_potassium_t_mg_l, x00940_chloride_ic_mg_l,
x00945_sulfate_ic_mg_l, x01007a_barium_t_ug_l, x01022k_boron_total_ug_l,
x01042h_copper_t_ug_l, x01045a_iron_t_ug_l, x01051h_lead_t_ug_l,
x01055a_manganese_t_ug_l, x01067a_nickel_t_ug_l, x01082a_strontium_t_ug_l,
x01092a_zinc_t_ug_l, x01105a_aluminum_t_ug_l, x01132a_lithium_t_ug_l,
x01147h_selenium_t_ug_l, x70300u_tds180_usgs_mg_l, x70507a_phos_t_ortho_mg_l,
x82550_osmo_pres_null, x99020_bromide_ug_l))),
# number of NA field params
lab_na = rowSums(is.na(cbind(
x00095_spc_25_0_c_umhos_cm, x00403_p_h_p_h_units, x00403t_p_h_temp_c,
x00410_alkalinity_mg_l, x00530_t_susp_solid_mg_l, x00600a_nitrogentot_mg_l,
x00602a_dissolved_n_mg_l, x00608a_ammonia_n_d_mg_l, x00610a_ammonia_n_t_mg_l,
x00630a_no3_no2_n_mg_l, x00631a_no3_no2_n_d_mg_l, x00665a_phosphorus_t_mg_l,
x00666a_phosphorus_d_mg_l, x00671a_p_ortho_diss_mg_l, x00680_t_org_carbon_mg_l,
x00900_hardness_t_mg_l, x00916a_calcium_t_mg_l, x00927a_magnesium_t_mg_l,
x00929a_sodium_t_mg_l, x00937a_potassium_t_mg_l, x00940_chloride_ic_mg_l,
x00945_sulfate_ic_mg_l, x01007a_barium_t_ug_l, x01022k_boron_total_ug_l,
x01042h_copper_t_ug_l, x01045a_iron_t_ug_l, x01051h_lead_t_ug_l,
x01055a_manganese_t_ug_l, x01067a_nickel_t_ug_l, x01082a_strontium_t_ug_l,
x01092a_zinc_t_ug_l, x01105a_aluminum_t_ug_l, x01132a_lithium_t_ug_l,
x01147h_selenium_t_ug_l, x70300u_tds180_usgs_mg_l, x70507a_phos_t_ortho_mg_l,
x82550_osmo_pres_null, x99020_bromide_ug_l))),
#field chem out of range flags
# SpC outside of 0-1000 range
spc_oor = case_when(
f00094_specific_con_umhos_cm < 0 ~ 'SpC_low',
f00094_specific_con_umhos_cm > 1000 ~ 'SpC_high',
TRUE ~ NA_character_),
# DO outside of 4-15 range
do_oor = case_when(
f00300_oxygen_fld_mg_l < 4 ~'DO_low',
f00300_oxygen_fld_mg_l > 15 ~ 'DO_high',
TRUE ~ NA_character_),
# pH outside of 4-10 range
pH_oor = case_when(
f00405_p_h_fld_p_h_units < 4 ~ 'pH_low',
f00405_p_h_fld_p_h_units > 10 ~ 'pH_high',
TRUE ~ NA_character_),
# turb outside of 0-1000 range
turb_oor = case_when(
f82078_turbidity_fi_ntu < 0 ~ 'turb_low',
f82078_turbidity_fi_ntu > 1000 ~ 'turb_high',
TRUE ~ NA_character_)
) %>%
#sort
arrange(collector, wqn, date_collected)
The resulting dataframe includes 180 unique sampling events and 59 columns. A subset of columns and rows are shown below, including the unique identifiers, a couple columns of lab/field parameters, and the columns we created for QA purposes.
| collector | wqn | sampling_reason_code | sequence_number | quality_assurance_type_desc | date_collected | x00095_spc_25_0_c_umhos_cm | f00405_p_h_fld_p_h_units | f82078_turbidity_fi_ntu | field_num | field_na | lab_num | lab_na | spc_oor | do_oor | pH_oor | turb_oor |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SRBC | WQN0201 | 1 | 287 | NULL | 2018-10-09 08:30:00 | 184.2 | 7.54 | 4 | 2 | 32 | 6 | |||||
| SRBC | WQN0201 | 1 | 847 | NULL | 2018-10-29 10:00:00 | 7.86 | 4 | 2 | 11 | 27 | ||||||
| SRBC | WQN0201 | 1 | 288 | NULL | 2018-11-08 08:30:00 | 164.0 | 7.50 | 4 | 2 | 32 | 6 |
field_oor <-
wqn_df_wide %>%
#select only columns of interest
select(collector, wqn, sampling_reason_code, sequence_number, quality_assurance_type_desc, date_collected,
f00010_water_temp_c, f00061_stream_flow_cfs, f00094_specific_con_umhos_cm, f00300_oxygen_fld_mg_l,
f00301_do_percent_field_percent, f00405_p_h_fld_p_h_units, f82078_turbidity_fi_ntu,
spc_oor, do_oor, pH_oor, turb_oor) %>%
#remove samples with no field chem flags
filter(!is.na(spc_oor) | !is.na(do_oor) | !is.na(pH_oor) | !is.na(turb_oor))
The 180 unique samples included in the original dataset contained only 2 records with at least one field chemistry measurement oor. It happened to be 2 occassions where DO > 15 mg/l. One was 15.2 (believable); one was 93.7 (sounds more like % saturation – suspect).
| collector | wqn | sampling_reason_code | sequence_number | quality_assurance_type_desc | date_collected | f00010_water_temp_c | f00061_stream_flow_cfs | f00094_specific_con_umhos_cm | f00300_oxygen_fld_mg_l | f00301_do_percent_field_percent | f00405_p_h_fld_p_h_units | f82078_turbidity_fi_ntu | spc_oor | do_oor | pH_oor | turb_oor |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SRBC | WQN0281 | 1 | 146 | NULL | 2018-12-11 15:00:00 | 3.07 | 16.8 | 541 | 15.2 | 8.25 | DO_high | |||||
| SRBC | WQN0301 | 15 | 806 | NULL | 2018-11-28 10:30:00 | 3.05 | 89600.0 | 142 | 93.7 | 7.43 | DO_high |
The ‘wqn_df_wide’ object we created earlier contains a count of the number of lab parameters - and lab na’s - for each sample. However, if we just want to subset the samples that contained no lab parameters, we can created a new object called ‘lab_chem_check’, apply another filter(), and select only columns of interest.
lab_chem_check <-
wqn_df_wide %>%
#subset sites with no lab chem
filter(lab_num == 0) %>%
#select only columns of interest
select(collector, wqn, sampling_reason_code, sequence_number,
quality_assurance_type_desc, date_collected, lab_num, lab_na)
The output identifies 4 samples that have no lab chemistry - a possible QA issue to follow up on?
| collector | wqn | sampling_reason_code | sequence_number | quality_assurance_type_desc | date_collected | lab_num | lab_na |
|---|---|---|---|---|---|---|---|
| USGS | WQN0269 | 1 | 607 | NULL | 2018-12-13 13:00:00 | 0 | 38 |
| USGS | WQN0278 | 1 | 606 | NULL | 2018-12-13 11:45:00 | 0 | 38 |
| USGS | WQN0280 | 1 | 605 | NULL | 2018-12-13 10:15:00 | 0 | 38 |
| USGS | WQN0317 | 14 | 158 | NULL | 2018-11-20 12:00:00 | 0 | 38 |
The ‘wqn_df_wide’ object we created earlier also contains a count of the number of field parameters - and field na’s - for each sample. Since Molly mentioned that 4 parameters are usually required, here we subset the number of samples that have < 4 field chemistry parameters to a new object called ‘field_chem_check’.
field_chem_check <-
wqn_df_wide %>%
#subset sites with < 4 field chem entries
filter(field_num < 4) %>%
#select only columns of interest
select(collector, wqn, sampling_reason_code, sequence_number,
quality_assurance_type_desc, date_collected, field_num, field_na)
The output identifies 24 samples that have no field chemistry.
| collector | wqn | sampling_reason_code | sequence_number | quality_assurance_type_desc | date_collected | field_num | field_na |
|---|---|---|---|---|---|---|---|
| SRBC | WQN0204 | 15 | 267 | Duplicate | 2018-11-07 09:30:00 | 0 | 6 |
| SRBC | WQN0214 | 1 | 280 | Blank | 2018-11-14 09:30:00 | 0 | 6 |
| SRBC | WQN0214 | 1 | 281 | Blank | 2018-11-14 09:30:00 | 0 | 6 |
However, if the sample was a duplicate or blank, no field chemistry was required. So, lets take another step to identify how many samples should have had field chem. We’ll use the group_by() functon followed by a pipe and the summarise() function to determine how many blanks, dups, etc.. The result is 11 blanks and 9 dups that don’t need field chem, but 4 ‘NULL’ samples that should have field chem – a possible QA issue to follow up on?
| quality_assurance_type_desc | Unique_Elements |
|---|---|
| Blank | 11 |
| Duplicate | 9 |
| NULL | 4 |
This last chunk will use the original dataframe (wqn_df; long format) to search for cases when reason code = 2 and sac does not = 021. We can first use the unique() function to search for the unique elements of reason and sac codes. It’s apparent that this dataset doesn’t contain either, but the code that creates the ‘sac_conflict’ object will identify the issue if/when it occurs.
unique(wqn_df$sampling_reason_code) # "1" "15" "14"
unique(wqn_df$standard_analysis_or_suite) # "012" "NULL" "874" "612"
sac_conflict <-
wqn_df %>%
filter(sampling_reason_code == 2 & standard_analysis_or_suite != '021')
We can build a spreadsheet called ‘wqn_qa.xlsx’ containing all the output using the writexl package, write_xlsx() function to our working directory. We can include 4 different tabs with our output from the steps above.
write_xlsx(list(wqn_df_wide = wqn_df_wide,
field_oor = field_oor,
field_chem_check = field_chem_check,
lab_chem_check = lab_chem_check,
sac_conflict = sac_conflict),
"wqn_qa.xlsx")