Gaps in service area boundary data for HOAs

Library

library(tidyverse)
library(sf)

Look at service areas data from SDWA data

service_areas <- read_csv("./SDWA_latest_downloads/SDWA_SERVICE_AREAS.csv")
Rows: 416170 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): SUBMISSIONYEARQUARTER, PWSID, SERVICE_AREA_TYPE_CODE, IS_PRIMARY_SE...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Number of unique PWS in SDWA data

nrow(service_areas %>% group_by(PWSID) %>% summarize(count = n()))
[1] 372829

There are 372,829 unique PWS IDs.

Total number of facilities in SDWA data is 428,138.

facilities <- read_csv("SDWA_latest_downloads/SDWA_PUB_WATER_SYSTEMS.csv")
nrow(facilities)
[1] 428138

Types of service areas

unique(service_areas$SERVICE_AREA_TYPE_CODE)
 [1] "RA" "HM" "ON" "MF" "SC" "OA" "DC" "PA" "MH" "OT" "RS" "RE" "SS" "MP" "SU"
[16] "OR" "WH" "HR" "DI" "SI" "IN" "IA" "SK" "MU" "WB" "SR" "IC" "HA"

Service area code types

DC Daycare Center
DI Dispenser
HA Homeowners Association
HM Hotel/Motel
HR Highway Rest Area
IA Industrial/Agricultural
IC Interstate Carrier
IN Institution
MF Medical Facility
MH Mobile Home Park
MP Mobile Home Park - Principal Residence
MU Municipality
OA Other Area
ON Other Non-transient Area
OR Other Residential
OT Other Transient Area
PA Recreation Area
RA Residential Area
RE Retail Employees
RS Restaurant
SC School
SI Sanitary Improvement District
SK Summer Camp
SR Secondary Residences
SS Service Station
SU Subdivision
WB Water Bottler
WH Wholesaler of Water

Facilities (public water systems) that are Community Water Systems

cws <- facilities %>% 
  filter(PWS_TYPE_CODE == "CWS")

Total number of CWS

nrow(cws)
[1] 96158
nrow(facilities)
[1] 428138

We will look at public water systems not just CWS.

Looking at HOAs

hoa <- service_areas %>% 
  filter(SERVICE_AREA_TYPE_CODE == "HA") %>% 
  distinct()

table(hoa$IS_PRIMARY_SERVICE_AREA_CODE)

   N    Y 
   2 1246 

Public water systems that are HOAs.

pws_hoa <- merge(facilities, hoa)

178 HOAs in total.

Checking the representativeness of the HOAs in Hydroshare data

#read in Hydroshare data
df <- st_read("C:/Users/kshit/OneDrive/Desktop/PhD projects/water/temm.gpkg") %>% 
  filter(tier == 1) #filtering Tier 1 data
Reading layer `temm' from data source 
  `C:\Users\kshit\OneDrive\Desktop\PhD projects\water\temm.gpkg' 
  using driver `GPKG'
replacing null geometries with empty geometries
Simple feature collection with 49424 features and 21 fields (with 3451 geometries empty)
Geometry type: GEOMETRY
Dimension:     XY
Bounding box:  xmin: -174.3233 ymin: 19.064 xmax: 173.0331 ymax: 71.34019
Geodetic CRS:  WGS 84

States with Tier 1 data

table(df$state_code)

  AK   AL   AR   AZ   CA   CO   CT   DC   DE   FL   GA   HI   IA   ID   IL   IN 
   1    2  670  673 2794   40  435    1    1   18    5    1    2    7 1115    3 
  KS   KY   LA   MA   MD   ME   MI   MO   MS   MT   NC   NJ   NM   NV   NY   OH 
 780    1    2   10   11    1   18  285    1    1  491  570  530   11   23    1 
  OK   OR   PA   RI   SC   TN   TX   UT   VA   WA   WI   WV   WY 
 708    5 1721   32    2    1 4437  486    1 1740    2    5    1 

We will remove data where the count per state is lesser than 250.

tier1_count_by_state <- table(df$state_code) %>% as.data.frame()
colnames(tier1_count_by_state) <- c("state", "count")
t1_states_df <- tier1_count_by_state %>% 
  filter(count > 250)
t1_states <- t1_states_df$state

Removing data where majority of records aren’t Tier 1

# Turning off geom as it led to some unexpected errors in summarization
##simplifying data frame 
## Because of slowdown and some unexpected errors, I am turning off geom.
df <- df %>% select(-geom) %>% 
  as.data.frame() %>% 
  filter(state_code != "02" | state_code != "09" ) %>% 
  filter(state_code %in% t1_states)

Filtering HOAs in Tier 1 Hydroshare data

hydroshare_hoa <- df[grepl("HA", df$service_area_type_code), ]
nrow(hydroshare_hoa)
[1] 52

There are 52 HOAs in Tier 1 states.

Looking at the observations that HOAs in SDWA that aren’t covered in Hydroshare, and their geographic distribution for Tier 1 data.

Filtering PWS data for states with Tier 1 data

pws_hoa <- pws_hoa %>%
  filter(STATE_CODE %in% t1_states)

There are 32 records.

not_covered_hoas_pwsid <- setdiff(pws_hoa$PWSID, hydroshare_hoa$pwsid)

states_not_covered_hoas <- lapply(not_covered_hoas_pwsid, function(x) substr(x, 1, 2))

table(unlist(states_not_covered_hoas)) %>% as.data.frame() %>% arrange(desc(Freq))
  Var1 Freq
1   AZ    5
2   MO    5
3   NC    4
4   ID    1
5   IL    1
6   NY    1
7   OK    1
8   VT    1

For the Tier 1 states, there are very few HOAs that are not covered in Hydroshare data as compared to SDWA data.

Looking at Mobile Homes

Mobile homes in PWS data

mobile <- service_areas %>% 
  filter(SERVICE_AREA_TYPE_CODE == "MH" | SERVICE_AREA_TYPE_CODE == "MP") %>% 
  distinct()

table(mobile$IS_PRIMARY_SERVICE_AREA_CODE)

    N     Y 
  178 16160 
pws_mobile <- merge(facilities, mobile)
table(df$state_code)

  AR   AZ   CA   CT   IL   KS   MO   NC   NJ   NM   OK   PA   TX   UT   WA 
 670  673 2794  435 1115  780  285  491  570  530  708 1721 4437  486 1740 
hydroshare_mobile <- df[grepl("MH|MP", df$service_area_type_code), ]
nrow(hydroshare_hoa)
[1] 52

Filtering mobile homes in Tier 1 Hydroshare data

pws_mobile <- pws_mobile %>%
  filter(STATE_CODE %in% t1_states)
not_covered_mbl_pwsid <- setdiff(pws_mobile$PWSID, hydroshare_mobile$pwsid)

states_not_covered_mbl <- lapply(not_covered_mbl_pwsid, function(x) substr(x, 1, 2))
table(unlist(states_not_covered_mbl)) %>% as.data.frame() %>% arrange(desc(Freq))
   Var1 Freq
1    IL   26
2    09   17
3    MO   16
4    TX   14
5    AZ    8
6    NC    5
7    OK    3
8    CA    2
9    IN    2
10   MT    2
11   OR    2
12   AR    1
13   CO    1
14   FL    1
15   KS    1
16   LA    1
17   MD    1
18   MN    1
19   NE    1
20   OH    1
21   PA    1
22   UT    1

Mobile homes from these states show some missingness: Illinois, Missouri, Texas.

##Institutional

From the name, IN and OA for Institutions seem to consist of hospitals, rehabs, correctional facilities, airforce base, sheriff’s facility,

inst <- service_areas %>% 
  filter(SERVICE_AREA_TYPE_CODE == "IN" |SERVICE_AREA_TYPE_CODE == "OA") %>% 
  distinct()

table(inst$IS_PRIMARY_SERVICE_AREA_CODE)

    N     Y 
  303 25282 

Filtering Institutions in Tier 1 Hydroshare data

hydroshare_inst <- df[grepl("IN|OA", df$service_area_type_code), ]
nrow(hydroshare_inst)
[1] 335

There are 335 institutions in Tier 1 states.

Looking at the observations that institutions in SDWA that aren’t covered in Hydroshare, and their geographic distribution for Tier 1 data.

Filtering PWS data for states with Tier 1 data

Public water systems that are institutions

pws_inst <- merge(inst, facilities)
pws_inst <- pws_inst %>%
  filter(STATE_CODE %in% t1_states)

There are 276 records.

not_covered_inst_pwsid <- setdiff(pws_inst$PWSID, hydroshare_inst$pwsid)

states_not_covered_inst <- lapply(not_covered_inst_pwsid, function(x) substr(x, 1, 2))

table(unlist(states_not_covered_inst)) %>% as.data.frame() %>% arrange(desc(Freq))
   Var1 Freq
1    WA   69
2    PA   48
3    CA   33
4    TX   20
5    UT   17
6    MO   12
7    WI    9
8    MI    8
9    AZ    7
10   IL    7
11   NM    6
12   NJ    4
13   OK    4
14   SC    4
15   AR    3
16   FL    3
17   NC    3
18   MD    2
19   NY    1
20   OR    1
21   VT    1

For the Tier 1 states, there is some missingness in Washingtonm, Pennsylvania, California, Texas and Utah.

What is oA?

oa <- df[grepl("OA", df$service_area_type_code), ]