library(tidyverse)
library(sf)Gaps in service area boundary data for HOAs
Library
Look at service areas data from SDWA data
service_areas <- read_csv("./SDWA_latest_downloads/SDWA_SERVICE_AREAS.csv")Rows: 416170 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): SUBMISSIONYEARQUARTER, PWSID, SERVICE_AREA_TYPE_CODE, IS_PRIMARY_SE...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Meaning of PWS according to EPA: “For example, in the SDWA_PUB_WATER_SYSTEMS.csv file, each row describes a public water system in a particular quarter, and is uniquely identified by the values of the key fields SUBMISSIONYEARQUARTER and PWSID. Site visits, violations, and so on for that same system and quarter are identified by the same values of SUBMISSIONYEARQUARTER and PWSID in the other files.”
facilities <- read_csv("SDWA_latest_downloads/SDWA_PUB_WATER_SYSTEMS.csv")Service area code types
DC Daycare Center
DI Dispenser
HA Homeowners Association
HM Hotel/Motel
HR Highway Rest Area
IA Industrial/Agricultural
IC Interstate Carrier
IN Institution
MF Medical Facility
MH Mobile Home Park
MP Mobile Home Park - Principal Residence
MU Municipality
OA Other Area
ON Other Non-transient Area
OR Other Residential
OT Other Transient Area
PA Recreation Area
RA Residential Area
RE Retail Employees
RS Restaurant
SC School
SI Sanitary Improvement District
SK Summer Camp
SR Secondary Residences
SS Service Station
SU Subdivision
WB Water Bottler
WH Wholesaler of Water
Looking at HOAs
hoa <- service_areas %>%
filter(SERVICE_AREA_TYPE_CODE == "HA" | SERVICE_AREA_TYPE_CODE == "SU") %>%
distinct()
table(hoa$IS_PRIMARY_SERVICE_AREA_CODE)
N Y
6 5032
Total HOA+Subdivisions: 5312
HOA is for service areas that has PWS ID, and facilities are public water systems.
pws_hoa <- hoaNumber of public water systems in Hydroshare data in tiers
hyd <- st_read("C:/Users/kshit/OneDrive/Desktop/PhD projects/water/temm.gpkg")%>%
st_drop_geometry()Reading layer `temm' from data source
`C:\Users\kshit\OneDrive\Desktop\PhD projects\water\temm.gpkg'
using driver `GPKG'
replacing null geometries with empty geometries
Simple feature collection with 49424 features and 21 fields (with 3451 geometries empty)
Geometry type: GEOMETRY
Dimension: XY
Bounding box: xmin: -174.3233 ymin: 19.064 xmax: 173.0331 ymax: 71.34019
Geodetic CRS: WGS 84
hyd %>%
group_by(tier) %>%
summarise(count = n())# A tibble: 4 × 2
tier count
<dbl> <int>
1 1 17645
2 2 11079
3 3 17249
4 NA 3451
Looking at Mobile Homes
Mobile homes in PWS data
mobile <- service_areas %>%
filter(SERVICE_AREA_TYPE_CODE == "MH" | SERVICE_AREA_TYPE_CODE == "MP") %>%
distinct()
table(mobile$IS_PRIMARY_SERVICE_AREA_CODE)
N Y
178 16160
pws_mobile <- mobiletable(df$state_code)
AR AZ CA CT IL KS MO NC NJ NM OK PA TX UT WA
670 673 2794 435 1115 780 285 491 570 530 708 1721 4437 486 1740
hydroshare_mobile <- df[grepl("MH|MP", df$service_area_type_code), ]nrow(hydroshare_mobile)[1] 1610
Filtering mobile homes in Tier 1 Hydroshare data
pws_mobile <- pws_mobile %>%
mutate(STATE_CODE = substring(PWSID, 1, 2)) %>%
filter(STATE_CODE %in% t1_states)table(pws_mobile$STATE_CODE)
AR AZ CA CT IL KS MO NC NJ NM OK PA TX UT
109 618 630 79 317 148 337 2429 192 227 457 1652 1206 35
not_covered_mbl_pwsid <- setdiff(pws_mobile$PWSID, hydroshare_mobile$pwsid)
states_not_covered_mbl <- lapply(not_covered_mbl_pwsid, function(x) substr(x, 1, 2))table(unlist(states_not_covered_mbl)) %>% as.data.frame() %>% arrange(desc(Freq)) Var1 Freq
1 NC 2367
2 PA 1189
3 TX 795
4 AZ 480
5 OK 455
6 MO 337
7 IL 317
8 CA 286
9 NM 153
10 KS 147
11 NJ 113
12 AR 104
13 CT 47
14 UT 28
There is a lot of missingness in mobile home in the states.
mobile_pws_by_state <- table(pws_mobile$STATE_CODE) %>% as.data.frame()
colnames(mobile_pws_by_state) <- c("Var1", "total")table_mobile <- table(unlist(states_not_covered_mbl)) %>% as.data.frame() %>% arrange(desc(Freq)) %>%
filter(Var1 %in% t1_states) %>%
inner_join(mobile_pws_by_state) %>%
mutate(pct_missing = round(Freq/total*100, 2))Joining with `by = join_by(Var1)`
# Reorder the levels of the Term factor based on the Frequency variable
table_mobile$Var1 <- reorder(table_mobile$Var1, table_mobile$Freq)ggplot(table_mobile, aes(x = Freq, y = Var1)) +
geom_col(fill = "steelblue") +
labs(title = "Number of Mobile Homes missing in Hydroshare data (Tier 1 states)", x = "", y = "")+
geom_text(aes(label = Freq), hjust = -0.1, color = "black")+
scale_x_continuous(limits = c(0, 2500))+
theme_minimal()Percentage missingness
# Reorder the levels of the Term factor based on the Frequency variable
table_mobile$Var1 <- reorder(table_mobile$Var1, table_mobile$pct_missing)
ggplot(table_mobile, aes(x = pct_missing, y = Var1)) +
geom_col(fill = "steelblue") +
labs(title = "Percentage of Mobile homes missing in Hydroshare data (Tier 1 states)", x = "", y = "")+
geom_text(aes(label = pct_missing), hjust = -0.1, color = "black")+
scale_x_continuous(limits = c(0, 105))+
theme_minimal()Missing data in other Tiers in other states
HOA
hydroshare_others <- setdiff(st_read("C:/Users/kshit/OneDrive/Desktop/PhD projects/water/temm.gpkg") %>% st_drop_geometry(), df)Reading layer `temm' from data source
`C:\Users\kshit\OneDrive\Desktop\PhD projects\water\temm.gpkg'
using driver `GPKG'
replacing null geometries with empty geometries
Simple feature collection with 49424 features and 21 fields (with 3451 geometries empty)
Geometry type: GEOMETRY
Dimension: XY
Bounding box: xmin: -174.3233 ymin: 19.064 xmax: 173.0331 ymax: 71.34019
Geodetic CRS: WGS 84
hydroshare_others_hoa <- hydroshare_others[grepl("HA|SU", hydroshare_others$service_area_type_code), ]pws_hoa_others <- setdiff(hoa, pws_hoa %>% select(-STATE_CODE))no_hoa_records_others <- setdiff(pws_hoa_others$PWSID, hydroshare_others_hoa$pwsid)
states_not_covered_hoas <- lapply(no_hoa_records_others, function(x) substr(x, 1, 2))
table(unlist(states_not_covered_hoas)) %>% as.data.frame() %>% arrange(desc(Freq)) Var1 Freq
1 FL 308
2 GA 244
3 IA 149
4 SD 73
5 NY 62
6 OR 42
7 MT 40
8 VA 36
9 NH 34
10 ID 28
11 MN 27
12 VI 26
13 ME 24
14 CO 11
15 AK 9
16 LA 7
17 WY 7
18 IN 5
19 MA 5
20 ND 5
21 NE 5
22 OH 4
23 MI 2
24 WV 2
25 04 1
26 KY 1
27 MD 1
table_hoa <- table(unlist(states_not_covered_hoas)) %>% as.data.frame() %>% arrange(desc(Freq)) %>%
filter(!Var1 %in% t1_states) %>%
filter(Var1 != "04")
# Reorder the levels of the Term factor based on the Frequency variable
table_hoa$Var1 <- reorder(table_hoa$Var1, table_hoa$Freq)ggplot(table_hoa, aes(x = Freq, y = Var1)) +
geom_col(fill = "steelblue") +
labs(title = "Number of HOAs missing in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
geom_text(aes(label = Freq), hjust = -0.1, color = "black")+
# scale_x_continuous(limits = c(0, 2500))+
theme_minimal()Number of HOAs in SDWA data in Tier 2 and 3 states
hoa_tier23 <- pws_hoa_others %>%
mutate(Var1 = substr(PWSID, 1, 2)) %>%
group_by(Var1) %>%
summarise(total = n())Missingness data compilation
missingness_others_hoa <- hoa_tier23 %>%
merge(table_hoa) %>%
mutate(pct_missing = round(Freq/total*100, 2))
missingness_others_hoa$Var1 <- reorder(missingness_others_hoa$Var1,
missingness_others_hoa$pct_missing)ggplot(missingness_others_hoa, aes(x = pct_missing, y = Var1)) +
geom_col(fill = "steelblue") +
labs(title = "Percentage of HOAs missing in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
geom_text(aes(label = pct_missing), hjust = -0.1, color = "black")+
# scale_x_continuous(limits = c(0, 2500))+
theme_minimal()Mobile
hydroshare_others_mbl <- hydroshare_others[grepl("MH|MP", hydroshare_others$service_area_type_code), ]pws_mbl_others <- setdiff(mobile, pws_mobile %>% select(-STATE_CODE))no_mbl_records_others <- setdiff(pws_mbl_others$PWSID, hydroshare_others_mbl$pwsid)
states_not_covered_mbls <- lapply(no_mbl_records_others, function(x) substr(x, 1, 2))
table(unlist(states_not_covered_mbls)) %>% as.data.frame() %>% arrange(desc(Freq)) Var1 Freq
1 FL 2102
2 SC 754
3 NY 731
4 VA 719
5 OH 467
6 LA 385
7 MD 384
8 GA 353
9 WI 283
10 MT 273
11 ID 263
12 OR 253
13 KY 237
14 IA 227
15 MN 215
16 MI 191
17 WV 186
18 AL 185
19 TN 176
20 IN 165
21 ME 154
22 NV 151
23 MS 149
24 WY 137
25 DE 117
26 CO 111
27 NE 87
28 NH 68
29 AK 67
30 ND 60
31 VT 55
32 SD 52
33 09 22
34 MA 21
35 RI 18
36 PR 7
37 10 6
38 02 3
39 04 2
40 08 2
41 05 1
42 06 1
43 AS 1
44 NN 1
table_mbl <- table(unlist(states_not_covered_mbls)) %>% as.data.frame() %>% arrange(desc(Freq)) %>%
filter(!Var1 %in% t1_states) %>%
filter(!Var1 %in% c("05", "06", "04", "08", "02", "10", "09", "NN", "PR"))
# Reorder the levels of the Term factor based on the Frequency variable
table_mbl$Var1 <- reorder(table_mbl$Var1, table_mbl$Freq)table(table_mbl$Var1)
AS RI MA SD VT ND AK NH NE CO DE WY MS NV ME IN TN AL WV MI MN IA KY OR ID MT
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
WI GA MD LA OH VA NY SC FL 02 04 05 06 08 09 10 NN PR
1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
ggplot(table_mbl, aes(x = Freq, y = Var1)) +
geom_col(fill = "steelblue") +
labs(title = "Number of mobile homes missing in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
geom_text(aes(label = Freq), hjust = -0.1, color = "black")+
scale_x_continuous(limits = c(0, 2200))+
theme_minimal()Number of mobile homes in SDWA data in Tier 2 and 3 states
mobile_tier23 <- pws_mbl_others %>%
mutate(Var1 = substr(PWSID, 1, 2)) %>%
group_by(Var1) %>%
summarise(total = n())Missingness data compilation
missingness_others_mbl <- mobile_tier23 %>%
merge(table_mbl) %>%
mutate(pct_missing = round(Freq/total*100, 2))
missingness_others_mbl$Var1 <- reorder(missingness_others_mbl$Var1,
missingness_others_mbl$pct_missing)ggplot(missingness_others_mbl, aes(x = pct_missing, y = Var1)) +
geom_col(fill = "steelblue") +
labs(title = "Percentage of mobile homes missing in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
geom_text(aes(label = pct_missing), hjust = -0.1, color = "black")+
# scale_x_continuous(limits = c(0, 2500))+
theme_minimal()Number of Tier 2 and Tier 3 records for other states for each category
other_states_df <- st_read("C:/Users/kshit/OneDrive/Desktop/PhD projects/water/temm.gpkg") %>%
filter(!state_code %in% t1_states)Reading layer `temm' from data source
`C:\Users\kshit\OneDrive\Desktop\PhD projects\water\temm.gpkg'
using driver `GPKG'
replacing null geometries with empty geometries
Simple feature collection with 49424 features and 21 fields (with 3451 geometries empty)
Geometry type: GEOMETRY
Dimension: XY
Bounding box: xmin: -174.3233 ymin: 19.064 xmax: 173.0331 ymax: 71.34019
Geodetic CRS: WGS 84
HOA
other_hoa <- other_states_df[grepl("HA|SU", other_states_df$service_area_type_code), ] %>%
group_by(state_code, tier) %>%
summarise(num = n()) %>%
st_drop_geometry()`summarise()` has grouped output by 'state_code'. You can override using the
`.groups` argument.
head(other_hoa)# A tibble: 6 × 3
# Groups: state_code [2]
state_code tier num
<chr> <dbl> <int>
1 AK 2 4
2 AK 3 36
3 AK NA 10
4 AL 2 1
5 AL 3 1
6 AL NA 34
# Create stacked bar chart
ggplot(other_hoa, aes(x = state_code, y = num, fill = as.factor(tier))) +
geom_bar(stat = "identity") +
labs(title = "HOAs in Tier 2 and 3 states", x = "", y = "count") +
coord_flip()+
theme_minimal()table(other_hoa$state_code)
AK AL CO FL GA GU HI IA ID IN KY LA MA ME MI MN MS MT ND NE NH NV NY OH OR RI
3 3 4 3 3 1 1 3 3 2 1 2 3 3 1 1 1 3 2 1 3 2 3 1 3 2
SC SD VA VI VT WV WY
1 3 3 1 2 1 3
data_hoa <- other_hoa %>%
filter(!tier == 1) %>%
filter(!str_detect(state_code, "\\d"))data_hoa2 <- data_hoa %>%
filter(tier == 2)ggplot(data_hoa, aes(x = state_code, y = num)) +
geom_bar(stat = "identity") +
geom_text(aes(label = num), hjust = -0.1, color = "black")+
scale_y_continuous(limits = c(0, 1000))+
theme(strip.background = element_blank(),
strip.text.x = element_blank())+
labs(title = "Number of HOAs in Tiers 2 and 3")+
coord_flip()+
theme_minimal()+
facet_wrap(~ tier, scales = "free_y", nrow = 5, ncol = 7)hoa_other_count <- other_hoa %>%
group_by(state_code) %>%
summarize(total_count = sum(num))
hoa_other_count$state_code <- reorder(hoa_other_count$state_code, hoa_other_count$total_count)
ggplot(hoa_other_count, aes(x = total_count, y = state_code)) +
geom_col(fill = "steelblue", width = 0.6) +
labs(title = "Total number of HOAs in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
geom_text(aes(label = total_count), hjust = -0.1, color = "black", size = 2.5)+
# scale_x_continuous(limits = c(0, 2500))+
theme_minimal()MObile
other_mbl <- other_states_df[grepl("MH|MP", other_states_df$service_area_type_code), ] %>%
group_by(state_code, tier) %>%
summarise(num = n()) %>%
st_drop_geometry()`summarise()` has grouped output by 'state_code'. You can override using the
`.groups` argument.
head(other_mbl)# A tibble: 6 × 3
# Groups: state_code [2]
state_code tier num
<chr> <dbl> <int>
1 AK 1 1
2 AK 2 2
3 AK 3 31
4 AK NA 2
5 AL 3 2
6 AL NA 2
Summary total count in other states
mbl_other_count <- other_mbl %>%
group_by(state_code) %>%
summarize(total_count = sum(num))
mbl_other_count$state_code <- reorder(mbl_other_count$state_code, mbl_other_count$total_count)
ggplot(mbl_other_count, aes(x = total_count, y = state_code)) +
geom_col(fill = "steelblue", width = 0.6) +
labs(title = "Total number of mobile homes in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
geom_text(aes(label = total_count), hjust = -0.1, color = "black", size = 2.5)+
theme(axis.text.x = element_text(size = 3))+
# scale_x_continuous(limits = c(0, 2500))+
theme_minimal()# Create stacked bar chart
ggplot(other_mbl, aes(x = state_code, y = num, fill = as.factor(tier))) +
geom_bar(stat = "identity") +
labs(title = "Mobile homes in Tier 2 and 3 states", x = "", y = "count") +
coord_flip()+
theme_minimal()data_mbl <- other_mbl %>%
filter(!tier == 1)
ggplot(data_mbl, aes(x = state_code, y = num)) +
geom_bar(stat = "identity") +
geom_text(aes(label = num), hjust = -0.1, color = "black")+
scale_y_continuous(limits = c(0, 1000))+
theme(strip.background = element_blank(),
strip.text.x = element_blank())+
labs(title = "Number of Mobile homes in Tiers 2 and 3")+
coord_flip()+
theme_minimal()+
facet_wrap(~ tier, scales = "free_y", nrow = 5, ncol = 8)Size distribution of HOAs
hoa_sizes <- service_areas %>%
filter(SERVICE_AREA_TYPE_CODE == "HA" | SERVICE_AREA_TYPE_CODE == "SU") %>%
inner_join(facilities) %>%
filter(SERVICE_CONNECTIONS_COUNT < 1000)Joining with `by = join_by(SUBMISSIONYEARQUARTER, PWSID, FIRST_REPORTED_DATE,
LAST_REPORTED_DATE)`
HOA summary
hoa_connections_state <- hoa_sizes %>%
group_by(STATE_CODE) %>%
summarise(total_connections = sum(SERVICE_CONNECTIONS_COUNT)) %>%
filter(!is.na(STATE_CODE))
hoa_connections_state$STATE_CODE <- reorder(hoa_connections_state$STATE_CODE,hoa_connections_state$total_connections)
ggplot(hoa_connections_state, aes(x = total_connections, y = STATE_CODE)) +
geom_col(fill = "steelblue", width = 0.6) +
labs(title = "Total HOA service connections by state (Tier 2 and 3 states)", x = "", y = "")+
geom_text(aes(label = total_connections), hjust = -0.1, color = "black", size = 2.5)+
scale_x_continuous(limits = c(0, 10000))+
theme_minimal()summary(hoa_sizes$SERVICE_CONNECTIONS_COUNT) Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 17.00 28.00 66.36 59.00 985.00
ggplot(hoa_sizes, aes(x = SERVICE_CONNECTIONS_COUNT)) +
geom_histogram(bins = 100)+
labs(title = "Distribution of number of service connections in HOAs")Mobile homes sizes
mbl_sizes <- service_areas %>%
filter(SERVICE_AREA_TYPE_CODE == "MH" | SERVICE_AREA_TYPE_CODE == "MP") %>%
inner_join(facilities) %>%
filter(SERVICE_CONNECTIONS_COUNT < 1000)Joining with `by = join_by(SUBMISSIONYEARQUARTER, PWSID, FIRST_REPORTED_DATE,
LAST_REPORTED_DATE)`
summary(mbl_sizes$SERVICE_CONNECTIONS_COUNT) Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 20.0 44.0 98.9 120.0 754.0
ggplot(mbl_sizes, aes(x = SERVICE_CONNECTIONS_COUNT)) +
geom_histogram(bins = 100)+
labs(title = "Distribution of number of service connections in Mobile homes")Mobile summary
mbl_connections_state <- mbl_sizes %>%
group_by(STATE_CODE) %>%
summarise(total_connections = sum(SERVICE_CONNECTIONS_COUNT))
mbl_connections_state$STATE_CODE <- reorder(mbl_connections_state$STATE_CODE,mbl_connections_state$total_connections)
ggplot(mbl_connections_state, aes(x = total_connections, y = STATE_CODE)) +
geom_col(fill = "steelblue", width = 0.6) +
labs(title = "Total mobile homes service connections by state (Tier 2 and 3 states)", x = "", y = "")+
geom_text(aes(label = total_connections), hjust = -0.1, color = "black", size=2.5)+
theme(axis.text.y = element_text(size = 2.5))+
scale_x_continuous(limits = c(0, 13000))+
theme_minimal()