Gaps in service area boundary data for HOAs

Library

library(tidyverse)
library(sf)

Look at service areas data from SDWA data

service_areas <- read_csv("./SDWA_latest_downloads/SDWA_SERVICE_AREAS.csv")
Rows: 416170 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): SUBMISSIONYEARQUARTER, PWSID, SERVICE_AREA_TYPE_CODE, IS_PRIMARY_SE...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Meaning of PWS according to EPA: “For example, in the SDWA_PUB_WATER_SYSTEMS.csv file, each row describes a public water system in a particular quarter, and is uniquely identified by the values of the key fields SUBMISSIONYEARQUARTER and PWSID. Site visits, violations, and so on for that same system and quarter are identified by the same values of SUBMISSIONYEARQUARTER and PWSID in the other files.”

facilities <- read_csv("SDWA_latest_downloads/SDWA_PUB_WATER_SYSTEMS.csv")

Service area code types

DC Daycare Center
DI Dispenser
HA Homeowners Association
HM Hotel/Motel
HR Highway Rest Area
IA Industrial/Agricultural
IC Interstate Carrier
IN Institution
MF Medical Facility
MH Mobile Home Park
MP Mobile Home Park - Principal Residence
MU Municipality
OA Other Area
ON Other Non-transient Area
OR Other Residential
OT Other Transient Area
PA Recreation Area
RA Residential Area
RE Retail Employees
RS Restaurant
SC School
SI Sanitary Improvement District
SK Summer Camp
SR Secondary Residences
SS Service Station
SU Subdivision
WB Water Bottler
WH Wholesaler of Water

Looking at HOAs

hoa <- service_areas %>% 
  filter(SERVICE_AREA_TYPE_CODE == "HA" | SERVICE_AREA_TYPE_CODE == "SU") %>% 
  distinct()

table(hoa$IS_PRIMARY_SERVICE_AREA_CODE)

   N    Y 
   6 5032 

Total HOA+Subdivisions: 5312

HOA is for service areas that has PWS ID, and facilities are public water systems.

pws_hoa <- hoa

Number of public water systems in Hydroshare data in tiers

hyd <- st_read("C:/Users/kshit/OneDrive/Desktop/PhD projects/water/temm.gpkg")%>%
  st_drop_geometry()
Reading layer `temm' from data source 
  `C:\Users\kshit\OneDrive\Desktop\PhD projects\water\temm.gpkg' 
  using driver `GPKG'
replacing null geometries with empty geometries
Simple feature collection with 49424 features and 21 fields (with 3451 geometries empty)
Geometry type: GEOMETRY
Dimension:     XY
Bounding box:  xmin: -174.3233 ymin: 19.064 xmax: 173.0331 ymax: 71.34019
Geodetic CRS:  WGS 84
hyd %>% 
  group_by(tier) %>% 
  summarise(count = n())
# A tibble: 4 × 2
   tier count
  <dbl> <int>
1     1 17645
2     2 11079
3     3 17249
4    NA  3451

Checking the missingness of the HOAs in Hydroshare data

#read in Hydroshare data
df <- st_read("C:/Users/kshit/OneDrive/Desktop/PhD projects/water/temm.gpkg") %>% 
  st_drop_geometry() %>% 
  filter(tier == 1) #filtering Tier 1 data
Reading layer `temm' from data source 
  `C:\Users\kshit\OneDrive\Desktop\PhD projects\water\temm.gpkg' 
  using driver `GPKG'
replacing null geometries with empty geometries
Simple feature collection with 49424 features and 21 fields (with 3451 geometries empty)
Geometry type: GEOMETRY
Dimension:     XY
Bounding box:  xmin: -174.3233 ymin: 19.064 xmax: 173.0331 ymax: 71.34019
Geodetic CRS:  WGS 84

States with Tier 1 data

table(df$state_code)

  AK   AL   AR   AZ   CA   CO   CT   DC   DE   FL   GA   HI   IA   ID   IL   IN 
   1    2  670  673 2794   40  435    1    1   18    5    1    2    7 1115    3 
  KS   KY   LA   MA   MD   ME   MI   MO   MS   MT   NC   NJ   NM   NV   NY   OH 
 780    1    2   10   11    1   18  285    1    1  491  570  530   11   23    1 
  OK   OR   PA   RI   SC   TN   TX   UT   VA   WA   WI   WV   WY 
 708    5 1721   32    2    1 4437  486    1 1740    2    5    1 

We will remove data where the count per state is lesser than 250.

tier1_count_by_state <- table(df$state_code) %>% as.data.frame()
colnames(tier1_count_by_state) <- c("state", "count")
t1_states_df <- tier1_count_by_state %>% 
  filter(count > 250)
t1_states <- t1_states_df$state

Removing data where majority of records aren’t Tier 1

# Turning off geom as it led to some unexpected errors in summarization
##simplifying data frame 
## Because of slowdown and some unexpected errors, I am turning off geom.
df <- df %>%  
  as.data.frame() %>% 
  filter(state_code != "02" | state_code != "09" ) %>% 
  filter(state_code %in% t1_states)
table(df$state_code)

  AR   AZ   CA   CT   IL   KS   MO   NC   NJ   NM   OK   PA   TX   UT   WA 
 670  673 2794  435 1115  780  285  491  570  530  708 1721 4437  486 1740 

Hydroshare data for non Tier 1 states

Filtering HOAs in Tier 1 Hydroshare data

hydroshare_hoa <- df[grepl("HA|SU", df$service_area_type_code), ]
nrow(hydroshare_hoa)
[1] 129

There are 129 HOAs in Tier 1 states.

Looking at the observations that HOAs in SDWA that aren’t covered in Hydroshare, and their geographic distribution for Tier 1 data.

Filtering PWS data for states with Tier 1 data

pws_hoa <- pws_hoa %>%
  mutate(STATE_CODE = substring(PWSID, 1, 2)) %>% 
  filter(STATE_CODE %in% t1_states)
hoa_pws_by_state <- table(pws_hoa$STATE_CODE) %>% as.data.frame()
colnames(hoa_pws_by_state) <- c("Var1", "total")

There are 1280 records.

table(hydroshare_hoa$state_code)

AZ CA CT IL KS MO NC NM TX UT 
28  2  6  1  1  2  1 37 42  9 
not_covered_hoas_pwsid <- setdiff(pws_hoa$PWSID, hydroshare_hoa$pwsid)

states_not_covered_hoas <- lapply(not_covered_hoas_pwsid, function(x) substr(x, 1, 2))
table_hoa <- table(unlist(states_not_covered_hoas)) %>% as.data.frame() %>% arrange(desc(Freq)) %>% 
  filter(Var1 %in% t1_states) %>% 
  inner_join(hoa_pws_by_state) %>% 
  mutate(pct_missing = round(Freq/total*100, 2))
Joining with `by = join_by(Var1)`
# Reorder the levels of the Term factor based on the Frequency variable
table_hoa$Var1 <- reorder(table_hoa$Var1, table_hoa$Freq)

ggplot(table_hoa, aes(x = Freq, y = Var1)) +
  geom_col(fill = "steelblue") +
  labs(title = "Number of HOAs missing in Hydroshare data (Tier 1 states)", x = "", y = "")+
  geom_text(aes(label = Freq), hjust = -0.1, color = "black")+
  theme_minimal()

Percentage missingness

# Reorder the levels of the Term factor based on the Frequency variable
table_hoa$Var1 <- reorder(table_hoa$Var1, table_hoa$pct_missing)

ggplot(table_hoa, aes(x = pct_missing, y = Var1)) +
  geom_col(fill = "steelblue") +
  labs(title = "Percentage of HOAs missing in Hydroshare data (Tier 1 states)", x = "", y = "")+
  geom_text(aes(label = pct_missing), hjust = -0.1, color = "black")+
  scale_x_continuous(limits = c(0, 105))+
  theme_minimal()

NC and Missouri have data gaps in HOAs and SUbdivisions.

Looking at Mobile Homes

Mobile homes in PWS data

mobile <- service_areas %>% 
  filter(SERVICE_AREA_TYPE_CODE == "MH" | SERVICE_AREA_TYPE_CODE == "MP") %>% 
  distinct()

table(mobile$IS_PRIMARY_SERVICE_AREA_CODE)

    N     Y 
  178 16160 
pws_mobile <- mobile
table(df$state_code)

  AR   AZ   CA   CT   IL   KS   MO   NC   NJ   NM   OK   PA   TX   UT   WA 
 670  673 2794  435 1115  780  285  491  570  530  708 1721 4437  486 1740 
hydroshare_mobile <- df[grepl("MH|MP", df$service_area_type_code), ]
nrow(hydroshare_mobile)
[1] 1610

Filtering mobile homes in Tier 1 Hydroshare data

pws_mobile <- pws_mobile %>%
  mutate(STATE_CODE = substring(PWSID, 1, 2)) %>% 
  filter(STATE_CODE %in% t1_states)
table(pws_mobile$STATE_CODE)

  AR   AZ   CA   CT   IL   KS   MO   NC   NJ   NM   OK   PA   TX   UT 
 109  618  630   79  317  148  337 2429  192  227  457 1652 1206   35 
not_covered_mbl_pwsid <- setdiff(pws_mobile$PWSID, hydroshare_mobile$pwsid)

states_not_covered_mbl <- lapply(not_covered_mbl_pwsid, function(x) substr(x, 1, 2))
table(unlist(states_not_covered_mbl)) %>% as.data.frame() %>% arrange(desc(Freq))
   Var1 Freq
1    NC 2367
2    PA 1189
3    TX  795
4    AZ  480
5    OK  455
6    MO  337
7    IL  317
8    CA  286
9    NM  153
10   KS  147
11   NJ  113
12   AR  104
13   CT   47
14   UT   28

There is a lot of missingness in mobile home in the states.

mobile_pws_by_state <- table(pws_mobile$STATE_CODE) %>% as.data.frame()
colnames(mobile_pws_by_state) <- c("Var1", "total")
table_mobile <- table(unlist(states_not_covered_mbl)) %>% as.data.frame() %>% arrange(desc(Freq)) %>% 
  filter(Var1 %in% t1_states) %>% 
  inner_join(mobile_pws_by_state) %>% 
  mutate(pct_missing = round(Freq/total*100, 2))
Joining with `by = join_by(Var1)`
# Reorder the levels of the Term factor based on the Frequency variable
table_mobile$Var1 <- reorder(table_mobile$Var1, table_mobile$Freq)
ggplot(table_mobile, aes(x = Freq, y = Var1)) +
  geom_col(fill = "steelblue") +
  labs(title = "Number of Mobile Homes missing in Hydroshare data (Tier 1 states)", x = "", y = "")+
  geom_text(aes(label = Freq), hjust = -0.1, color = "black")+
  scale_x_continuous(limits = c(0, 2500))+
  theme_minimal()

Percentage missingness

# Reorder the levels of the Term factor based on the Frequency variable
table_mobile$Var1 <- reorder(table_mobile$Var1, table_mobile$pct_missing)

ggplot(table_mobile, aes(x = pct_missing, y = Var1)) +
  geom_col(fill = "steelblue") +
  labs(title = "Percentage of Mobile homes missing in Hydroshare data (Tier 1 states)", x = "", y = "")+
  geom_text(aes(label = pct_missing), hjust = -0.1, color = "black")+
  scale_x_continuous(limits = c(0, 105))+
  theme_minimal()

Missing data in other Tiers in other states

HOA

hydroshare_others <- setdiff(st_read("C:/Users/kshit/OneDrive/Desktop/PhD projects/water/temm.gpkg") %>% st_drop_geometry(), df)
Reading layer `temm' from data source 
  `C:\Users\kshit\OneDrive\Desktop\PhD projects\water\temm.gpkg' 
  using driver `GPKG'
replacing null geometries with empty geometries
Simple feature collection with 49424 features and 21 fields (with 3451 geometries empty)
Geometry type: GEOMETRY
Dimension:     XY
Bounding box:  xmin: -174.3233 ymin: 19.064 xmax: 173.0331 ymax: 71.34019
Geodetic CRS:  WGS 84
hydroshare_others_hoa <- hydroshare_others[grepl("HA|SU", hydroshare_others$service_area_type_code), ]
pws_hoa_others <- setdiff(hoa, pws_hoa %>% select(-STATE_CODE))
no_hoa_records_others <- setdiff(pws_hoa_others$PWSID, hydroshare_others_hoa$pwsid)

states_not_covered_hoas <- lapply(no_hoa_records_others, function(x) substr(x, 1, 2))

table(unlist(states_not_covered_hoas)) %>% as.data.frame() %>% arrange(desc(Freq))
   Var1 Freq
1    FL  308
2    GA  244
3    IA  149
4    SD   73
5    NY   62
6    OR   42
7    MT   40
8    VA   36
9    NH   34
10   ID   28
11   MN   27
12   VI   26
13   ME   24
14   CO   11
15   AK    9
16   LA    7
17   WY    7
18   IN    5
19   MA    5
20   ND    5
21   NE    5
22   OH    4
23   MI    2
24   WV    2
25   04    1
26   KY    1
27   MD    1
table_hoa <- table(unlist(states_not_covered_hoas)) %>% as.data.frame() %>% arrange(desc(Freq)) %>% 
  filter(!Var1 %in% t1_states) %>% 
  filter(Var1 != "04")

# Reorder the levels of the Term factor based on the Frequency variable
table_hoa$Var1 <- reorder(table_hoa$Var1, table_hoa$Freq)
ggplot(table_hoa, aes(x = Freq, y = Var1)) +
  geom_col(fill = "steelblue") +
  labs(title = "Number of HOAs missing in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
  geom_text(aes(label = Freq), hjust = -0.1, color = "black")+
#  scale_x_continuous(limits = c(0, 2500))+
  theme_minimal()

Number of HOAs in SDWA data in Tier 2 and 3 states

hoa_tier23 <- pws_hoa_others %>% 
  mutate(Var1 = substr(PWSID, 1, 2)) %>% 
  group_by(Var1) %>% 
  summarise(total = n())

Missingness data compilation

missingness_others_hoa <- hoa_tier23 %>% 
  merge(table_hoa) %>% 
  mutate(pct_missing = round(Freq/total*100, 2))

missingness_others_hoa$Var1 <- reorder(missingness_others_hoa$Var1, 
                                       missingness_others_hoa$pct_missing)
ggplot(missingness_others_hoa, aes(x = pct_missing, y = Var1)) +
  geom_col(fill = "steelblue") +
  labs(title = "Percentage of HOAs missing in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
  geom_text(aes(label = pct_missing), hjust = -0.1, color = "black")+
#  scale_x_continuous(limits = c(0, 2500))+
  theme_minimal()

Mobile

hydroshare_others_mbl <- hydroshare_others[grepl("MH|MP", hydroshare_others$service_area_type_code), ]
pws_mbl_others <- setdiff(mobile, pws_mobile %>% select(-STATE_CODE))
no_mbl_records_others <- setdiff(pws_mbl_others$PWSID, hydroshare_others_mbl$pwsid)

states_not_covered_mbls <- lapply(no_mbl_records_others, function(x) substr(x, 1, 2))

table(unlist(states_not_covered_mbls)) %>% as.data.frame() %>% arrange(desc(Freq))
   Var1 Freq
1    FL 2102
2    SC  754
3    NY  731
4    VA  719
5    OH  467
6    LA  385
7    MD  384
8    GA  353
9    WI  283
10   MT  273
11   ID  263
12   OR  253
13   KY  237
14   IA  227
15   MN  215
16   MI  191
17   WV  186
18   AL  185
19   TN  176
20   IN  165
21   ME  154
22   NV  151
23   MS  149
24   WY  137
25   DE  117
26   CO  111
27   NE   87
28   NH   68
29   AK   67
30   ND   60
31   VT   55
32   SD   52
33   09   22
34   MA   21
35   RI   18
36   PR    7
37   10    6
38   02    3
39   04    2
40   08    2
41   05    1
42   06    1
43   AS    1
44   NN    1
table_mbl <- table(unlist(states_not_covered_mbls)) %>% as.data.frame() %>% arrange(desc(Freq)) %>% 
  filter(!Var1 %in% t1_states) %>% 
  filter(!Var1 %in% c("05", "06", "04", "08", "02", "10", "09", "NN", "PR"))

# Reorder the levels of the Term factor based on the Frequency variable
table_mbl$Var1 <- reorder(table_mbl$Var1, table_mbl$Freq)
table(table_mbl$Var1)

AS RI MA SD VT ND AK NH NE CO DE WY MS NV ME IN TN AL WV MI MN IA KY OR ID MT 
 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
WI GA MD LA OH VA NY SC FL 02 04 05 06 08 09 10 NN PR 
 1  1  1  1  1  1  1  1  1  0  0  0  0  0  0  0  0  0 
ggplot(table_mbl, aes(x = Freq, y = Var1)) +
  geom_col(fill = "steelblue") +
  labs(title = "Number of mobile homes missing in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
  geom_text(aes(label = Freq), hjust = -0.1, color = "black")+
  scale_x_continuous(limits = c(0, 2200))+
  theme_minimal()

Number of mobile homes in SDWA data in Tier 2 and 3 states

mobile_tier23 <- pws_mbl_others %>% 
  mutate(Var1 = substr(PWSID, 1, 2)) %>% 
  group_by(Var1) %>% 
  summarise(total = n())

Missingness data compilation

missingness_others_mbl <- mobile_tier23 %>% 
  merge(table_mbl) %>% 
  mutate(pct_missing = round(Freq/total*100, 2))

missingness_others_mbl$Var1 <- reorder(missingness_others_mbl$Var1, 
                                       missingness_others_mbl$pct_missing)
ggplot(missingness_others_mbl, aes(x = pct_missing, y = Var1)) +
  geom_col(fill = "steelblue") +
  labs(title = "Percentage of mobile homes missing in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
  geom_text(aes(label = pct_missing), hjust = -0.1, color = "black")+
#  scale_x_continuous(limits = c(0, 2500))+
  theme_minimal()

Number of Tier 2 and Tier 3 records for other states for each category

other_states_df <- st_read("C:/Users/kshit/OneDrive/Desktop/PhD projects/water/temm.gpkg") %>% 
  filter(!state_code %in% t1_states)
Reading layer `temm' from data source 
  `C:\Users\kshit\OneDrive\Desktop\PhD projects\water\temm.gpkg' 
  using driver `GPKG'
replacing null geometries with empty geometries
Simple feature collection with 49424 features and 21 fields (with 3451 geometries empty)
Geometry type: GEOMETRY
Dimension:     XY
Bounding box:  xmin: -174.3233 ymin: 19.064 xmax: 173.0331 ymax: 71.34019
Geodetic CRS:  WGS 84

HOA

other_hoa <- other_states_df[grepl("HA|SU", other_states_df$service_area_type_code), ] %>% 
  group_by(state_code, tier) %>% 
  summarise(num = n()) %>% 
  st_drop_geometry()
`summarise()` has grouped output by 'state_code'. You can override using the
`.groups` argument.
head(other_hoa)
# A tibble: 6 × 3
# Groups:   state_code [2]
  state_code  tier   num
  <chr>      <dbl> <int>
1 AK             2     4
2 AK             3    36
3 AK            NA    10
4 AL             2     1
5 AL             3     1
6 AL            NA    34
# Create stacked bar chart
ggplot(other_hoa, aes(x = state_code, y = num, fill = as.factor(tier))) +
  geom_bar(stat = "identity") +
  labs(title = "HOAs in Tier 2 and 3 states", x = "", y = "count") +
  coord_flip()+
  theme_minimal()

table(other_hoa$state_code)

AK AL CO FL GA GU HI IA ID IN KY LA MA ME MI MN MS MT ND NE NH NV NY OH OR RI 
 3  3  4  3  3  1  1  3  3  2  1  2  3  3  1  1  1  3  2  1  3  2  3  1  3  2 
SC SD VA VI VT WV WY 
 1  3  3  1  2  1  3 
data_hoa <- other_hoa %>% 
  filter(!tier == 1) %>% 
  filter(!str_detect(state_code, "\\d"))
data_hoa2 <- data_hoa %>% 
  filter(tier == 2)
ggplot(data_hoa, aes(x = state_code, y = num)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = num), hjust = -0.1, color = "black")+
  scale_y_continuous(limits = c(0, 1000))+
  theme(strip.background = element_blank(),
        strip.text.x = element_blank())+
  labs(title = "Number of HOAs in Tiers 2 and 3")+
  coord_flip()+
  theme_minimal()+
  facet_wrap(~ tier, scales = "free_y", nrow = 5, ncol = 7)

hoa_other_count <- other_hoa %>% 
  group_by(state_code) %>% 
  summarize(total_count = sum(num))

hoa_other_count$state_code <- reorder(hoa_other_count$state_code, hoa_other_count$total_count)

ggplot(hoa_other_count, aes(x = total_count, y = state_code)) +
  geom_col(fill = "steelblue", width = 0.6) +
  labs(title = "Total number of HOAs in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
  geom_text(aes(label = total_count), hjust = -0.1, color = "black", size = 2.5)+
#  scale_x_continuous(limits = c(0, 2500))+
  theme_minimal()

MObile

other_mbl <- other_states_df[grepl("MH|MP", other_states_df$service_area_type_code), ] %>% 
  group_by(state_code, tier) %>% 
  summarise(num = n()) %>% 
  st_drop_geometry()
`summarise()` has grouped output by 'state_code'. You can override using the
`.groups` argument.
head(other_mbl)
# A tibble: 6 × 3
# Groups:   state_code [2]
  state_code  tier   num
  <chr>      <dbl> <int>
1 AK             1     1
2 AK             2     2
3 AK             3    31
4 AK            NA     2
5 AL             3     2
6 AL            NA     2

Summary total count in other states

mbl_other_count <- other_mbl %>% 
  group_by(state_code) %>% 
  summarize(total_count = sum(num))

mbl_other_count$state_code <- reorder(mbl_other_count$state_code, mbl_other_count$total_count)

ggplot(mbl_other_count, aes(x = total_count, y = state_code)) +
  geom_col(fill = "steelblue", width = 0.6) +
  labs(title = "Total number of mobile homes in Hydroshare data (Tier 2 and 3 states)", x = "", y = "")+
  geom_text(aes(label = total_count), hjust = -0.1, color = "black", size = 2.5)+
  theme(axis.text.x = element_text(size = 3))+
#  scale_x_continuous(limits = c(0, 2500))+
  theme_minimal()

# Create stacked bar chart
ggplot(other_mbl, aes(x = state_code, y = num, fill = as.factor(tier))) +
  geom_bar(stat = "identity") +
  labs(title = "Mobile homes in Tier 2 and 3 states", x = "", y = "count") +
  coord_flip()+
  theme_minimal()

data_mbl <- other_mbl %>% 
  filter(!tier == 1)

ggplot(data_mbl, aes(x = state_code, y = num)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = num), hjust = -0.1, color = "black")+
  scale_y_continuous(limits = c(0, 1000))+
  theme(strip.background = element_blank(),
        strip.text.x = element_blank())+
  labs(title = "Number of Mobile homes in Tiers 2 and 3")+
  coord_flip()+
  theme_minimal()+
  facet_wrap(~ tier, scales = "free_y", nrow = 5, ncol = 8)

Size distribution of HOAs

hoa_sizes <- service_areas %>% 
  filter(SERVICE_AREA_TYPE_CODE == "HA" | SERVICE_AREA_TYPE_CODE == "SU") %>% 
  inner_join(facilities) %>% 
  filter(SERVICE_CONNECTIONS_COUNT < 1000)
Joining with `by = join_by(SUBMISSIONYEARQUARTER, PWSID, FIRST_REPORTED_DATE,
LAST_REPORTED_DATE)`

HOA summary

hoa_connections_state <- hoa_sizes %>% 
  group_by(STATE_CODE) %>% 
  summarise(total_connections = sum(SERVICE_CONNECTIONS_COUNT)) %>% 
  filter(!is.na(STATE_CODE))

hoa_connections_state$STATE_CODE <- reorder(hoa_connections_state$STATE_CODE,hoa_connections_state$total_connections)

ggplot(hoa_connections_state, aes(x = total_connections, y = STATE_CODE)) +
  geom_col(fill = "steelblue", width = 0.6) +
  labs(title = "Total HOA service connections by state (Tier 2 and 3 states)", x = "", y = "")+
  geom_text(aes(label = total_connections), hjust = -0.1, color = "black", size = 2.5)+
  scale_x_continuous(limits = c(0, 10000))+
  theme_minimal()

summary(hoa_sizes$SERVICE_CONNECTIONS_COUNT)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   17.00   28.00   66.36   59.00  985.00 
ggplot(hoa_sizes, aes(x = SERVICE_CONNECTIONS_COUNT)) +
  geom_histogram(bins = 100)+
  labs(title = "Distribution of number of service connections in HOAs")

Mobile homes sizes

mbl_sizes <- service_areas %>% 
  filter(SERVICE_AREA_TYPE_CODE == "MH" | SERVICE_AREA_TYPE_CODE == "MP") %>% 
  inner_join(facilities) %>% 
  filter(SERVICE_CONNECTIONS_COUNT < 1000)
Joining with `by = join_by(SUBMISSIONYEARQUARTER, PWSID, FIRST_REPORTED_DATE,
LAST_REPORTED_DATE)`
summary(mbl_sizes$SERVICE_CONNECTIONS_COUNT)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0    20.0    44.0    98.9   120.0   754.0 
ggplot(mbl_sizes, aes(x = SERVICE_CONNECTIONS_COUNT)) +
  geom_histogram(bins = 100)+
  labs(title = "Distribution of number of service connections in Mobile homes")

Mobile summary

mbl_connections_state <- mbl_sizes %>% 
  group_by(STATE_CODE) %>% 
  summarise(total_connections = sum(SERVICE_CONNECTIONS_COUNT))

mbl_connections_state$STATE_CODE <- reorder(mbl_connections_state$STATE_CODE,mbl_connections_state$total_connections)

ggplot(mbl_connections_state, aes(x = total_connections, y = STATE_CODE)) +
  geom_col(fill = "steelblue", width = 0.6) +
  labs(title = "Total mobile homes service connections by state (Tier 2 and 3 states)", x = "", y = "")+
  geom_text(aes(label = total_connections), hjust = -0.1, color = "black", size=2.5)+
  theme(axis.text.y = element_text(size = 2.5))+
  scale_x_continuous(limits = c(0, 13000))+
  theme_minimal()