##EXPLORATORY ANALYSIS

# Load required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(corrplot)
## corrplot 0.95 loaded
library(RColorBrewer)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(broom)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(effsize)
## Warning: package 'effsize' was built under R version 4.4.3
library(agricolae)
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite
library(maps)
## Warning: package 'maps' was built under R version 4.4.3
## 
## Attaching package: 'maps'
## The following object is masked from 'package:viridis':
## 
##     unemp
library(stringr)
library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:agricolae':
## 
##     kurtosis, skewness
# Load the final dataset
data <- read.csv("final_data_clean.csv")

#Make a copy of the dataset
df<-data

#Check missing values
sum(is.na(df))
## [1] 0
# See the structure
str(df)
## 'data.frame':    1238 obs. of  22 variables:
##  $ pfoa_val               : num  4 4 4 3.7 4 3.7 4 3.7 4 3.7 ...
##  $ pfoa_detected          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ state                  : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ water_use              : chr  "Production" "Water supply, other" "Domestic" "Domestic" ...
##  $ tritium_tu             : num  -0.04 -0.03 -0.2 1.96 1.4 2.12 -0.07 0.19 2.4 0.55 ...
##  $ well_depth             : num  152.4 141.7 77.7 54 64 ...
##  $ tritium_category       : chr  "PreModern" "PreModern" "PreModern" "Modern" ...
##  $ month                  : chr  "August" "June" "June" "July" ...
##  $ median_income          : int  50536 50536 50536 50536 50536 50536 50536 50536 50536 50536 ...
##  $ poverty_rate           : num  16.7 16.7 16.7 16.7 16.7 ...
##  $ unemployment_rate      : num  5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 ...
##  $ pct_white              : num  68.1 68.1 68.1 68.1 68.1 ...
##  $ pct_black              : num  26.6 26.6 26.6 26.6 26.6 ...
##  $ pct_hispanic           : num  4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 ...
##  $ pct_bachelor_plus      : num  23.9 23.9 23.9 23.9 23.9 ...
##  $ pct_children           : num  22.5 22.5 22.5 22.5 22.5 ...
##  $ pct_adults             : num  61 61 61 61 61 ...
##  $ pct_older              : num  16.5 16.5 16.5 16.5 16.5 ...
##  $ pop_density_per_sq_mile: num  96.3 96.3 96.3 96.3 96.3 ...
##  $ longitude              : num  -86.8 -86.8 -86.8 -86.8 -86.8 ...
##  $ latitude               : num  32.8 32.8 32.8 32.8 32.8 ...
##  $ cancer_rate            : num  0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
#Convert incorrect data classes to their appropriate classes
df$pfoa_detected <- as.factor(df$pfoa_detected)
df$state <- as.factor(df$state)
df$water_use <- as.factor(df$water_use)
df$tritium_category <- as.factor(df$tritium_category)
df$month <- as.factor(df$month)
df$median_income <- as.numeric(df$median_income)
str(df)
## 'data.frame':    1238 obs. of  22 variables:
##  $ pfoa_val               : num  4 4 4 3.7 4 3.7 4 3.7 4 3.7 ...
##  $ pfoa_detected          : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ state                  : Factor w/ 32 levels "Alabama","California",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ water_use              : Factor w/ 6 levels "Domestic","Irrigation",..: 4 6 1 1 1 1 1 1 6 1 ...
##  $ tritium_tu             : num  -0.04 -0.03 -0.2 1.96 1.4 2.12 -0.07 0.19 2.4 0.55 ...
##  $ well_depth             : num  152.4 141.7 77.7 54 64 ...
##  $ tritium_category       : Factor w/ 4 levels "Mixed","Modern",..: 3 3 3 2 2 2 3 1 2 1 ...
##  $ month                  : Factor w/ 12 levels "April","August",..: 2 7 7 6 6 6 6 6 6 6 ...
##  $ median_income          : num  50536 50536 50536 50536 50536 ...
##  $ poverty_rate           : num  16.7 16.7 16.7 16.7 16.7 ...
##  $ unemployment_rate      : num  5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 ...
##  $ pct_white              : num  68.1 68.1 68.1 68.1 68.1 ...
##  $ pct_black              : num  26.6 26.6 26.6 26.6 26.6 ...
##  $ pct_hispanic           : num  4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 ...
##  $ pct_bachelor_plus      : num  23.9 23.9 23.9 23.9 23.9 ...
##  $ pct_children           : num  22.5 22.5 22.5 22.5 22.5 ...
##  $ pct_adults             : num  61 61 61 61 61 ...
##  $ pct_older              : num  16.5 16.5 16.5 16.5 16.5 ...
##  $ pop_density_per_sq_mile: num  96.3 96.3 96.3 96.3 96.3 ...
##  $ longitude              : num  -86.8 -86.8 -86.8 -86.8 -86.8 ...
##  $ latitude               : num  32.8 32.8 32.8 32.8 32.8 ...
##  $ cancer_rate            : num  0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
# Set theme for better looking plots
theme_set(theme_minimal() + 
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, size = 11),
        axis.text.x = element_text(angle = 45, hjust = 1)))
#Distribution of the Target Variable(s)

# For pfoa_val (continuous concentration)
ggplot(df, aes(x = pfoa_val)) +
  geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
  labs(title = "Distribution of PFOA Concentration (pfoa_val)",
       x = "PFOA Concentration (pfoa_val)",
       y = "Frequency") 

# PFOA concentrations is largely right skewed, using log-transformation to improve
df$log_pfoa <- log(df$pfoa_val + 1)

ggplot(df, aes(x = log_pfoa)) +
  geom_histogram(binwidth = 0.2, fill = "darkgreen", color = "black") +
  labs(title = "Distribution of Log-transformed PFOA Concentration",
       x = "Log(PFOA Concentration)",
       y = "Frequency") 

# For pfoa_detected (binary detection status)
ggplot(df, aes(x = pfoa_detected, fill = pfoa_detected)) +
  geom_bar() +
  labs(title = "PFOA Detection Status",
       x = "Detected (1) / Not Detected (0)",
       y = "Count") +
  scale_fill_viridis_d()

# Original skewness
skewness(df$pfoa_val)
## [1] 33.9364
# Log-transformed skewness
skewness(log(df$pfoa_val))
## [1] 2.461264
# QQ plot for raw pfoa_val
qq_pfoa <- ggplot(df, aes(sample = pfoa_val)) +
  stat_qq(color = "steelblue") +
  stat_qq_line(color = "red", linetype = "dashed") +
  labs(title = "Q-Q Plot of PFOA Concentration")

# QQ plot for log-transformed pfoa_val
qq_log_pfoa <- ggplot(df, aes(sample = log_pfoa)) +
  stat_qq(color = "darkgreen") +
  stat_qq_line(color = "red", linetype = "dashed") +
  labs(title = "Q-Q Plot of Log_pfoa")
      

# Display side by side
gridExtra::grid.arrange(qq_pfoa, qq_log_pfoa, ncol = 2)

#Compare log(PFOA) by detection status

ggplot(df, aes(x = factor(pfoa_detected), y = log_pfoa, fill = factor(pfoa_detected))) +
  geom_boxplot() +
  labs(title = "Log(PFOA) by Detection Status",
       x = "Detected (0 = No, 1 = Yes)",
       y = "Log(PFOA Concentration)") +
  scale_fill_manual(values = c("red", "blue")) 

df %>%
  group_by(state) %>%
  summarise(
    total_pfoa = sum(pfoa_val, na.rm = TRUE),
    avg_pfoa = mean(pfoa_val, na.rm = TRUE),
    n_samples = n()
  ) %>%
  arrange(desc(total_pfoa))
## # A tibble: 32 × 4
##    state         total_pfoa avg_pfoa n_samples
##    <fct>              <dbl>    <dbl>     <int>
##  1 California         1900.     3.28       580
##  2 West Virginia      1704.   284.           6
##  3 Ohio                439.     4.93        89
##  4 Florida             361.     4.76        76
##  5 Massachusetts       335.     8.38        40
##  6 Tennessee           152.     2.77        55
##  7 Alabama             144.     3.06        47
##  8 Louisiana           143.     2.75        52
##  9 Utah                104.     2.73        38
## 10 Mississippi         103.     3.23        32
## # ℹ 22 more rows

Map showing the average PFOA concerntrations across the states

# Debug: Check unique states in your data
cat("Unique states in your data:\n")
## Unique states in your data:
unique_states <- unique(df$state)
print(sort(unique_states))
##  [1] Alabama        California     Colorado       Delaware       Florida       
##  [6] Georgia        Illinois       Indiana        Kansas         Kentucky      
## [11] Louisiana      Maine          Maryland       Massachusetts  Mississippi   
## [16] New Hampshire  New Jersey     New Mexico     New York       North Carolina
## [21] Ohio           Oklahoma       Oregon         Pennsylvania   Rhode Island  
## [26] South Carolina Tennessee      Texas          Utah           Virginia      
## [31] West Virginia  Wisconsin     
## 32 Levels: Alabama California Colorado Delaware Florida Georgia ... Wisconsin
# Create comprehensive state abbreviation lookup
state_abbrev <- data.frame(
  state = c("alabama", "california", "colorado", "delaware", "florida", "georgia", 
            "illinois", "indiana", "kansas", "kentucky", "louisiana", 
            "maine", "maryland", "massachusetts", "mississippi", "new hampshire", 
            "new jersey", "new mexico", "new york", "north carolina", 
            "ohio", "oklahoma", "oregon", "pennsylvania", "south carolina", 
            "tennessee", "texas", "utah", "virginia", "washington", 
            "west virginia", "wisconsin", "wyoming"),
  abbrev = c("AL", "CA", "CO", "DE", "FL", "GA", "IL", "IN", "KS", "KY", "LA", 
             "ME", "MD", "MA", "MS", "NH", "NJ", "NM", "NY", "NC", 
             "OH", "OK", "OR", "PA", "SC", "TN", "TX", "UT", "VA", "WA", 
             "WV", "WI", "WY")
)

# Convert state names to lowercase
df$state <- tolower(df$state)

# Debug: Compute average PFOA concentration and check results
df_avg <- df %>%
  group_by(state) %>%
  summarise(
    avg_pfoa = mean(pfoa_val, na.rm = TRUE),
    n_samples = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_pfoa))

cat("\nAverage PFOA by state (top 10):\n")
## 
## Average PFOA by state (top 10):
print(head(df_avg, 10))
## # A tibble: 10 × 3
##    state         avg_pfoa n_samples
##    <chr>            <dbl>     <int>
##  1 west virginia   284.           6
##  2 massachusetts     8.38        40
##  3 georgia           5.33        11
##  4 rhode island      5.29         3
##  5 ohio              4.93        89
##  6 florida           4.76        76
##  7 kentucky          4.50         7
##  8 new hampshire     4.37        11
##  9 new mexico        4            1
## 10 oregon            4            3
# Check if West Virginia is in the averages
wv_data <- df_avg %>% filter(state == "west virginia")
cat("\nWest Virginia data:\n")
## 
## West Virginia data:
print(wv_data)
## # A tibble: 1 × 3
##   state         avg_pfoa n_samples
##   <chr>            <dbl>     <int>
## 1 west virginia     284.         6
# Load US map data
us_states <- map_data("state")

# Debug: Check if West Virginia is in map data
wv_map_check <- us_states %>% filter(region == "west virginia")
cat("\nWest Virginia in map data:", nrow(wv_map_check), "rows\n")
## 
## West Virginia in map data: 373 rows
# Merge map data with averages
map_df <- left_join(us_states, df_avg, by = c("region" = "state"))

# Debug: Check merged data for West Virginia
wv_merged <- map_df %>% filter(region == "west virginia")
cat("West Virginia in merged data:", nrow(wv_merged), "rows\n")
## West Virginia in merged data: 373 rows
# Compute centroids for label placement 
centroids <- map_df %>%
  filter(!is.na(avg_pfoa)) %>%  
  group_by(region) %>%
  summarise(
    lon = mean(long, na.rm = TRUE),
    lat = mean(lat, na.rm = TRUE),
    avg_pfoa = first(avg_pfoa),  
    .groups = "drop"
  ) %>%
  left_join(state_abbrev, by = c("region" = "state")) %>%
  filter(!is.na(abbrev)) %>%  
  mutate(
    label = paste0(abbrev, " ", sprintf("%.2f", avg_pfoa))
  )

cat("\nNumber of centroids calculated:", nrow(centroids), "\n")
## 
## Number of centroids calculated: 31
print(sort(centroids$region))
##  [1] "alabama"        "california"     "colorado"       "delaware"      
##  [5] "florida"        "georgia"        "illinois"       "indiana"       
##  [9] "kansas"         "kentucky"       "louisiana"      "maine"         
## [13] "maryland"       "massachusetts"  "mississippi"    "new hampshire" 
## [17] "new jersey"     "new mexico"     "new york"       "north carolina"
## [21] "ohio"           "oklahoma"       "oregon"         "pennsylvania"  
## [25] "south carolina" "tennessee"      "texas"          "utah"          
## [29] "virginia"       "west virginia"  "wisconsin"
# Check if West Virginia made it to centroids
wv_centroid <- centroids %>% filter(region == "west virginia")
cat("\nWest Virginia centroid data:\n")
## 
## West Virginia centroid data:
print(wv_centroid)
## # A tibble: 1 × 6
##   region          lon   lat avg_pfoa abbrev label    
##   <chr>         <dbl> <dbl>    <dbl> <chr>  <chr>    
## 1 west virginia -80.3  38.8     284. WV     WV 284.04
# Plot
ggplot(map_df, aes(long, lat, group = group, fill = avg_pfoa)) +
  geom_polygon(color = "gray90", size = 0.3) +
  geom_text(data = centroids, aes(x = lon, y = lat, label = label),
            inherit.aes = FALSE, color = "black", size = 2.5, fontface = "bold",
            check_overlap = TRUE) +
  scale_fill_gradient(low = "#c6dbef", high = "#08306b", na.value = "white", name = "Avg PFOA") +
  coord_fixed(1.3) +
  theme_void() +
  labs(title = "Average PFOA Concentrations by State") +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

df_summary <- df %>%
  group_by(state) %>%
  summarise(
    total_pfoa = sum(pfoa_val, na.rm = TRUE),
    avg_pfoa = mean(pfoa_val, na.rm = TRUE),
    n_samples = n()
  ) %>%
  arrange(desc(total_pfoa))
df_summary
## # A tibble: 32 × 4
##    state         total_pfoa avg_pfoa n_samples
##    <chr>              <dbl>    <dbl>     <int>
##  1 california         1900.     3.28       580
##  2 west virginia      1704.   284.           6
##  3 ohio                439.     4.93        89
##  4 florida             361.     4.76        76
##  5 massachusetts       335.     8.38        40
##  6 tennessee           152.     2.77        55
##  7 alabama             144.     3.06        47
##  8 louisiana           143.     2.75        52
##  9 utah                104.     2.73        38
## 10 mississippi         103.     3.23        32
## # ℹ 22 more rows
# Fix 1: Add Rhode Island to state abbreviation lookup
state_abbrev <- data.frame(
  state = c("alabama", "california", "colorado", "delaware", "florida", "georgia", "illinois", "indiana", "kansas", "kentucky", "louisiana", 
 "maine", "maryland", "massachusetts", "mississippi", "new hampshire", 
 "new jersey", "new mexico", "new york", "north carolina",  "ohio", "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina", 
"tennessee", "texas", "utah", "virginia", "washington", "west virginia", "wisconsin", "wyoming"),
  abbrev = c("AL", "CA", "CO", "DE", "FL", "GA", "IL", "IN", "KS", "KY", "LA", "ME", "MD", "MA", "MS", "NH", "NJ", "NM", "NY", "NC", "OH", "OK", "OR", "PA", "RI", "SC", "TN", "TX", "UT", "VA", "WA", "WV", "WI", "WY")
)

# Fix 2: Recalculate centroids with the complete lookup
centroids <- map_df %>%
  filter(!is.na(avg_pfoa)) %>%  # Only states with data
  group_by(region) %>%
  summarise(
    lon = mean(long, na.rm = TRUE),
    lat = mean(lat, na.rm = TRUE),
    avg_pfoa = first(avg_pfoa),
    .groups = "drop"
  ) %>%
  left_join(state_abbrev, by = c("region" = "state")) %>%
  filter(!is.na(abbrev)) %>%
  mutate(
    label = paste0(abbrev, " ", sprintf("%.2f", avg_pfoa))
  )

cat("\nNumber of centroids after fix:", nrow(centroids), "\n")
## 
## Number of centroids after fix: 32
print(sort(centroids$region))
##  [1] "alabama"        "california"     "colorado"       "delaware"      
##  [5] "florida"        "georgia"        "illinois"       "indiana"       
##  [9] "kansas"         "kentucky"       "louisiana"      "maine"         
## [13] "maryland"       "massachusetts"  "mississippi"    "new hampshire" 
## [17] "new jersey"     "new mexico"     "new york"       "north carolina"
## [21] "ohio"           "oklahoma"       "oregon"         "pennsylvania"  
## [25] "rhode island"   "south carolina" "tennessee"      "texas"         
## [29] "utah"           "virginia"       "west virginia"  "wisconsin"
# Fix 3: Create plot without check_overlap to show all labels
# Solution 1: Smaller font size with check_overlap
p3 <- ggplot(map_df, aes(long, lat, group = group, fill = avg_pfoa)) +
  geom_polygon(color = "gray90", size = 0.3) +
  geom_text(data = centroids, aes(x = lon, y = lat, label = label),
            inherit.aes = FALSE, color = "black", size = 2.0, fontface = "bold",
            check_overlap = TRUE) +
  scale_fill_gradient(low = "#c6dbef", high = "#08306b", na.value = "white", name = "Avg PFOA") +
  coord_fixed(1.3) +
  theme_void() +
  labs(title = "Average PFOA Concentrations by State (Smaller Font)") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))


# Fix 4: Alternative with check_overlap for cleaner look
# First identify which states are getting hidden
all_states <- centroids$region
visible_states_clean <- c() # You'd need to manually identify from your clean plot

# Create manual adjustments for small northeastern states
centroids_adjusted <- centroids %>%
  mutate(
    lon_adj = case_when(
      region == "rhode island" ~ lon + 1.5,  # Move RI label east
      region == "new hampshire" ~ lon + 1.0, # Move NH label east
      region == "massachusetts" ~ lon + 1.0, # Move MA label east
      region == "new jersey" ~ lon + 0.8,    # Move NJ label east
      region == "delaware" ~ lon + 1.2,      # Move DE label east
      TRUE ~ lon
    ),
    lat_adj = case_when(
      region == "rhode island" ~ lat - 0.5,  # Move RI label south
      region == "new hampshire" ~ lat + 0.5, # Move NH label north
      region == "massachusetts" ~ lat - 0.8, # Move MA label south
      TRUE ~ lat
    )
  )

centroids_sized <- centroids %>%
  mutate(
    is_northeast = region %in% c("rhode island", "massachusetts", "new hampshire", 
                                 "new jersey", "delaware", "maryland"),
    label_size = ifelse(is_northeast, 1.8, 2.3)
  )


# Show plot
print(p3)

# Fix 5: Create a summary table of all states with data
summary_table <- df_avg %>%
  left_join(state_abbrev, by = c("state" = "state")) %>%
  arrange(desc(avg_pfoa)) %>%
  select(state, abbrev, avg_pfoa, n_samples) %>%
  mutate(across(where(is.numeric), round, 2))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.numeric), round, 2)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
cat("\nComplete summary of all", nrow(summary_table), "states with PFOA data:\n")
## 
## Complete summary of all 32 states with PFOA data:
print(summary_table)
## # A tibble: 32 × 4
##    state         abbrev avg_pfoa n_samples
##    <chr>         <chr>     <dbl>     <dbl>
##  1 west virginia WV       284.           6
##  2 massachusetts MA         8.38        40
##  3 georgia       GA         5.33        11
##  4 rhode island  RI         5.29         3
##  5 ohio          OH         4.93        89
##  6 florida       FL         4.76        76
##  7 kentucky      KY         4.5          7
##  8 new hampshire NH         4.37        11
##  9 new mexico    NM         4            1
## 10 oregon        OR         4            3
## # ℹ 22 more rows
# solution: Remove check_overlap entirely and use smaller, uniform font
p_all_visible <- ggplot(map_df, aes(long, lat, group = group, fill = avg_pfoa)) +
  geom_polygon(color = "gray90", size = 0.3) +
  geom_text(data = centroids, aes(x = lon, y = lat, label = label),
            inherit.aes = FALSE, color = "black", size = 1.9, fontface = "bold") +
  scale_fill_gradient(low = "#c6dbef", high = "#08306b", na.value = "white", name = "Avg PFOA") +
  coord_fixed(1.3) +
  theme_void() +
  labs(title = "Average PFOA Concentrations by State (32 States)") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

print(p_all_visible)

# Count labels to verify all 32 are showing
cat("Total number of labels that should appear:", nrow(centroids), "\n")
## Total number of labels that should appear: 32
# Show the exact positions of the missing states
missing_likely <- centroids %>% 
  filter(region %in% c("rhode island", "indiana")) %>%
  select(region, abbrev, lon, lat, avg_pfoa, label)

print(missing_likely)
## # A tibble: 2 × 6
##   region       abbrev   lon   lat avg_pfoa label  
##   <chr>        <chr>  <dbl> <dbl>    <dbl> <chr>  
## 1 indiana      IN     -86.7  38.8     2.89 IN 2.89
## 2 rhode island RI     -71.5  41.6     5.29 RI 5.29
# Create focused water use categories
df_filtered <- df[df$water_use != "Unknown", ]

# Recode water use categories to focus on 4 main categories
df_filtered$water_use_grouped <- as.character(df_filtered$water_use)
df_filtered$water_use_grouped[df_filtered$water_use_grouped == "Water supply, other"] <- "Water Supply"

# Keep only the 4 main categories (excluding Unknown and Observation)
df_filtered <- df_filtered[df_filtered$water_use_grouped %in% c("Production", "Domestic", "Irrigation", "Water Supply"), ]

# Calculate mean PFOA levels by category
pfoa_summary <- aggregate(pfoa_val ~ water_use_grouped, data = df_filtered, 
                         FUN = function(x) c(mean_pfoa = round(mean(x), 2),
                                           count = length(x),
                                           se = round(sd(x)/sqrt(length(x)), 2)))
pfoa_summary_df <- data.frame(
  category = pfoa_summary$water_use_grouped,
  mean_pfoa = pfoa_summary$pfoa_val[,1],
  count = pfoa_summary$pfoa_val[,2],
  se = pfoa_summary$pfoa_val[,3]
)

# Create bar chart with ggplot2
library(ggplot2)
p3 <- ggplot(pfoa_summary_df, aes(x = category, y = mean_pfoa, fill = category)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste("Category:", category, "\nMean PFOA:", mean_pfoa)), 
            vjust = 0.5, color = "white", size = 3, fontface = "bold") +
  scale_fill_manual(values = c('#2E86AB', '#A23B72', '#F18F01', '#4CAF50')) +
  labs(title = "Mean PFOA Contamination by Water Use Category",
       x = "Water Use Category",
       y = "Mean PFOA Level") +
  theme_minimal() +
  theme(legend.position = "none")

print(p3)

p4 <- df %>%
  filter(!is.na(tritium_category) & tritium_category != "unknown") %>%  
  group_by(tritium_category) %>%
  summarise(
    mean_pfoa = mean(pfoa_val, na.rm = TRUE),
    mean_well_depth = mean(well_depth, na.rm = TRUE),
    mean_cancer_rate = mean(cancer_rate, na.rm = TRUE),
    sample_count = n(),
    .groups = 'drop'
  )

# Define consistent colors for each category
category_colors <- c("Mixed" = "#66C2A5", "PreModern" = "#FC8D62", "Unknown" = "#8DA0CB", "Modern" = "#E78AC3")

# Create separate plots for each metric - much clearer!
plot_pfoa <- p4 %>%
  ggplot(aes(x = reorder(tritium_category, mean_pfoa), y = mean_pfoa, fill = tritium_category)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = round(mean_pfoa, 1)), vjust = -0.5, hjust = 0.5, 
            fontface = "bold", size = 3.5, color = "black") +  
  scale_fill_manual(values = category_colors) +  
  scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +  
  labs(title = "Mean PFOA Levels", x = "", y = "PFOA Level") +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
        plot.title = element_text(size = 14, face = "bold"))

plot_depth <- p4 %>%
  ggplot(aes(x = reorder(tritium_category, mean_well_depth), y = mean_well_depth, fill = tritium_category)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = round(mean_well_depth, 0)), vjust = -0.5, hjust = 0.5,
            fontface = "bold", size = 3.5, color = "black") +  
  scale_fill_manual(values = category_colors) +  # Use consistent colors
  scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +  
  labs(title = "Mean Well Depth", x = "", y = "Depth (feet)") +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
        plot.title = element_text(size = 14, face = "bold"))

plot_cancer <- p4 %>%
  ggplot(aes(x = reorder(tritium_category, mean_cancer_rate), y = mean_cancer_rate, fill = tritium_category)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = paste0(round(mean_cancer_rate * 100, 1))), vjust = -0.5, hjust = 0.5,
            fontface = "bold", size = 3.5, color = "black") +  
  scale_fill_manual(values = category_colors) +  
  scale_y_continuous(expand = expansion(mult = c(0, 0.25)), 
                     labels = scales::percent_format()) +  
  labs(title = "Mean Cancer Rate", x = "Water Age Category", y = "Cancer Rate (%)") +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
        plot.title = element_text(size = 14, face = "bold"))

# Arrange in a 1x3 grid (since we removed the count plot)
grid.arrange(plot_pfoa, plot_depth, plot_cancer, ncol = 3, nrow = 1)

# Print the summary table for reference
print(p4)
## # A tibble: 4 × 5
##   tritium_category mean_pfoa mean_well_depth mean_cancer_rate sample_count
##   <fct>                <dbl>           <dbl>            <dbl>        <int>
## 1 Mixed                 2.95           116.             0.495          283
## 2 Modern                6.51            51.7            0.559          654
## 3 PreModern             3.02           125.             0.542          184
## 4 Unknown               3.54            80.6            0.553          117
library(gridExtra)  

# Plot: Mean PFOA
plot_pfoa <- p4 %>%
  filter(tritium_category != "Unknown") %>%
  ggplot(aes(x = reorder(tritium_category, mean_pfoa), y = mean_pfoa, fill = tritium_category)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = round(mean_pfoa, 1)), vjust = -0.5, hjust = 0.5, 
            fontface = "bold", size = 3.5, color = "black") +  
  scale_fill_manual(values = category_colors) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +  
  labs(title = "Mean PFOA Levels", x = "", y = "PFOA Level") +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
        plot.title = element_text(size = 14, face = "bold"))

# Plot: Mean Well Depth
plot_depth <- p4 %>%
  filter(tritium_category != "Unknown") %>%
  ggplot(aes(x = reorder(tritium_category, mean_well_depth), y = mean_well_depth, fill = tritium_category)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = round(mean_well_depth, 0)), vjust = -0.5, hjust = 0.5,
            fontface = "bold", size = 3.5, color = "black") +  
  scale_fill_manual(values = category_colors) +  
  scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +  
  labs(title = "Mean Well Depth", x = "", y = "Depth (feet)") +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
        plot.title = element_text(size = 14, face = "bold"))

# Plot: Mean Cancer Rate
plot_cancer <- p4 %>%
  filter(tritium_category != "Unknown") %>%
  ggplot(aes(x = reorder(tritium_category, mean_cancer_rate), y = mean_cancer_rate, fill = tritium_category)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = paste0(round(mean_cancer_rate * 100, 1))), vjust = -0.5, hjust = 0.5,
            fontface = "bold", size = 3.5, color = "black") +  
  scale_fill_manual(values = category_colors) +  
  scale_y_continuous(expand = expansion(mult = c(0, 0.25)), 
                     labels = scales::percent_format()) +  
  labs(title = "Mean Cancer Rate", x = "Water Age Category", y = "Cancer Rate (%)") +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
        plot.title = element_text(size = 14, face = "bold"))

# Arrange the plots in one row
grid.arrange(plot_pfoa, plot_depth, plot_cancer, ncol = 3, nrow = 1)

# Print the summary table for reference
print(p4)
## # A tibble: 4 × 5
##   tritium_category mean_pfoa mean_well_depth mean_cancer_rate sample_count
##   <fct>                <dbl>           <dbl>            <dbl>        <int>
## 1 Mixed                 2.95           116.             0.495          283
## 2 Modern                6.51            51.7            0.559          654
## 3 PreModern             3.02           125.             0.542          184
## 4 Unknown               3.54            80.6            0.553          117
# PLOT 5: Seasonal Patterns and Water Use Analysis

# Monthly patterns
monthly_summary <- df %>%
  group_by(month, water_use) %>%
  summarise(
    avg_pfoa = mean(pfoa_val, na.rm = TRUE),
    count = n(),
    .groups = 'drop'
  ) %>%
  filter(count >= 5)  # Only include combinations with sufficient data

# Convert month to factor with proper ordering
month_order <- c("January", "February", "March", "April", "May", "June",
                 "July", "August", "September", "October", "November", "December")
monthly_summary$month <- factor(monthly_summary$month, levels = month_order)

p4 <- monthly_summary %>%
  ggplot(aes(x = month, y = avg_pfoa, color = water_use, group = water_use)) +
  geom_line(linewidth = 1.2, alpha = 0.8) +
  geom_point(size = 2.5) +
  scale_color_brewer(type = "qual", palette = "Dark2", name = "Water Use Type") +
  labs(
    title = "Seasonal PFOA Contamination Patterns by Water Use",
    x = "Month",
    y = "Average PFOA Level"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p4)

# PLOT 5: Seasonal Patterns and Water Use Analysis (Filtered)
# Monthly patterns - excluding Unknown category
monthly_summary <- df %>%
  filter(water_use != "Unknown") %>%  # Filter out Unknown category
  group_by(month, water_use) %>%
  summarise(
    avg_pfoa = mean(pfoa_val, na.rm = TRUE),
    count = n(),
    .groups = 'drop'
  ) %>%
  filter(count >= 5)  # Only include combinations with sufficient data

# Convert month to factor with proper ordering
month_order <- c("January", "February", "March", "April", "May", "June",
                 "July", "August", "September", "October", "November", "December")
monthly_summary$month <- factor(monthly_summary$month, levels = month_order)

# Create the plot without Unknown category
p4 <- monthly_summary %>%
  ggplot(aes(x = month, y = avg_pfoa, color = water_use, group = water_use)) +
  geom_line(linewidth = 1.2, alpha = 0.8) +
  geom_point(size = 2.5) +
  scale_color_brewer(type = "qual", palette = "Dark2", name = "Water Use Type") +
  labs(
    title = "Monthly Average PFOA Levels by Water Use Category (ng/L, 2019-2022)",
    x = "Month",
    y = "Average PFOA Level"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p4)

# Optional: Check what categories remain after filtering
cat("Remaining water use categories after filtering:\n")
## Remaining water use categories after filtering:
remaining_categories <- monthly_summary %>% 
  distinct(water_use) %>% 
  pull(water_use)
print(sort(remaining_categories))
## [1] Domestic    Irrigation  Observation Production 
## 6 Levels: Domestic Irrigation Observation Production ... Water supply, other
df$state <- as.factor(df$state)
str(df)
## 'data.frame':    1238 obs. of  23 variables:
##  $ pfoa_val               : num  4 4 4 3.7 4 3.7 4 3.7 4 3.7 ...
##  $ pfoa_detected          : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ state                  : Factor w/ 32 levels "alabama","california",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ water_use              : Factor w/ 6 levels "Domestic","Irrigation",..: 4 6 1 1 1 1 1 1 6 1 ...
##  $ tritium_tu             : num  -0.04 -0.03 -0.2 1.96 1.4 2.12 -0.07 0.19 2.4 0.55 ...
##  $ well_depth             : num  152.4 141.7 77.7 54 64 ...
##  $ tritium_category       : Factor w/ 4 levels "Mixed","Modern",..: 3 3 3 2 2 2 3 1 2 1 ...
##  $ month                  : Factor w/ 12 levels "April","August",..: 2 7 7 6 6 6 6 6 6 6 ...
##  $ median_income          : num  50536 50536 50536 50536 50536 ...
##  $ poverty_rate           : num  16.7 16.7 16.7 16.7 16.7 ...
##  $ unemployment_rate      : num  5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 ...
##  $ pct_white              : num  68.1 68.1 68.1 68.1 68.1 ...
##  $ pct_black              : num  26.6 26.6 26.6 26.6 26.6 ...
##  $ pct_hispanic           : num  4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 ...
##  $ pct_bachelor_plus      : num  23.9 23.9 23.9 23.9 23.9 ...
##  $ pct_children           : num  22.5 22.5 22.5 22.5 22.5 ...
##  $ pct_adults             : num  61 61 61 61 61 ...
##  $ pct_older              : num  16.5 16.5 16.5 16.5 16.5 ...
##  $ pop_density_per_sq_mile: num  96.3 96.3 96.3 96.3 96.3 ...
##  $ longitude              : num  -86.8 -86.8 -86.8 -86.8 -86.8 ...
##  $ latitude               : num  32.8 32.8 32.8 32.8 32.8 ...
##  $ cancer_rate            : num  0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
##  $ log_pfoa               : num  1.61 1.61 1.61 1.55 1.61 ...

Correlation

library(corrplot)
library(ggplot2)
library(dplyr)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:maps':
## 
##     map
## The following object is masked from 'package:car':
## 
##     some
# DUAL CORRELATION ANALYSIS

# 1. Select variables (excluding pfoa_val to avoid redundancy with log_pfoa)
numeric_vars <- df %>%
  select(log_pfoa, tritium_tu, well_depth, median_income, 
         poverty_rate, unemployment_rate, pct_white, pct_black, pct_hispanic, pct_bachelor_plus, pct_children, pct_adults, pct_older, 
         pop_density_per_sq_mile, cancer_rate) %>%
  na.omit()

# 2. ANALYSIS 1: log_PFOA as Response Variable
# Extract correlations with log_pfoa
log_pfoa_correlations <- cor(numeric_vars, use = "complete.obs")[,"log_pfoa"] %>%
  as.data.frame() %>%
  rename(correlation = 1) %>%
  mutate(variable = rownames(.)) %>%
  filter(variable != "log_pfoa") %>%
  arrange(desc(abs(correlation)))

# Create bar plot for log_PFOA correlations
p_log_pfoa <- log_pfoa_correlations %>%
  mutate(
    direction = ifelse(correlation > 0, "Positive", "Negative"),
    variable = reorder(variable, correlation),
    abs_corr = abs(correlation)
  ) %>%
  ggplot(aes(x = variable, y = correlation, fill = direction)) +
  geom_col(alpha = 0.8) +
  scale_fill_manual(values = c("Negative" = "#d73027", "Positive" = "#1a9850")) +
  coord_flip() +
  labs(
    title = "Correlations with log(PFOA) - Response Variable Analysis",
    x = "Predictor Variables",
    y = "Correlation Coefficient",
    fill = "Direction"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "bottom"
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5)

print(p_log_pfoa)

# Print top correlations with log_PFOA
cat("=== LOG_PFOA AS RESPONSE VARIABLE ===\n")
## === LOG_PFOA AS RESPONSE VARIABLE ===
cat("Top 5 Positive Correlations with log_PFOA:\n")
## Top 5 Positive Correlations with log_PFOA:
positive_log <- log_pfoa_correlations %>%
  filter(correlation > 0) %>%
  head(5) %>%
  mutate(correlation = round(correlation, 4))
print(positive_log)
##                         correlation                variable
## pct_white                    0.2264               pct_white
## tritium_tu                   0.2119              tritium_tu
## pct_older                    0.1623               pct_older
## pop_density_per_sq_mile      0.1347 pop_density_per_sq_mile
## poverty_rate                 0.0867            poverty_rate
cat("\nTop 5 Negative Correlations with log_PFOA:\n")
## 
## Top 5 Negative Correlations with log_PFOA:
negative_log <- log_pfoa_correlations %>%
  filter(correlation < 0) %>%
  head(5) %>%
  mutate(correlation = round(correlation, 4))
print(negative_log)
##                   correlation          variable
## median_income         -0.1922     median_income
## pct_children          -0.1692      pct_children
## pct_hispanic          -0.1144      pct_hispanic
## well_depth            -0.0917        well_depth
## pct_bachelor_plus     -0.0838 pct_bachelor_plus
# 3. ANALYSIS 2: Predictor-Only Correlation Matrix
# Remove log_pfoa to focus on predictors only
predictors_only <- numeric_vars %>%
  select(-log_pfoa)

predictor_cor_matrix <- cor(predictors_only, use = "complete.obs")

# Create correlation plot for predictors only
corrplot(predictor_cor_matrix, method = "color", type = "upper", 
         order = "hclust", tl.cex = 0.7, tl.col = "black", tl.srt = 45,
         col = colorRampPalette(c("#053061", "#2166ac", "#4393c3", "#92c5de",
                                  "#d1e5f0", "#ffffff", "#fddbc7", "#f4a582",
                                  "#d6604d", "#b2182b", "#67001f"))(200),
         title = "Predictor Variables Correlation Matrix", mar = c(0,0,2,0))

# 4. Find strongest predictor-predictor correlations
predictor_pairs <- which(abs(predictor_cor_matrix) > 0.7 & 
                        predictor_cor_matrix != 1, arr.ind = TRUE)

if(nrow(predictor_pairs) > 0) {
  strong_correlations <- data.frame(
    var1 = rownames(predictor_cor_matrix)[predictor_pairs[,1]],
    var2 = colnames(predictor_cor_matrix)[predictor_pairs[,2]],
    correlation = predictor_cor_matrix[predictor_pairs]
  ) %>%
  arrange(desc(abs(correlation))) %>%
  mutate(correlation = round(correlation, 4))
  
  cat("\n=== PREDICTOR-PREDICTOR RELATIONSHIPS ===\n")
  cat("Strong Correlations (|r| > 0.7) Among Predictors:\n")
  print(strong_correlations)
} else {
  cat("\n=== PREDICTOR-PREDICTOR RELATIONSHIPS ===\n")
  cat("No correlations > 0.7 found among predictors\n")
}
## 
## === PREDICTOR-PREDICTOR RELATIONSHIPS ===
## Strong Correlations (|r| > 0.7) Among Predictors:
##                 var1              var2 correlation
## 1  pct_bachelor_plus     median_income      0.8618
## 2      median_income pct_bachelor_plus      0.8618
## 3  pct_bachelor_plus      poverty_rate     -0.8576
## 4       poverty_rate pct_bachelor_plus     -0.8576
## 5          pct_white unemployment_rate     -0.7858
## 6  unemployment_rate         pct_white     -0.7858
## 7          pct_older      pct_children     -0.7716
## 8       pct_children         pct_older     -0.7716
## 9       pct_hispanic         pct_white     -0.7494
## 10         pct_white      pct_hispanic     -0.7494
## 11         pct_black      poverty_rate      0.7374
## 12      poverty_rate         pct_black      0.7374
## 13        pct_adults     median_income      0.7344
## 14     median_income        pct_adults      0.7344
## 15      poverty_rate     median_income     -0.7261
## 16     median_income      poverty_rate     -0.7261
## 17      pct_hispanic     median_income      0.7195
## 18     median_income      pct_hispanic      0.7195
# 5. Moderate correlations among predictors (0.5 to 0.7)
moderate_pairs <- which(abs(predictor_cor_matrix) > 0.5 & 
                       abs(predictor_cor_matrix) <= 0.7 & 
                       predictor_cor_matrix != 1, arr.ind = TRUE)

if(nrow(moderate_pairs) > 0) {
  moderate_correlations <- data.frame(
    var1 = rownames(predictor_cor_matrix)[moderate_pairs[,1]],
    var2 = colnames(predictor_cor_matrix)[moderate_pairs[,2]],
    correlation = predictor_cor_matrix[moderate_pairs]
  ) %>%
  arrange(desc(abs(correlation))) %>%
  mutate(correlation = round(correlation, 4))
  
  cat("\nModerate Correlations (0.5 < |r| ≤ 0.7) Among Predictors:\n")
  print(head(moderate_correlations, 10))
}
## 
## Moderate Correlations (0.5 < |r| ≤ 0.7) Among Predictors:
##                 var1              var2 correlation
## 1         pct_adults      pct_hispanic      0.6564
## 2       pct_hispanic        pct_adults      0.6564
## 3         pct_adults pct_bachelor_plus      0.6494
## 4  pct_bachelor_plus        pct_adults      0.6494
## 5          pct_black     median_income     -0.6430
## 6      median_income         pct_black     -0.6430
## 7       pct_hispanic         pct_black     -0.6316
## 8          pct_black      pct_hispanic     -0.6316
## 9          pct_white     median_income     -0.6176
## 10     median_income         pct_white     -0.6176
# 6. Statistical significance for log_PFOA correlations
log_pfoa_sig_tests <- predictors_only %>%
  map_dfr(~{
    test_result <- cor.test(numeric_vars$log_pfoa, .x)
    data.frame(
      correlation = test_result$estimate,
      p_value = test_result$p.value,
      significant = test_result$p.value < 0.05,
      conf_low = test_result$conf.int[1],
      conf_high = test_result$conf.int[2]
    )
  }, .id = "variable") %>%
  arrange(desc(abs(correlation))) %>%
  mutate(across(where(is.numeric), round, 4))


cat("\n=== STATISTICAL SIGNIFICANCE ===\n")
## 
## === STATISTICAL SIGNIFICANCE ===
cat("Statistically Significant Correlations with log_PFOA (p < 0.05):\n")
## Statistically Significant Correlations with log_PFOA (p < 0.05):
significant_log <- log_pfoa_sig_tests %>%
  filter(significant == TRUE)
print(significant_log)
##                         variable correlation p_value significant conf_low
## cor...1                pct_white      0.2264  0.0000        TRUE   0.1729
## cor...2               tritium_tu      0.2119  0.0000        TRUE   0.1581
## cor...3            median_income     -0.1922  0.0000        TRUE  -0.2453
## cor...4             pct_children     -0.1692  0.0000        TRUE  -0.2229
## cor...5                pct_older      0.1623  0.0000        TRUE   0.1076
## cor...6  pop_density_per_sq_mile      0.1347  0.0000        TRUE   0.0795
## cor...7             pct_hispanic     -0.1144  0.0001        TRUE  -0.1690
## cor...8               well_depth     -0.0917  0.0012        TRUE  -0.1467
## cor...9             poverty_rate      0.0867  0.0023        TRUE   0.0312
## cor...10       pct_bachelor_plus     -0.0838  0.0032        TRUE  -0.1388
## cor...11             cancer_rate      0.0729  0.0102        TRUE   0.0173
##          conf_high
## cor...1     0.2786
## cor...2     0.2645
## cor...3    -0.1379
## cor...4    -0.1146
## cor...5     0.2161
## cor...6     0.1890
## cor...7    -0.0591
## cor...8    -0.0362
## cor...9     0.1418
## cor...10   -0.0282
## cor...11    0.1281
# Select only numeric columns and exclude unwanted variables
numeric_data <- df %>% 
  select(where(is.numeric)) %>%
  select(-pfoa_val, -latitude, -longitude)  # Remove these specific columns

# Calculate correlation matrix
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Plot correlation heatmap
corrplot(cor_matrix, method = "color", type = "upper", 
         tl.col = "black", tl.cex = 0.8, 
         col = colorRampPalette(c("blue", "white", "red"))(200),
         title = "Correlation Matrix with log_pfoa", mar = c(0,0,1,0))

MODELLING

#Filter out states with less than equal to 3 observations
df_filtered <- df %>%
  group_by(state) %>%
  filter(n() >= 3) %>%
  ungroup()

# Count how many unique states remain after filtering
states_remaining <- df_filtered %>%
  distinct(state) %>%
  nrow()

# Display the result
cat("Number of states remaining after filtering:", states_remaining, "\n")
## Number of states remaining after filtering: 30
# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ plotly::filter()     masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ purrr::map()         masks maps::map()
## ✖ car::recode()        masks dplyr::recode()
## ✖ purrr::some()        masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)

# 1. OVERALL ANALYSIS: PFOA Detection vs Average Cancer Rate

# Summary statistics by PFOA detection status
# Note: cancer_rate is already a calculated average for each area
summary_stats <- df_filtered %>%
  group_by(pfoa_detected) %>%
  summarise(
    n_observations = n(),
    mean_cancer_rate = mean(cancer_rate, na.rm = TRUE),
    sd_cancer_rate = sd(cancer_rate, na.rm = TRUE),
    median_cancer_rate = median(cancer_rate, na.rm = TRUE),
    min_cancer_rate = min(cancer_rate, na.rm = TRUE),
    max_cancer_rate = max(cancer_rate, na.rm = TRUE),
    .groups = 'drop'
  )

print("Summary Statistics - Cancer Rates by PFOA Detection:")
## [1] "Summary Statistics - Cancer Rates by PFOA Detection:"
print(summary_stats)
## # A tibble: 2 × 7
##   pfoa_detected n_observations mean_cancer_rate sd_cancer_rate
##   <fct>                  <int>            <dbl>          <dbl>
## 1 0                        126            0.570          0.181
## 2 1                       1109            0.538          0.187
## # ℹ 3 more variables: median_cancer_rate <dbl>, min_cancer_rate <dbl>,
## #   max_cancer_rate <dbl>
# T-test for difference in cancer rates between detected vs not detected areas
t_test_result <- t.test(cancer_rate ~ pfoa_detected, data = df_filtered)
print("\nT-test Results - PFOA Detection vs Cancer Rates:")
## [1] "\nT-test Results - PFOA Detection vs Cancer Rates:"
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  cancer_rate by pfoa_detected
## t = 1.85, df = 156.86, p-value = 0.0662
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.002140528  0.065390274
## sample estimates:
## mean in group 0 mean in group 1 
##       0.5699206       0.5382958
# Calculate effect size (Cohen's d)
library(effectsize)
## Warning: package 'effectsize' was built under R version 4.4.3
cohens_d <- cohens_d(cancer_rate ~ pfoa_detected, data = df_filtered)
print("\nEffect Size (Cohen's d):")
## [1] "\nEffect Size (Cohen's d):"
print(cohens_d)
## Cohen's d |        95% CI
## -------------------------
## 0.17      | [-0.01, 0.35]
## 
## - Estimated using pooled SD.

p-value = 0.0662 (not statistically significant at α = 0.05) The difference in cancer rates between PFOA detected vs not detected groups is not statistically significant

To examine if there is a statistical significant relationship between log_pfoa and cancer_rate?

# OVERALL ANALYSIS: Log PFOA vs Cancer Rate

# Basic correlation
correlation_result <- cor.test(df_filtered$log_pfoa, df_filtered$cancer_rate, method = "pearson")
print("Correlation between log_pfoa and cancer_rate:")
## [1] "Correlation between log_pfoa and cancer_rate:"
print(correlation_result)
## 
##  Pearson's product-moment correlation
## 
## data:  df_filtered$log_pfoa and df_filtered$cancer_rate
## t = 2.5551, df = 1233, p-value = 0.01074
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01685972 0.12783734
## sample estimates:
##        cor 
## 0.07257316
# Simple linear regression
lm_simple <- lm(cancer_rate ~ log_pfoa, data = df_filtered)
print(summary(lm_simple))
## 
## Call:
## lm(formula = cancer_rate ~ log_pfoa, data = df_filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18955 -0.08401 -0.07536  0.04333  1.03328 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.50337    0.01584  31.771   <2e-16 ***
## log_pfoa     0.02789    0.01092   2.555   0.0107 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1863 on 1233 degrees of freedom
## Multiple R-squared:  0.005267,   Adjusted R-squared:  0.00446 
## F-statistic: 6.528 on 1 and 1233 DF,  p-value: 0.01074

Higher PFOA levels are associated with higher cancer rates This is a dose-response relationship - more PFOA exposure correlates with more cancer

# Load required libraries
library(gt)
## Warning: package 'gt' was built under R version 4.4.3
library(tidyverse)

# Extract results from your linear regression
lm_simple <- lm(cancer_rate ~ log_pfoa, data = df_filtered)
summary_results <- summary(lm_simple)

# Create a clean results table
regression_table <- tibble(
  Variable = c("Intercept", "Log PFOA"),
  Coefficient = c(summary_results$coefficients[1,1], summary_results$coefficients[2,1]),
  `Std Error` = c(summary_results$coefficients[1,2], summary_results$coefficients[2,2]),
  `t-value` = c(summary_results$coefficients[1,3], summary_results$coefficients[2,3]),
  `p-value` = c(summary_results$coefficients[1,4], summary_results$coefficients[2,4]),
  Significance = case_when(
    `p-value` < 0.001 ~ "***",
    `p-value` < 0.01 ~ "**",
    `p-value` < 0.05 ~ "*",
    `p-value` < 0.1 ~ ".",
    TRUE ~ ""
  )
)

# Create model statistics table
model_stats <- tibble(
  Statistic = c("R-squared", "Adjusted R-squared", "F-statistic", "p-value (model)", "Sample Size"),
  Value = c(
    summary_results$r.squared,
    summary_results$adj.r.squared,
    summary_results$fstatistic[1],
    pf(summary_results$fstatistic[1], summary_results$fstatistic[2], summary_results$fstatistic[3], lower.tail = FALSE),
    nrow(df_filtered)
  )
)

# Create the main regression results table
gt_table1 <- regression_table %>%
  gt() %>%
  tab_header(
    title = "Linear Regression Results: Cancer Rate vs Log PFOA",
    subtitle = "Simple linear regression analysis (n = 1,235)"
  ) %>%
  fmt_number(
    columns = c(Coefficient, `Std Error`),
    decimals = 5
  ) %>%
  fmt_number(
    columns = `t-value`,
    decimals = 3
  ) %>%
  fmt_scientific(
    columns = `p-value`,
    decimals = 3
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(
      columns = everything(),
      rows = Variable == "Log PFOA"
    )
  ) %>%
  tab_style(
    style = cell_fill(color = "lightblue"),
    locations = cells_body(
      columns = everything(),
      rows = Variable == "Log PFOA"
    )
  ) %>%
  tab_footnote(
    footnote = "*** p < 0.001, ** p < 0.01, * p < 0.05, . p < 0.1",
    locations = cells_column_labels(columns = Significance)
  ) %>%
  tab_source_note(
    source_note = "Data: Water sampling locations across 30 states (2019-2022)"
  )

print(gt_table1)
## <div id="uwjpqaiwyw" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
##   <style>#uwjpqaiwyw table {
##   font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
##   -webkit-font-smoothing: antialiased;
##   -moz-osx-font-smoothing: grayscale;
## }
## 
## #uwjpqaiwyw thead, #uwjpqaiwyw tbody, #uwjpqaiwyw tfoot, #uwjpqaiwyw tr, #uwjpqaiwyw td, #uwjpqaiwyw th {
##   border-style: none;
## }
## 
## #uwjpqaiwyw p {
##   margin: 0;
##   padding: 0;
## }
## 
## #uwjpqaiwyw .gt_table {
##   display: table;
##   border-collapse: collapse;
##   line-height: normal;
##   margin-left: auto;
##   margin-right: auto;
##   color: #333333;
##   font-size: 16px;
##   font-weight: normal;
##   font-style: normal;
##   background-color: #FFFFFF;
##   width: auto;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #A8A8A8;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #A8A8A8;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_caption {
##   padding-top: 4px;
##   padding-bottom: 4px;
## }
## 
## #uwjpqaiwyw .gt_title {
##   color: #333333;
##   font-size: 125%;
##   font-weight: initial;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-color: #FFFFFF;
##   border-bottom-width: 0;
## }
## 
## #uwjpqaiwyw .gt_subtitle {
##   color: #333333;
##   font-size: 85%;
##   font-weight: initial;
##   padding-top: 3px;
##   padding-bottom: 5px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-top-color: #FFFFFF;
##   border-top-width: 0;
## }
## 
## #uwjpqaiwyw .gt_heading {
##   background-color: #FFFFFF;
##   text-align: center;
##   border-bottom-color: #FFFFFF;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_bottom_border {
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_col_headings {
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_col_heading {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: normal;
##   text-transform: inherit;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: bottom;
##   padding-top: 5px;
##   padding-bottom: 6px;
##   padding-left: 5px;
##   padding-right: 5px;
##   overflow-x: hidden;
## }
## 
## #uwjpqaiwyw .gt_column_spanner_outer {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: normal;
##   text-transform: inherit;
##   padding-top: 0;
##   padding-bottom: 0;
##   padding-left: 4px;
##   padding-right: 4px;
## }
## 
## #uwjpqaiwyw .gt_column_spanner_outer:first-child {
##   padding-left: 0;
## }
## 
## #uwjpqaiwyw .gt_column_spanner_outer:last-child {
##   padding-right: 0;
## }
## 
## #uwjpqaiwyw .gt_column_spanner {
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   vertical-align: bottom;
##   padding-top: 5px;
##   padding-bottom: 5px;
##   overflow-x: hidden;
##   display: inline-block;
##   width: 100%;
## }
## 
## #uwjpqaiwyw .gt_spanner_row {
##   border-bottom-style: hidden;
## }
## 
## #uwjpqaiwyw .gt_group_heading {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: middle;
##   text-align: left;
## }
## 
## #uwjpqaiwyw .gt_empty_group_heading {
##   padding: 0.5px;
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   vertical-align: middle;
## }
## 
## #uwjpqaiwyw .gt_from_md > :first-child {
##   margin-top: 0;
## }
## 
## #uwjpqaiwyw .gt_from_md > :last-child {
##   margin-bottom: 0;
## }
## 
## #uwjpqaiwyw .gt_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   margin: 10px;
##   border-top-style: solid;
##   border-top-width: 1px;
##   border-top-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: middle;
##   overflow-x: hidden;
## }
## 
## #uwjpqaiwyw .gt_stub {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-right-style: solid;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #uwjpqaiwyw .gt_stub_row_group {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-right-style: solid;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   padding-left: 5px;
##   padding-right: 5px;
##   vertical-align: top;
## }
## 
## #uwjpqaiwyw .gt_row_group_first td {
##   border-top-width: 2px;
## }
## 
## #uwjpqaiwyw .gt_row_group_first th {
##   border-top-width: 2px;
## }
## 
## #uwjpqaiwyw .gt_summary_row {
##   color: #333333;
##   background-color: #FFFFFF;
##   text-transform: inherit;
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #uwjpqaiwyw .gt_first_summary_row {
##   border-top-style: solid;
##   border-top-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_first_summary_row.thick {
##   border-top-width: 2px;
## }
## 
## #uwjpqaiwyw .gt_last_summary_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_grand_summary_row {
##   color: #333333;
##   background-color: #FFFFFF;
##   text-transform: inherit;
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #uwjpqaiwyw .gt_first_grand_summary_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-top-style: double;
##   border-top-width: 6px;
##   border-top-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_last_grand_summary_row_top {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-style: double;
##   border-bottom-width: 6px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_striped {
##   background-color: rgba(128, 128, 128, 0.05);
## }
## 
## #uwjpqaiwyw .gt_table_body {
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_footnotes {
##   color: #333333;
##   background-color: #FFFFFF;
##   border-bottom-style: none;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_footnote {
##   margin: 0px;
##   font-size: 90%;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #uwjpqaiwyw .gt_sourcenotes {
##   color: #333333;
##   background-color: #FFFFFF;
##   border-bottom-style: none;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
## }
## 
## #uwjpqaiwyw .gt_sourcenote {
##   font-size: 90%;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #uwjpqaiwyw .gt_left {
##   text-align: left;
## }
## 
## #uwjpqaiwyw .gt_center {
##   text-align: center;
## }
## 
## #uwjpqaiwyw .gt_right {
##   text-align: right;
##   font-variant-numeric: tabular-nums;
## }
## 
## #uwjpqaiwyw .gt_font_normal {
##   font-weight: normal;
## }
## 
## #uwjpqaiwyw .gt_font_bold {
##   font-weight: bold;
## }
## 
## #uwjpqaiwyw .gt_font_italic {
##   font-style: italic;
## }
## 
## #uwjpqaiwyw .gt_super {
##   font-size: 65%;
## }
## 
## #uwjpqaiwyw .gt_footnote_marks {
##   font-size: 75%;
##   vertical-align: 0.4em;
##   position: initial;
## }
## 
## #uwjpqaiwyw .gt_asterisk {
##   font-size: 100%;
##   vertical-align: 0;
## }
## 
## #uwjpqaiwyw .gt_indent_1 {
##   text-indent: 5px;
## }
## 
## #uwjpqaiwyw .gt_indent_2 {
##   text-indent: 10px;
## }
## 
## #uwjpqaiwyw .gt_indent_3 {
##   text-indent: 15px;
## }
## 
## #uwjpqaiwyw .gt_indent_4 {
##   text-indent: 20px;
## }
## 
## #uwjpqaiwyw .gt_indent_5 {
##   text-indent: 25px;
## }
## 
## #uwjpqaiwyw .katex-display {
##   display: inline-flex !important;
##   margin-bottom: 0.75em !important;
## }
## 
## #uwjpqaiwyw div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
##   height: 0px !important;
## }
## </style>
##   <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
##   <thead>
##     <tr class="gt_heading">
##       <td colspan="6" class="gt_heading gt_title gt_font_normal" style>Linear Regression Results: Cancer Rate vs Log PFOA</td>
##     </tr>
##     <tr class="gt_heading">
##       <td colspan="6" class="gt_heading gt_subtitle gt_font_normal gt_bottom_border" style>Simple linear regression analysis (n = 1,235)</td>
##     </tr>
##     <tr class="gt_col_headings">
##       <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="Variable">Variable</th>
##       <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="Coefficient">Coefficient</th>
##       <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="Std-Error">Std Error</th>
##       <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="t-value">t-value</th>
##       <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="p-value">p-value</th>
##       <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="Significance">Significance<span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span></th>
##     </tr>
##   </thead>
##   <tbody class="gt_table_body">
##     <tr><td headers="Variable" class="gt_row gt_left">Intercept</td>
## <td headers="Coefficient" class="gt_row gt_right">0.50337</td>
## <td headers="Std Error" class="gt_row gt_right">0.01584</td>
## <td headers="t-value" class="gt_row gt_right">31.771</td>
## <td headers="p-value" class="gt_row gt_right">2.462&nbsp;×&nbsp;10<sup style='font-size: 65%;'>−162</sup></td>
## <td headers="Significance" class="gt_row gt_right">***</td></tr>
##     <tr><td headers="Variable" class="gt_row gt_left" style="font-weight: bold; background-color: #ADD8E6;">Log PFOA</td>
## <td headers="Coefficient" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">0.02789</td>
## <td headers="Std Error" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">0.01092</td>
## <td headers="t-value" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">2.555</td>
## <td headers="p-value" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">1.074&nbsp;×&nbsp;10<sup style='font-size: 65%;'>−2</sup></td>
## <td headers="Significance" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">*</td></tr>
##   </tbody>
##   <tfoot class="gt_sourcenotes">
##     <tr>
##       <td class="gt_sourcenote" colspan="6">Data: Water sampling locations across 30 states (2019-2022)</td>
##     </tr>
##   </tfoot>
##   <tfoot class="gt_footnotes">
##     <tr>
##       <td class="gt_footnote" colspan="6"><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span> *** p &lt; 0.001, ** p &lt; 0.01, * p &lt; 0.05, . p &lt; 0.1</td>
##     </tr>
##   </tfoot>
## </table>
## </div>
# Create model statistics table
gt_table2 <- model_stats %>%
  gt() %>%
  tab_header(
    title = "Model Performance Statistics"
  ) %>%
  fmt_number(
    columns = Value,
    decimals = 4,
    rows = 1:3
  ) %>%
  fmt_scientific(
    columns = Value,
    decimals = 3,
    rows = 4
  ) %>%
  fmt_number(
    columns = Value,
    decimals = 0,
    rows = 5
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(
      rows = Statistic %in% c("p-value (model)", "R-squared")
    )
  )

print(gt_table2)
## <div id="cccxryjpbj" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
##   <style>#cccxryjpbj table {
##   font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
##   -webkit-font-smoothing: antialiased;
##   -moz-osx-font-smoothing: grayscale;
## }
## 
## #cccxryjpbj thead, #cccxryjpbj tbody, #cccxryjpbj tfoot, #cccxryjpbj tr, #cccxryjpbj td, #cccxryjpbj th {
##   border-style: none;
## }
## 
## #cccxryjpbj p {
##   margin: 0;
##   padding: 0;
## }
## 
## #cccxryjpbj .gt_table {
##   display: table;
##   border-collapse: collapse;
##   line-height: normal;
##   margin-left: auto;
##   margin-right: auto;
##   color: #333333;
##   font-size: 16px;
##   font-weight: normal;
##   font-style: normal;
##   background-color: #FFFFFF;
##   width: auto;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #A8A8A8;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #A8A8A8;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_caption {
##   padding-top: 4px;
##   padding-bottom: 4px;
## }
## 
## #cccxryjpbj .gt_title {
##   color: #333333;
##   font-size: 125%;
##   font-weight: initial;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-color: #FFFFFF;
##   border-bottom-width: 0;
## }
## 
## #cccxryjpbj .gt_subtitle {
##   color: #333333;
##   font-size: 85%;
##   font-weight: initial;
##   padding-top: 3px;
##   padding-bottom: 5px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-top-color: #FFFFFF;
##   border-top-width: 0;
## }
## 
## #cccxryjpbj .gt_heading {
##   background-color: #FFFFFF;
##   text-align: center;
##   border-bottom-color: #FFFFFF;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_bottom_border {
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_col_headings {
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_col_heading {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: normal;
##   text-transform: inherit;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: bottom;
##   padding-top: 5px;
##   padding-bottom: 6px;
##   padding-left: 5px;
##   padding-right: 5px;
##   overflow-x: hidden;
## }
## 
## #cccxryjpbj .gt_column_spanner_outer {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: normal;
##   text-transform: inherit;
##   padding-top: 0;
##   padding-bottom: 0;
##   padding-left: 4px;
##   padding-right: 4px;
## }
## 
## #cccxryjpbj .gt_column_spanner_outer:first-child {
##   padding-left: 0;
## }
## 
## #cccxryjpbj .gt_column_spanner_outer:last-child {
##   padding-right: 0;
## }
## 
## #cccxryjpbj .gt_column_spanner {
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   vertical-align: bottom;
##   padding-top: 5px;
##   padding-bottom: 5px;
##   overflow-x: hidden;
##   display: inline-block;
##   width: 100%;
## }
## 
## #cccxryjpbj .gt_spanner_row {
##   border-bottom-style: hidden;
## }
## 
## #cccxryjpbj .gt_group_heading {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: middle;
##   text-align: left;
## }
## 
## #cccxryjpbj .gt_empty_group_heading {
##   padding: 0.5px;
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   vertical-align: middle;
## }
## 
## #cccxryjpbj .gt_from_md > :first-child {
##   margin-top: 0;
## }
## 
## #cccxryjpbj .gt_from_md > :last-child {
##   margin-bottom: 0;
## }
## 
## #cccxryjpbj .gt_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   margin: 10px;
##   border-top-style: solid;
##   border-top-width: 1px;
##   border-top-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: middle;
##   overflow-x: hidden;
## }
## 
## #cccxryjpbj .gt_stub {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-right-style: solid;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #cccxryjpbj .gt_stub_row_group {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-right-style: solid;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   padding-left: 5px;
##   padding-right: 5px;
##   vertical-align: top;
## }
## 
## #cccxryjpbj .gt_row_group_first td {
##   border-top-width: 2px;
## }
## 
## #cccxryjpbj .gt_row_group_first th {
##   border-top-width: 2px;
## }
## 
## #cccxryjpbj .gt_summary_row {
##   color: #333333;
##   background-color: #FFFFFF;
##   text-transform: inherit;
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #cccxryjpbj .gt_first_summary_row {
##   border-top-style: solid;
##   border-top-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_first_summary_row.thick {
##   border-top-width: 2px;
## }
## 
## #cccxryjpbj .gt_last_summary_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_grand_summary_row {
##   color: #333333;
##   background-color: #FFFFFF;
##   text-transform: inherit;
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #cccxryjpbj .gt_first_grand_summary_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-top-style: double;
##   border-top-width: 6px;
##   border-top-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_last_grand_summary_row_top {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-style: double;
##   border-bottom-width: 6px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_striped {
##   background-color: rgba(128, 128, 128, 0.05);
## }
## 
## #cccxryjpbj .gt_table_body {
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_footnotes {
##   color: #333333;
##   background-color: #FFFFFF;
##   border-bottom-style: none;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_footnote {
##   margin: 0px;
##   font-size: 90%;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #cccxryjpbj .gt_sourcenotes {
##   color: #333333;
##   background-color: #FFFFFF;
##   border-bottom-style: none;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
## }
## 
## #cccxryjpbj .gt_sourcenote {
##   font-size: 90%;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #cccxryjpbj .gt_left {
##   text-align: left;
## }
## 
## #cccxryjpbj .gt_center {
##   text-align: center;
## }
## 
## #cccxryjpbj .gt_right {
##   text-align: right;
##   font-variant-numeric: tabular-nums;
## }
## 
## #cccxryjpbj .gt_font_normal {
##   font-weight: normal;
## }
## 
## #cccxryjpbj .gt_font_bold {
##   font-weight: bold;
## }
## 
## #cccxryjpbj .gt_font_italic {
##   font-style: italic;
## }
## 
## #cccxryjpbj .gt_super {
##   font-size: 65%;
## }
## 
## #cccxryjpbj .gt_footnote_marks {
##   font-size: 75%;
##   vertical-align: 0.4em;
##   position: initial;
## }
## 
## #cccxryjpbj .gt_asterisk {
##   font-size: 100%;
##   vertical-align: 0;
## }
## 
## #cccxryjpbj .gt_indent_1 {
##   text-indent: 5px;
## }
## 
## #cccxryjpbj .gt_indent_2 {
##   text-indent: 10px;
## }
## 
## #cccxryjpbj .gt_indent_3 {
##   text-indent: 15px;
## }
## 
## #cccxryjpbj .gt_indent_4 {
##   text-indent: 20px;
## }
## 
## #cccxryjpbj .gt_indent_5 {
##   text-indent: 25px;
## }
## 
## #cccxryjpbj .katex-display {
##   display: inline-flex !important;
##   margin-bottom: 0.75em !important;
## }
## 
## #cccxryjpbj div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
##   height: 0px !important;
## }
## </style>
##   <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
##   <thead>
##     <tr class="gt_heading">
##       <td colspan="2" class="gt_heading gt_title gt_font_normal gt_bottom_border" style>Model Performance Statistics</td>
##     </tr>
##     
##     <tr class="gt_col_headings">
##       <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="Statistic">Statistic</th>
##       <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="Value">Value</th>
##     </tr>
##   </thead>
##   <tbody class="gt_table_body">
##     <tr><td headers="Statistic" class="gt_row gt_left" style="font-weight: bold;">R-squared</td>
## <td headers="Value" class="gt_row gt_right" style="font-weight: bold;">0.0053</td></tr>
##     <tr><td headers="Statistic" class="gt_row gt_left">Adjusted R-squared</td>
## <td headers="Value" class="gt_row gt_right">0.0045</td></tr>
##     <tr><td headers="Statistic" class="gt_row gt_left">F-statistic</td>
## <td headers="Value" class="gt_row gt_right">6.5284</td></tr>
##     <tr><td headers="Statistic" class="gt_row gt_left" style="font-weight: bold;">p-value (model)</td>
## <td headers="Value" class="gt_row gt_right" style="font-weight: bold;">1.074&nbsp;×&nbsp;10<sup style='font-size: 65%;'>−2</sup></td></tr>
##     <tr><td headers="Statistic" class="gt_row gt_left">Sample Size</td>
## <td headers="Value" class="gt_row gt_right">1,235</td></tr>
##   </tbody>
##   
##   
## </table>
## </div>
# Create interpretation summary table
interpretation_table <- tibble(
  `Key Finding` = c(
    "Statistical Significance",
    "Direction of Relationship", 
    "Effect Size",
    "Practical Interpretation",
    "Model Fit"
  ),
  `Result` = c(
    "p = 0.011 (statistically significant)",
    "Positive association (β = 0.02789)",
    "Small but consistent effect",
    "1-unit increase in log PFOA → 0.028 increase in cancer rate",
    "R² = 0.53% (small effect, large sample)"
  )
)

gt_table3 <- interpretation_table %>%
  gt() %>%
  tab_header(
    title = "Key Findings Summary",
    subtitle = "Log PFOA levels and cancer rate relationship"
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = `Key Finding`)
  ) %>%
  tab_style(
    style = cell_fill(color = "lightyellow"),
    locations = cells_body(rows = `Key Finding` == "Statistical Significance")
  )

print(gt_table3)
## <div id="wmedzglkcm" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
##   <style>#wmedzglkcm table {
##   font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
##   -webkit-font-smoothing: antialiased;
##   -moz-osx-font-smoothing: grayscale;
## }
## 
## #wmedzglkcm thead, #wmedzglkcm tbody, #wmedzglkcm tfoot, #wmedzglkcm tr, #wmedzglkcm td, #wmedzglkcm th {
##   border-style: none;
## }
## 
## #wmedzglkcm p {
##   margin: 0;
##   padding: 0;
## }
## 
## #wmedzglkcm .gt_table {
##   display: table;
##   border-collapse: collapse;
##   line-height: normal;
##   margin-left: auto;
##   margin-right: auto;
##   color: #333333;
##   font-size: 16px;
##   font-weight: normal;
##   font-style: normal;
##   background-color: #FFFFFF;
##   width: auto;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #A8A8A8;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #A8A8A8;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_caption {
##   padding-top: 4px;
##   padding-bottom: 4px;
## }
## 
## #wmedzglkcm .gt_title {
##   color: #333333;
##   font-size: 125%;
##   font-weight: initial;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-color: #FFFFFF;
##   border-bottom-width: 0;
## }
## 
## #wmedzglkcm .gt_subtitle {
##   color: #333333;
##   font-size: 85%;
##   font-weight: initial;
##   padding-top: 3px;
##   padding-bottom: 5px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-top-color: #FFFFFF;
##   border-top-width: 0;
## }
## 
## #wmedzglkcm .gt_heading {
##   background-color: #FFFFFF;
##   text-align: center;
##   border-bottom-color: #FFFFFF;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_bottom_border {
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_col_headings {
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_col_heading {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: normal;
##   text-transform: inherit;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: bottom;
##   padding-top: 5px;
##   padding-bottom: 6px;
##   padding-left: 5px;
##   padding-right: 5px;
##   overflow-x: hidden;
## }
## 
## #wmedzglkcm .gt_column_spanner_outer {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: normal;
##   text-transform: inherit;
##   padding-top: 0;
##   padding-bottom: 0;
##   padding-left: 4px;
##   padding-right: 4px;
## }
## 
## #wmedzglkcm .gt_column_spanner_outer:first-child {
##   padding-left: 0;
## }
## 
## #wmedzglkcm .gt_column_spanner_outer:last-child {
##   padding-right: 0;
## }
## 
## #wmedzglkcm .gt_column_spanner {
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   vertical-align: bottom;
##   padding-top: 5px;
##   padding-bottom: 5px;
##   overflow-x: hidden;
##   display: inline-block;
##   width: 100%;
## }
## 
## #wmedzglkcm .gt_spanner_row {
##   border-bottom-style: hidden;
## }
## 
## #wmedzglkcm .gt_group_heading {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: middle;
##   text-align: left;
## }
## 
## #wmedzglkcm .gt_empty_group_heading {
##   padding: 0.5px;
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   vertical-align: middle;
## }
## 
## #wmedzglkcm .gt_from_md > :first-child {
##   margin-top: 0;
## }
## 
## #wmedzglkcm .gt_from_md > :last-child {
##   margin-bottom: 0;
## }
## 
## #wmedzglkcm .gt_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   margin: 10px;
##   border-top-style: solid;
##   border-top-width: 1px;
##   border-top-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: middle;
##   overflow-x: hidden;
## }
## 
## #wmedzglkcm .gt_stub {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-right-style: solid;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #wmedzglkcm .gt_stub_row_group {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-right-style: solid;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   padding-left: 5px;
##   padding-right: 5px;
##   vertical-align: top;
## }
## 
## #wmedzglkcm .gt_row_group_first td {
##   border-top-width: 2px;
## }
## 
## #wmedzglkcm .gt_row_group_first th {
##   border-top-width: 2px;
## }
## 
## #wmedzglkcm .gt_summary_row {
##   color: #333333;
##   background-color: #FFFFFF;
##   text-transform: inherit;
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #wmedzglkcm .gt_first_summary_row {
##   border-top-style: solid;
##   border-top-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_first_summary_row.thick {
##   border-top-width: 2px;
## }
## 
## #wmedzglkcm .gt_last_summary_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_grand_summary_row {
##   color: #333333;
##   background-color: #FFFFFF;
##   text-transform: inherit;
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #wmedzglkcm .gt_first_grand_summary_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-top-style: double;
##   border-top-width: 6px;
##   border-top-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_last_grand_summary_row_top {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-style: double;
##   border-bottom-width: 6px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_striped {
##   background-color: rgba(128, 128, 128, 0.05);
## }
## 
## #wmedzglkcm .gt_table_body {
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_footnotes {
##   color: #333333;
##   background-color: #FFFFFF;
##   border-bottom-style: none;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_footnote {
##   margin: 0px;
##   font-size: 90%;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #wmedzglkcm .gt_sourcenotes {
##   color: #333333;
##   background-color: #FFFFFF;
##   border-bottom-style: none;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
## }
## 
## #wmedzglkcm .gt_sourcenote {
##   font-size: 90%;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #wmedzglkcm .gt_left {
##   text-align: left;
## }
## 
## #wmedzglkcm .gt_center {
##   text-align: center;
## }
## 
## #wmedzglkcm .gt_right {
##   text-align: right;
##   font-variant-numeric: tabular-nums;
## }
## 
## #wmedzglkcm .gt_font_normal {
##   font-weight: normal;
## }
## 
## #wmedzglkcm .gt_font_bold {
##   font-weight: bold;
## }
## 
## #wmedzglkcm .gt_font_italic {
##   font-style: italic;
## }
## 
## #wmedzglkcm .gt_super {
##   font-size: 65%;
## }
## 
## #wmedzglkcm .gt_footnote_marks {
##   font-size: 75%;
##   vertical-align: 0.4em;
##   position: initial;
## }
## 
## #wmedzglkcm .gt_asterisk {
##   font-size: 100%;
##   vertical-align: 0;
## }
## 
## #wmedzglkcm .gt_indent_1 {
##   text-indent: 5px;
## }
## 
## #wmedzglkcm .gt_indent_2 {
##   text-indent: 10px;
## }
## 
## #wmedzglkcm .gt_indent_3 {
##   text-indent: 15px;
## }
## 
## #wmedzglkcm .gt_indent_4 {
##   text-indent: 20px;
## }
## 
## #wmedzglkcm .gt_indent_5 {
##   text-indent: 25px;
## }
## 
## #wmedzglkcm .katex-display {
##   display: inline-flex !important;
##   margin-bottom: 0.75em !important;
## }
## 
## #wmedzglkcm div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
##   height: 0px !important;
## }
## </style>
##   <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
##   <thead>
##     <tr class="gt_heading">
##       <td colspan="2" class="gt_heading gt_title gt_font_normal" style>Key Findings Summary</td>
##     </tr>
##     <tr class="gt_heading">
##       <td colspan="2" class="gt_heading gt_subtitle gt_font_normal gt_bottom_border" style>Log PFOA levels and cancer rate relationship</td>
##     </tr>
##     <tr class="gt_col_headings">
##       <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="Key-Finding">Key Finding</th>
##       <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="Result">Result</th>
##     </tr>
##   </thead>
##   <tbody class="gt_table_body">
##     <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold; background-color: #FFFFE0;">Statistical Significance</td>
## <td headers="Result" class="gt_row gt_left" style="background-color: #FFFFE0;">p = 0.011 (statistically significant)</td></tr>
##     <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold;">Direction of Relationship</td>
## <td headers="Result" class="gt_row gt_left">Positive association (β = 0.02789)</td></tr>
##     <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold;">Effect Size</td>
## <td headers="Result" class="gt_row gt_left">Small but consistent effect</td></tr>
##     <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold;">Practical Interpretation</td>
## <td headers="Result" class="gt_row gt_left">1-unit increase in log PFOA → 0.028 increase in cancer rate</td></tr>
##     <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold;">Model Fit</td>
## <td headers="Result" class="gt_row gt_left">R² = 0.53% (small effect, large sample)</td></tr>
##   </tbody>
##   
##   
## </table>
## </div>
# Print code to save tables for PowerPoint
cat("\n# To save tables for PowerPoint, run:\n")
## 
## # To save tables for PowerPoint, run:
cat("gtsave(gt_table1, 'regression_results.png', vwidth = 800, vheight = 400)\n")
## gtsave(gt_table1, 'regression_results.png', vwidth = 800, vheight = 400)
cat("gtsave(gt_table2, 'model_stats.png', vwidth = 600, vheight = 300)\n")
## gtsave(gt_table2, 'model_stats.png', vwidth = 600, vheight = 300)
cat("gtsave(gt_table3, 'interpretation.png', vwidth = 800, vheight = 350)\n")
## gtsave(gt_table3, 'interpretation.png', vwidth = 800, vheight = 350)
# Extract results from your linear regression
lm_simple <- lm(cancer_rate ~ log_pfoa, data = df_filtered)
summary_results <- summary(lm_simple)

# Create a clean results dataframe
regression_results <- data.frame(
  Variable = c("Intercept", "Log PFOA"),
  Coefficient = round(summary_results$coefficients[,1], 5),
  Std_Error = round(summary_results$coefficients[,2], 5),
  t_value = round(summary_results$coefficients[,3], 3),
  p_value = round(summary_results$coefficients[,4], 4),
  Significance = c(
    ifelse(summary_results$coefficients[1,4] < 0.001, "***",
           ifelse(summary_results$coefficients[1,4] < 0.01, "**",
                  ifelse(summary_results$coefficients[1,4] < 0.05, "*", ""))),
    ifelse(summary_results$coefficients[2,4] < 0.001, "***",
           ifelse(summary_results$coefficients[2,4] < 0.01, "**",
                  ifelse(summary_results$coefficients[2,4] < 0.05, "*", "")))
  )
)

# Print the table
cat("Linear Regression: Cancer Rate ~ Log PFOA\n")
## Linear Regression: Cancer Rate ~ Log PFOA
cat("==========================================\n")
## ==========================================
print(regression_results, row.names = FALSE)
##   Variable Coefficient Std_Error t_value p_value Significance
##  Intercept     0.50337   0.01584  31.771  0.0000          ***
##   Log PFOA     0.02789   0.01092   2.555  0.0107            *
# Add model statistics
cat("\nModel Statistics:\n")
## 
## Model Statistics:
cat("=================\n")
## =================
cat("R-squared:", round(summary_results$r.squared, 6), "\n")
## R-squared: 0.005267
cat("Adjusted R-squared:", round(summary_results$adj.r.squared, 6), "\n") 
## Adjusted R-squared: 0.00446
cat("F-statistic:", round(summary_results$fstatistic[1], 3), 
    "on", summary_results$fstatistic[2], "and", summary_results$fstatistic[3], "DF\n")
## F-statistic: 6.528 on 1 and 1233 DF
cat("p-value:", round(pf(summary_results$fstatistic[1], 
                       summary_results$fstatistic[2], 
                       summary_results$fstatistic[3], 
                       lower.tail = FALSE), 4), "\n")
## p-value: 0.0107
cat("Sample size:", nrow(df_filtered), "\n")
## Sample size: 1235
# Create a copy-paste friendly version
cat("\n\nCopy-Paste Friendly Table:\n")
## 
## 
## Copy-Paste Friendly Table:
cat("==========================\n")
## ==========================
cat("Variable\tCoefficient\tStd Error\tt-value\tp-value\tSignificance\n")
## Variable Coefficient Std Error   t-value p-value Significance
cat("Intercept\t0.50337\t0.01584\t31.771\t<0.0001\t***\n")
## Intercept    0.50337 0.01584 31.771  <0.0001 ***
cat("Log PFOA\t0.02789\t0.01092\t2.555\t0.0107\t*\n")
## Log PFOA 0.02789 0.01092 2.555   0.0107  *
cat("\n")
cat("R² = 0.005267, F = 6.528, p = 0.0107, n = 1,235\n")
## R² = 0.005267, F = 6.528, p = 0.0107, n = 1,235

LINEAR REGRESSION MODEL

# Step 1: Fit a full model with all predictors
full_model <- lm(log_pfoa ~ .-pfoa_val-pfoa_detected-tritium_category-month-state-latitude-longitude, data = df_filtered)

# Step 2: Fit a null model with only the intercept
null_model <- lm(log_pfoa ~ 1, data = df_filtered)

# Step 3: Perform stepwise regression (both directions)
step_model <- step(null_model, 
                   scope = list(lower = null_model, upper = full_model), 
                   direction = "both",
                   trace = TRUE)
## Start:  AIC=-1782.11
## log_pfoa ~ 1
## 
##                           Df Sum of Sq    RSS     AIC
## + pct_white                1   15.1094 276.15 -1845.9
## + tritium_tu               1   13.0066 278.25 -1836.5
## + median_income            1   10.6492 280.61 -1826.1
## + pct_children             1    8.4056 282.85 -1816.3
## + pct_older                1    7.5947 283.66 -1812.7
## + pop_density_per_sq_mile  1    5.2520 286.00 -1802.6
## + pct_hispanic             1    3.8754 287.38 -1796.7
## + well_depth               1    2.4368 288.82 -1790.5
## + water_use                5    4.2522 287.00 -1790.3
## + poverty_rate             1    2.0759 289.18 -1789.0
## + pct_bachelor_plus        1    1.9224 289.33 -1788.3
## + cancer_rate              1    1.5340 289.72 -1786.6
## + unemployment_rate        1    0.8110 290.44 -1783.6
## <none>                                 291.25 -1782.1
## + pct_adults               1    0.0950 291.16 -1780.5
## + pct_black                1    0.0199 291.24 -1780.2
## 
## Step:  AIC=-1845.9
## log_pfoa ~ pct_white
## 
##                           Df Sum of Sq    RSS     AIC
## + unemployment_rate        1   12.2045 263.94 -1899.7
## + pct_children             1    7.2066 268.94 -1876.6
## + pct_adults               1    6.8976 269.25 -1875.1
## + water_use                5    6.9649 269.18 -1867.5
## + pop_density_per_sq_mile  1    4.9470 271.20 -1866.2
## + tritium_tu               1    4.9158 271.23 -1866.1
## + poverty_rate             1    2.4262 273.72 -1854.8
## + pct_hispanic             1    2.1002 274.05 -1853.3
## + pct_older                1    1.3170 274.83 -1849.8
## + median_income            1    1.1880 274.96 -1849.2
## + cancer_rate              1    0.5576 275.59 -1846.4
## <none>                                 276.15 -1845.9
## + pct_bachelor_plus        1    0.0660 276.08 -1844.2
## + pct_black                1    0.0406 276.11 -1844.1
## + well_depth               1    0.0002 276.14 -1843.9
## - pct_white                1   15.1094 291.25 -1782.1
## 
## Step:  AIC=-1899.73
## log_pfoa ~ pct_white + unemployment_rate
## 
##                           Df Sum of Sq    RSS     AIC
## + pct_adults               1    9.3076 254.63 -1942.1
## + water_use                5    7.4193 256.52 -1924.9
## + pop_density_per_sq_mile  1    4.8511 259.09 -1920.6
## + tritium_tu               1    4.5882 259.35 -1919.4
## + pct_bachelor_plus        1    4.5610 259.38 -1919.3
## + pct_hispanic             1    3.4454 260.50 -1914.0
## + pct_children             1    3.3646 260.58 -1913.6
## + cancer_rate              1    1.6806 262.26 -1905.6
## + median_income            1    1.3161 262.62 -1903.9
## + pct_black                1    1.1865 262.75 -1903.3
## + poverty_rate             1    0.9063 263.04 -1902.0
## + well_depth               1    0.4402 263.50 -1899.8
## <none>                                 263.94 -1899.7
## + pct_older                1    0.0005 263.94 -1897.7
## - unemployment_rate        1   12.2045 276.15 -1845.9
## - pct_white                1   26.5029 290.44 -1783.6
## 
## Step:  AIC=-1942.07
## log_pfoa ~ pct_white + unemployment_rate + pct_adults
## 
##                           Df Sum of Sq    RSS     AIC
## + tritium_tu               1     3.095 251.54 -1955.2
## + water_use                5     4.695 249.94 -1955.0
## + pop_density_per_sq_mile  1     1.121 253.51 -1945.5
## + cancer_rate              1     1.001 253.63 -1944.9
## + pct_older                1     0.953 253.68 -1944.7
## + pct_children             1     0.944 253.69 -1944.7
## + median_income            1     0.923 253.71 -1944.5
## + poverty_rate             1     0.732 253.90 -1943.6
## + pct_hispanic             1     0.581 254.05 -1942.9
## + pct_black                1     0.580 254.05 -1942.9
## <none>                                 254.63 -1942.1
## + well_depth               1     0.185 254.45 -1941.0
## + pct_bachelor_plus        1     0.007 254.63 -1940.1
## - pct_adults               1     9.308 263.94 -1899.7
## - unemployment_rate        1    14.615 269.25 -1875.1
## - pct_white                1    35.804 290.44 -1781.6
## 
## Step:  AIC=-1955.17
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu
## 
##                           Df Sum of Sq    RSS     AIC
## + water_use                5    5.9181 245.62 -1974.6
## + median_income            1    1.3642 250.17 -1959.9
## + pct_older                1    1.3142 250.22 -1959.6
## + pct_children             1    1.3037 250.24 -1959.6
## + pop_density_per_sq_mile  1    1.2969 250.24 -1959.5
## + poverty_rate             1    1.0151 250.52 -1958.2
## + pct_black                1    0.8009 250.74 -1957.1
## + cancer_rate              1    0.7938 250.74 -1957.1
## + pct_hispanic             1    0.7356 250.80 -1956.8
## + well_depth               1    0.4848 251.05 -1955.5
## <none>                                 251.54 -1955.2
## + pct_bachelor_plus        1    0.0044 251.53 -1953.2
## - tritium_tu               1    3.0954 254.63 -1942.1
## - pct_adults               1    7.8148 259.35 -1919.4
## - unemployment_rate        1   14.1031 265.64 -1889.8
## - pct_white                1   26.6251 278.16 -1832.9
## 
## Step:  AIC=-1974.57
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu + 
##     water_use
## 
##                           Df Sum of Sq    RSS     AIC
## + median_income            1    1.7237 243.90 -1981.3
## + poverty_rate             1    1.5606 244.06 -1980.5
## + pop_density_per_sq_mile  1    1.4992 244.12 -1980.1
## + pct_black                1    1.2272 244.39 -1978.8
## + pct_older                1    1.0279 244.59 -1977.8
## + pct_children             1    1.0195 244.60 -1977.7
## + cancer_rate              1    0.5733 245.05 -1975.5
## <none>                                 245.62 -1974.6
## + pct_hispanic             1    0.3589 245.26 -1974.4
## + well_depth               1    0.0441 245.58 -1972.8
## + pct_bachelor_plus        1    0.0001 245.62 -1972.6
## - water_use                5    5.9181 251.54 -1955.2
## - tritium_tu               1    4.3186 249.94 -1955.0
## - pct_adults               1    4.8119 250.43 -1952.6
## - unemployment_rate        1   13.8328 259.45 -1908.9
## - pct_white                1   26.6354 272.25 -1849.4
## 
## Step:  AIC=-1981.27
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu + 
##     water_use + median_income
## 
##                           Df Sum of Sq    RSS     AIC
## + pop_density_per_sq_mile  1    3.5326 240.36 -1997.3
## + pct_bachelor_plus        1    2.5194 241.38 -1992.1
## + pct_hispanic             1    1.7058 242.19 -1987.9
## + pct_older                1    0.8769 243.02 -1983.7
## + pct_children             1    0.8698 243.03 -1983.7
## + cancer_rate              1    0.8366 243.06 -1983.5
## <none>                                 243.90 -1981.3
## + pct_black                1    0.0892 243.81 -1979.7
## + poverty_rate             1    0.0330 243.86 -1979.4
## + well_depth               1    0.0001 243.90 -1979.3
## - median_income            1    1.7237 245.62 -1974.6
## - unemployment_rate        1    4.3902 248.29 -1961.2
## - water_use                5    6.2776 250.17 -1959.9
## - tritium_tu               1    4.8903 248.79 -1958.8
## - pct_adults               1    6.4641 250.36 -1951.0
## - pct_white                1    9.1652 253.06 -1937.7
## 
## Step:  AIC=-1997.29
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu + 
##     water_use + median_income + pop_density_per_sq_mile
## 
##                           Df Sum of Sq    RSS     AIC
## + pct_hispanic             1    2.5639 237.80 -2008.5
## + cancer_rate              1    1.1646 239.20 -2001.3
## + pct_black                1    0.3914 239.97 -1997.3
## <none>                                 240.36 -1997.3
## + pct_bachelor_plus        1    0.2840 240.08 -1996.8
## + poverty_rate             1    0.1358 240.23 -1996.0
## + well_depth               1    0.1345 240.23 -1996.0
## + pct_children             1    0.0002 240.36 -1995.3
## + pct_older                1    0.0001 240.36 -1995.3
## - unemployment_rate        1    1.9103 242.27 -1989.5
## - pop_density_per_sq_mile  1    3.5326 243.90 -1981.3
## - median_income            1    3.7572 244.12 -1980.1
## - pct_white                1    3.8675 244.23 -1979.6
## - pct_adults               1    4.8767 245.24 -1974.5
## - water_use                5    6.7945 247.16 -1972.9
## - tritium_tu               1    5.8518 246.22 -1969.6
## 
## Step:  AIC=-2008.54
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu + 
##     water_use + median_income + pop_density_per_sq_mile + pct_hispanic
## 
##                           Df Sum of Sq    RSS     AIC
## + pct_black                1    2.8489 234.95 -2021.4
## + poverty_rate             1    2.1023 235.70 -2017.5
## + cancer_rate              1    0.9552 236.84 -2011.5
## <none>                                 237.80 -2008.5
## + pct_bachelor_plus        1    0.1382 237.66 -2007.2
## + well_depth               1    0.0811 237.72 -2007.0
## + pct_children             1    0.0002 237.80 -2006.5
## + pct_older                1    0.0001 237.80 -2006.5
## - unemployment_rate        1    0.8735 238.67 -2006.0
## - pct_hispanic             1    2.5639 240.36 -1997.3
## - pct_white                1    4.0455 241.84 -1989.7
## - water_use                5    5.9454 243.75 -1988.0
## - pop_density_per_sq_mile  1    4.3907 242.19 -1987.9
## - pct_adults               1    4.5531 242.35 -1987.1
## - median_income            1    6.0274 243.83 -1979.6
## - tritium_tu               1    6.5592 244.36 -1976.9
## 
## Step:  AIC=-2021.42
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu + 
##     water_use + median_income + pop_density_per_sq_mile + pct_hispanic + 
##     pct_black
## 
##                           Df Sum of Sq    RSS     AIC
## + poverty_rate             1    2.2031 232.75 -2031.0
## - median_income            1    0.0176 234.97 -2023.3
## + cancer_rate              1    0.4757 234.47 -2021.9
## - pop_density_per_sq_mile  1    0.3349 235.29 -2021.7
## <none>                                 234.95 -2021.4
## + pct_bachelor_plus        1    0.1364 234.81 -2020.1
## + well_depth               1    0.1243 234.83 -2020.1
## + pct_children             1    0.0168 234.93 -2019.5
## + pct_older                1    0.0158 234.94 -2019.5
## - unemployment_rate        1    2.6586 237.61 -2009.5
## - pct_black                1    2.8489 237.80 -2008.5
## - water_use                5    5.1680 240.12 -2004.5
## - pct_hispanic             1    5.0214 239.97 -1997.3
## - pct_white                1    5.8468 240.80 -1993.1
## - pct_adults               1    6.6193 241.57 -1989.1
## - tritium_tu               1    7.5882 242.54 -1984.2
## 
## Step:  AIC=-2031.05
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu + 
##     water_use + median_income + pop_density_per_sq_mile + pct_hispanic + 
##     pct_black + poverty_rate
## 
##                           Df Sum of Sq    RSS     AIC
## <none>                                 232.75 -2031.0
## + cancer_rate              1    0.2280 232.52 -2030.3
## - unemployment_rate        1    0.5662 233.31 -2030.0
## + pct_bachelor_plus        1    0.1418 232.61 -2029.8
## - pop_density_per_sq_mile  1    0.6296 233.38 -2029.7
## + well_depth               1    0.0311 232.72 -2029.2
## + pct_older                1    0.0160 232.73 -2029.1
## + pct_children             1    0.0154 232.73 -2029.1
## - median_income            1    0.8614 233.61 -2028.5
## - poverty_rate             1    2.2031 234.95 -2021.4
## - pct_black                1    2.9496 235.70 -2017.5
## - water_use                5    5.6428 238.39 -2011.5
## - pct_adults               1    5.2936 238.04 -2005.3
## - pct_hispanic             1    6.6701 239.42 -1998.2
## - pct_white                1    7.6787 240.43 -1993.0
## - tritium_tu               1    8.0059 240.75 -1991.3
# Step 4: View the final selected model
summary(step_model)
## 
## Call:
## lm(formula = log_pfoa ~ pct_white + unemployment_rate + pct_adults + 
##     tritium_tu + water_use + median_income + pop_density_per_sq_mile + 
##     pct_hispanic + pct_black + poverty_rate, data = df_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1277 -0.2191 -0.0722  0.1459  5.2367 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -1.085e+01  1.684e+00  -6.442 1.70e-10 ***
## pct_white                     4.426e-02  6.976e-03   6.344 3.14e-10 ***
## unemployment_rate             7.965e-02  4.623e-02   1.723  0.08517 .  
## pct_adults                    1.004e-01  1.905e-02   5.268 1.63e-07 ***
## tritium_tu                    2.880e-02  4.447e-03   6.478 1.34e-10 ***
## water_useIrrigation           9.020e-02  8.866e-02   1.017  0.30914    
## water_useObservation         -1.132e-01  4.328e-02  -2.616  0.00901 ** 
## water_useProduction           7.993e-02  3.111e-02   2.569  0.01030 *  
## water_useUnknown             -5.609e-02  9.292e-02  -0.604  0.54619    
## water_useWater supply, other  4.219e-02  1.817e-01   0.232  0.81643    
## median_income                 1.128e-05  5.309e-06   2.125  0.03380 *  
## pop_density_per_sq_mile       2.049e-04  1.128e-04   1.817  0.06951 .  
## pct_hispanic                  2.015e-02  3.408e-03   5.913 4.36e-09 ***
## pct_black                     2.309e-02  5.873e-03   3.932 8.90e-05 ***
## poverty_rate                  7.759e-02  2.283e-02   3.398  0.00070 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4368 on 1220 degrees of freedom
## Multiple R-squared:  0.2009, Adjusted R-squared:  0.1917 
## F-statistic: 21.91 on 14 and 1220 DF,  p-value: < 2.2e-16
library(car)
vif(step_model)
##                              GVIF Df GVIF^(1/(2*Df))
## pct_white               46.032167  1        6.784701
## unemployment_rate        8.854461  1        2.975645
## pct_adults               3.308788  1        1.819007
## tritium_tu               1.329555  1        1.153063
## water_use                1.767752  5        1.058625
## median_income           32.056732  1        5.661866
## pop_density_per_sq_mile  2.547550  1        1.596105
## pct_hispanic            18.252410  1        4.272284
## pct_black               17.484098  1        4.181399
## poverty_rate            17.068816  1        4.131442
model = lm(log_pfoa ~ pct_adults + 
    tritium_tu + water_use + median_income + pop_density_per_sq_mile + 
    pct_black, data = df_filtered)


summary(model)
## 
## Call:
## lm(formula = log_pfoa ~ pct_adults + tritium_tu + water_use + 
##     median_income + pop_density_per_sq_mile + pct_black, data = df_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1687 -0.2436 -0.0625  0.1433  5.4976 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -2.456e+00  9.922e-01  -2.475  0.01346 *  
## pct_adults                    8.177e-02  1.687e-02   4.848 1.41e-06 ***
## tritium_tu                    2.692e-02  4.260e-03   6.319 3.69e-10 ***
## water_useIrrigation           8.407e-02  9.004e-02   0.934  0.35060    
## water_useObservation         -1.148e-01  4.225e-02  -2.716  0.00670 ** 
## water_useProduction           9.130e-02  3.067e-02   2.977  0.00297 ** 
## water_useUnknown              5.840e-03  9.363e-02   0.062  0.95027    
## water_useWater supply, other  2.048e-02  1.844e-01   0.111  0.91159    
## median_income                -1.897e-05  1.675e-06 -11.322  < 2e-16 ***
## pop_density_per_sq_mile       5.663e-04  7.961e-05   7.114 1.91e-12 ***
## pct_black                    -8.156e-03  1.967e-03  -4.146 3.61e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4438 on 1224 degrees of freedom
## Multiple R-squared:  0.1722, Adjusted R-squared:  0.1655 
## F-statistic: 25.47 on 10 and 1224 DF,  p-value: < 2.2e-16
vif(model)
##                             GVIF Df GVIF^(1/(2*Df))
## pct_adults              2.511911  1        1.584901
## tritium_tu              1.181861  1        1.087134
## water_use               1.554752  5        1.045120
## median_income           3.091922  1        1.758386
## pop_density_per_sq_mile 1.229472  1        1.108816
## pct_black               1.899671  1        1.378286
m=lm(formula = log_pfoa ~  pct_adults + 
    tritium_tu + water_use + median_income + pop_density_per_sq_mile + 
    pct_black, data = df_filtered)
 summary(m)
## 
## Call:
## lm(formula = log_pfoa ~ pct_adults + tritium_tu + water_use + 
##     median_income + pop_density_per_sq_mile + pct_black, data = df_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1687 -0.2436 -0.0625  0.1433  5.4976 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -2.456e+00  9.922e-01  -2.475  0.01346 *  
## pct_adults                    8.177e-02  1.687e-02   4.848 1.41e-06 ***
## tritium_tu                    2.692e-02  4.260e-03   6.319 3.69e-10 ***
## water_useIrrigation           8.407e-02  9.004e-02   0.934  0.35060    
## water_useObservation         -1.148e-01  4.225e-02  -2.716  0.00670 ** 
## water_useProduction           9.130e-02  3.067e-02   2.977  0.00297 ** 
## water_useUnknown              5.840e-03  9.363e-02   0.062  0.95027    
## water_useWater supply, other  2.048e-02  1.844e-01   0.111  0.91159    
## median_income                -1.897e-05  1.675e-06 -11.322  < 2e-16 ***
## pop_density_per_sq_mile       5.663e-04  7.961e-05   7.114 1.91e-12 ***
## pct_black                    -8.156e-03  1.967e-03  -4.146 3.61e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4438 on 1224 degrees of freedom
## Multiple R-squared:  0.1722, Adjusted R-squared:  0.1655 
## F-statistic: 25.47 on 10 and 1224 DF,  p-value: < 2.2e-16
 vif(m)
##                             GVIF Df GVIF^(1/(2*Df))
## pct_adults              2.511911  1        1.584901
## tritium_tu              1.181861  1        1.087134
## water_use               1.554752  5        1.045120
## median_income           3.091922  1        1.758386
## pop_density_per_sq_mile 1.229472  1        1.108816
## pct_black               1.899671  1        1.378286
# Load libraries
library(dplyr)
library(broom)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(car)

# Fit the model
m <- lm(log_pfoa ~ pct_adults + tritium_tu + water_use + 
          median_income + pop_density_per_sq_mile + pct_black, 
        data = df_filtered)

# Create tidy regression table with significance
reg_table <- tidy(m) %>%
  mutate(Significance = case_when(
    p.value < 0.001 ~ "***",
    p.value < 0.01  ~ "**",
    p.value < 0.05  ~ "*",
    p.value < 0.1   ~ ".",
    TRUE            ~ ""
  ),
  is_significant = p.value < 0.01)  # for coloring

# Create and display the styled regression table
reg_table %>%
  select(term, estimate, std.error, statistic, p.value, Significance) %>%
  kable(
    digits = 4,
    caption = "Regression Results",
    col.names = c("Predictor", "Estimate", "Std. Error", "t value", "p-value", "Sig")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(which(reg_table$is_significant), bold = TRUE, color = "black", background = "#dff0d8")
Regression Results
Predictor Estimate Std. Error t value p-value Sig
(Intercept) -2.4555 0.9922 -2.4749 0.0135
pct_adults 0.0818 0.0169 4.8478 0.0000 ***
tritium_tu 0.0269 0.0043 6.3186 0.0000 ***
water_useIrrigation 0.0841 0.0900 0.9338 0.3506
water_useObservation -0.1148 0.0423 -2.7159 0.0067 **
water_useProduction 0.0913 0.0307 2.9773 0.0030 **
water_useUnknown 0.0058 0.0936 0.0624 0.9503
water_useWater supply, other 0.0205 0.1844 0.1111 0.9116
median_income 0.0000 0.0000 -11.3216 0.0000 ***
pop_density_per_sq_mile 0.0006 0.0001 7.1143 0.0000 ***
pct_black -0.0082 0.0020 -4.1464 0.0000 ***
# Load libraries
library(dplyr)
library(broom)
library(kableExtra)
library(car)

# Fit the model
m <- lm(log_pfoa ~ pct_adults + tritium_tu + water_use + 
          median_income + pop_density_per_sq_mile + pct_black, 
        data = df_filtered)

# Create tidy regression table with significance
reg_table <- tidy(m) %>%
  mutate(Significance = case_when(
    p.value < 0.001 ~ "***",
    p.value < 0.01  ~ "**",
    p.value < 0.05  ~ "*",
    p.value < 0.1   ~ ".",
    TRUE            ~ ""
  ),
  is_significant = p.value < 0.01)  # for coloring

# Filter for only the specific variables of interest
key_variables <- c("pct_adults", "tritium_tu", "water_useProduction", 
                   "median_income", "pop_density_per_sq_mile")

# Create filtered table first
filtered_table <- reg_table %>%
  filter(term %in% key_variables)

# Create and display the styled regression table
filtered_table %>%
  select(term, estimate, std.error, statistic, p.value, Significance) %>%
  kable(
    digits = 4,
    caption = "Key PFOA Contamination Predictors - Regression Results",
    col.names = c("Predictor", "Estimate", "Std. Error", "t value", "p-value", "Sig")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(which(filtered_table$is_significant), 
           bold = TRUE, color = "black", background = "#dff0d8")
Key PFOA Contamination Predictors - Regression Results
Predictor Estimate Std. Error t value p-value Sig
pct_adults 0.0818 0.0169 4.8478 0.000 ***
tritium_tu 0.0269 0.0043 6.3186 0.000 ***
water_useProduction 0.0913 0.0307 2.9773 0.003 **
median_income 0.0000 0.0000 -11.3216 0.000 ***
pop_density_per_sq_mile 0.0006 0.0001 7.1143 0.000 ***

FINDING THE BEST MODEL USING AUTOML

#AutoML for log_pfoa (Regression Task)

# Load the h2o library
library(h2o)
## Warning: package 'h2o' was built under R version 4.4.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start H2O
h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\doris\AppData\Local\Temp\RtmpQNrnuW\file5df85c16132/h2o_doris_started_from_r.out
##     C:\Users\doris\AppData\Local\Temp\RtmpQNrnuW\file5df82ab1a74/h2o_doris_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 172 milliseconds 
##     H2O cluster timezone:       America/Chicago 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 9 months and 21 days 
##     H2O cluster name:           H2O_started_from_R_doris_oru630 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.88 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 9 months and 21 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert your dataframe to an H2OFrame
df_h2o <- as.h2o(df_filtered)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Define response and predictors
variables_to_remove <- c("latitude", "longitude", "tritium_category", "pfoa_val", "pfoa_detected", "log_pfoa", "month")
x_reg <- setdiff(names(df_filtered), variables_to_remove) 
y_reg <- "log_pfoa"

# Print predictor information
cat("Predictors used:", paste(x_reg, collapse = ", "), "\n")
## Predictors used: state, water_use, tritium_tu, well_depth, median_income, poverty_rate, unemployment_rate, pct_white, pct_black, pct_hispanic, pct_bachelor_plus, pct_children, pct_adults, pct_older, pop_density_per_sq_mile, cancer_rate
cat("Number of predictors:", length(x_reg), "\n")
## Number of predictors: 16
# Split the data
splits_reg <- h2o.splitFrame(data = df_h2o, ratios = 0.7, seed = 123)
train_reg <- splits_reg[[1]]
test_reg  <- splits_reg[[2]]


# Run AutoML
aml_reg <- h2o.automl(
  x = x_reg,
  y = y_reg,
  training_frame = train_reg,
  leaderboard_frame = test_reg,
  max_models = 20,
  seed = 1234
)
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   3%
## 18:23:38.394: AutoML: XGBoost is not available; skipping it.  |                                                                              |==========                                                            |  15%  |                                                                              |===================                                                   |  26%  |                                                                              |===============================                                       |  44%  |                                                                              |=====================================                                 |  53%  |                                                                              |===========================================                           |  62%  |                                                                              |======================================================                |  76%  |                                                                              |======================================================================| 100%
# Show full leaderboard
lb_reg <- h2o.get_leaderboard(aml_reg, extra_columns = "ALL")
print(lb_reg, n = nrow(lb_reg))
##                                                   model_id      rmse       mse
## 1              GBM_grid_1_AutoML_1_20251011_182338_model_2 0.4091900 0.1674365
## 2              GBM_grid_1_AutoML_1_20251011_182338_model_1 0.4118630 0.1696311
## 3                           GBM_1_AutoML_1_20251011_182338 0.4132166 0.1707479
## 4              GBM_grid_1_AutoML_1_20251011_182338_model_3 0.4133137 0.1708282
## 5                           GBM_2_AutoML_1_20251011_182338 0.4134107 0.1709084
## 6     DeepLearning_grid_3_AutoML_1_20251011_182338_model_1 0.4142181 0.1715767
## 7  StackedEnsemble_BestOfFamily_1_AutoML_1_20251011_182338 0.4177212 0.1744910
## 8                           GBM_4_AutoML_1_20251011_182338 0.4185395 0.1751753
## 9                           GBM_3_AutoML_1_20251011_182338 0.4186187 0.1752416
## 10             GBM_grid_1_AutoML_1_20251011_182338_model_4 0.4211605 0.1773762
## 11             GBM_grid_1_AutoML_1_20251011_182338_model_5 0.4222271 0.1782757
## 12    DeepLearning_grid_1_AutoML_1_20251011_182338_model_1 0.4245086 0.1802075
## 13                          GLM_1_AutoML_1_20251011_182338 0.4257319 0.1812476
## 14    DeepLearning_grid_2_AutoML_1_20251011_182338_model_1 0.4278609 0.1830650
## 15                          DRF_1_AutoML_1_20251011_182338 0.4337441 0.1881339
## 16                 DeepLearning_1_AutoML_1_20251011_182338 0.4343779 0.1886842
## 17                          XRT_1_AutoML_1_20251011_182338 0.4409525 0.1944391
## 18    DeepLearning_grid_1_AutoML_1_20251011_182338_model_2 0.4459299 0.1988535
## 19    StackedEnsemble_AllModels_1_AutoML_1_20251011_182338 0.4509164 0.2033256
## 20    DeepLearning_grid_3_AutoML_1_20251011_182338_model_2 0.4685734 0.2195610
## 21                          GBM_5_AutoML_1_20251011_182338 0.4752991 0.2259092
## 22    DeepLearning_grid_2_AutoML_1_20251011_182338_model_2 0.5343135 0.2854909
##          mae     rmsle mean_residual_deviance training_time_ms
## 1  0.2412755 0.1458620              0.1674365               80
## 2  0.2373114 0.1467636              0.1696311              133
## 3  0.2452379 0.1476060              0.1707479              131
## 4  0.2478323 0.1477924              0.1708282               57
## 5  0.2411996 0.1468352              0.1709084              221
## 6  0.2447922 0.1477984              0.1715767            12219
## 7  0.2411486 0.1466883              0.1744910             1161
## 8  0.2437177 0.1490390              0.1751753               92
## 9  0.2459783 0.1491281              0.1752416              404
## 10 0.2391034 0.1487703              0.1773762               83
## 11 0.2480273 0.1491586              0.1782757               52
## 12 0.2354055 0.1486766              0.1802075            11498
## 13 0.2566747 0.1515693              0.1812476               18
## 14 0.2768064 0.1559760              0.1830650            13157
## 15 0.2373061 0.1491110              0.1881339              191
## 16 0.2541884 0.1536553              0.1886842               53
## 17 0.2367766 0.1500611              0.1944391              192
## 18 0.2427991 0.1535918              0.1988535            10247
## 19 0.2516250 0.1532961              0.2033256              767
## 20 0.2592057 0.1579481              0.2195610            15294
## 21 0.2519053 0.1545553              0.2259092               55
## 22 0.3149483 0.1774324              0.2854909            15515
##    predict_time_per_row_ms            algo
## 1                 0.018810             GBM
## 2                 0.022039             GBM
## 3                 0.017213             GBM
## 4                 0.013082             GBM
## 5                 0.016750             GBM
## 6                 0.051639    DeepLearning
## 7                 0.094652 StackedEnsemble
## 8                 0.022533             GBM
## 9                 0.018551             GBM
## 10                0.018444             GBM
## 11                0.011734             GBM
## 12                0.013001    DeepLearning
## 13                0.004852             GLM
## 14                0.028543    DeepLearning
## 15                0.031277             DRF
## 16                0.008921    DeepLearning
## 17                0.016480             DRF
## 18                0.011057    DeepLearning
## 19                0.049441 StackedEnsemble
## 20                0.022150    DeepLearning
## 21                0.011370             GBM
## 22                0.015444    DeepLearning
## 
## [22 rows x 9 columns]
# Best model
best_model_reg <- aml_reg@leader
print(best_model_reg)
## Model Details:
## ==============
## 
## H2ORegressionModel: gbm
## Model ID:  GBM_grid_1_AutoML_1_20251011_182338_model_2 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              30                       30                7132         7
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         7    7.00000          9         18    14.30000
## 
## 
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  0.1620572
## RMSE:  0.4025633
## MAE:  0.2327796
## RMSLE:  0.1365644
## Mean Residual Deviance :  0.1620572
## 
## 
## 
## H2ORegressionMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.1913423
## RMSE:  0.4374269
## MAE:  0.2554557
## RMSLE:  0.1507954
## Mean Residual Deviance :  0.1913423
## 
## 
## Cross-Validation Metrics Summary: 
##                            mean       sd cv_1_valid cv_2_valid cv_3_valid
## mae                    0.254589 0.014197   0.251779   0.242526   0.274830
## mean_residual_deviance 0.192532 0.073301   0.193840   0.137706   0.159939
## mse                    0.192532 0.073301   0.193840   0.137706   0.159939
## r2                     0.220505 0.093034   0.186806   0.254176   0.128440
## residual_deviance      0.192532 0.073301   0.193840   0.137706   0.159939
## rmse                   0.433284 0.077437   0.440273   0.371087   0.399924
## rmsle                  0.151173 0.009947   0.154119   0.146017   0.152461
##                        cv_4_valid cv_5_valid
## mae                      0.241211   0.262599
## mean_residual_deviance   0.152778   0.318397
## mse                      0.152778   0.318397
## r2                       0.365635   0.167467
## residual_deviance        0.152778   0.318397
## rmse                     0.390868   0.564267
## rmsle                    0.138246   0.165020
# Evaluate model on test set
perf_test <- h2o.performance(best_model_reg, newdata = test_reg)
print(perf_test)
## H2ORegressionMetrics: gbm
## 
## MSE:  0.1674365
## RMSE:  0.40919
## MAE:  0.2412755
## RMSLE:  0.145862
## Mean Residual Deviance :  0.1674365
# Predict on test set
predictions_reg <- h2o.predict(best_model_reg, test_reg)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
print(head(predictions_reg))
##    predict
## 1 1.627676
## 2 1.604784
## 3 1.584260
## 4 1.576299
## 5 1.592688
## 6 1.588860
# Compare actual vs predicted
actual_vs_predicted <- h2o.cbind(test_reg[[y_reg]], predictions_reg)
colnames(actual_vs_predicted) <- c("actual_log_pfoa", "predicted_log_pfoa")
print(head(actual_vs_predicted, 10))
##    actual_log_pfoa predicted_log_pfoa
## 1        1.5475625           1.627676
## 2        1.6094379           1.604784
## 3        1.5475625           1.584260
## 4        1.6486586           1.576299
## 5        1.6094379           1.592688
## 6        1.6094379           1.588860
## 7        1.6094379           1.591966
## 8        1.0647107           1.151556
## 9        1.0647107           1.148198
## 10       0.8329091           1.252979
# Variable importance (if available)
if ("variable_importances" %in% names(best_model_reg@model)) {
  var_imp <- h2o.varimp(best_model_reg)
  print(head(var_imp, 10))
} else {
  cat("Variable importance is not available for this model type.\n")
}
## Variable Importances: 
##             variable relative_importance scaled_importance percentage
## 1              state          135.455292          1.000000   0.366447
## 2         tritium_tu           44.184357          0.326191   0.119532
## 3      median_income           42.411461          0.313103   0.114736
## 4         well_depth           26.667093          0.196870   0.072142
## 5          water_use           22.160738          0.163602   0.059951
## 6         pct_adults           19.215893          0.141862   0.051985
## 7       poverty_rate           14.598371          0.107773   0.039493
## 8          pct_older           12.807136          0.094549   0.034647
## 9  pct_bachelor_plus           11.924089          0.088030   0.032258
## 10      pct_children           10.011117          0.073907   0.027083

#AutoML for pfoa_detected (Binary Classification)

# Load the h2o library
library(h2o)

# Start H2O 
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         9 minutes 16 seconds 
##     H2O cluster timezone:       America/Chicago 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 9 months and 21 days 
##     H2O cluster name:           H2O_started_from_R_doris_oru630 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.82 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 9 months and 21 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert your dataframe to an H2OFrame
df_h2o <- as.h2o(df_filtered)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Convert to factor (in the H2OFrame)
df_h2o[, "pfoa_detected"] <- as.factor(df_h2o[, "pfoa_detected"])

# Define response and predictors - REMOVING specified variables
variables_to_remove <- c("latitude", "longitude", "tritium_category", "log_pfoa", "pfoa_val", "pfoa_detected", "month")
x_clf <- setdiff(names(df_filtered), variables_to_remove)
y_clf <- "pfoa_detected"

print(paste("Predictors used:", paste(x_clf, collapse = ", ")))
## [1] "Predictors used: state, water_use, tritium_tu, well_depth, median_income, poverty_rate, unemployment_rate, pct_white, pct_black, pct_hispanic, pct_bachelor_plus, pct_children, pct_adults, pct_older, pop_density_per_sq_mile, cancer_rate"
print(paste("Number of predictors:", length(x_clf)))
## [1] "Number of predictors: 16"
# Split the data
splits_clf <- h2o.splitFrame(data = df_h2o, ratios = 0.7, seed = 123)
train_clf <- splits_clf[[1]]
test_clf  <- splits_clf[[2]]

# Check class distribution before balancing
print(h2o.table(train_clf[, y_clf]))
##   pfoa_detected Count
## 1             0    88
## 2             1   777
## 
## [2 rows x 2 columns]
# Run AutoML (Classification) with class balancing
aml_clf <- h2o.automl(x = x_clf,
                      y = y_clf,
                      training_frame = train_clf,
                      leaderboard_frame = test_clf,
                      max_models = 20,
                      seed = 1234,
                      balance_classes = TRUE,  # Enable class balancing
                      max_after_balance_size = 3.0)  
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   3%
## 18:32:55.633: AutoML: XGBoost is not available; skipping it.  |                                                                              |============                                                          |  18%  |                                                                              |===================                                                   |  26%  |                                                                              |===============================                                       |  44%  |                                                                              |=====================================                                 |  53%  |                                                                              |===========================================                           |  62%  |                                                                              |======================================================                |  76%  |                                                                              |======================================================================| 100%
# Show leaderboard with all columns
lb_clf <- h2o.get_leaderboard(aml_clf, extra_columns = "ALL")
print(lb_clf, n = nrow(lb_clf))
##                                                   model_id       auc   logloss
## 1  StackedEnsemble_BestOfFamily_1_AutoML_2_20251011_183255 0.7458782 0.3036463
## 2                           GBM_2_AutoML_2_20251011_183255 0.7296290 0.3126845
## 3              GBM_grid_1_AutoML_2_20251011_183255_model_3 0.7140536 0.3106991
## 4                  DeepLearning_1_AutoML_2_20251011_183255 0.7120323 0.3206283
## 5              GBM_grid_1_AutoML_2_20251011_183255_model_5 0.7093373 0.3178046
## 6                           GBM_3_AutoML_2_20251011_183255 0.7069990 0.3181845
## 7                           GBM_5_AutoML_2_20251011_183255 0.7037096 0.3180435
## 8                           GBM_1_AutoML_2_20251011_183255 0.6997464 0.3131314
## 9              GBM_grid_1_AutoML_2_20251011_183255_model_4 0.6955850 0.3360300
## 10                          GBM_4_AutoML_2_20251011_183255 0.6933656 0.3275421
## 11             GBM_grid_1_AutoML_2_20251011_183255_model_1 0.6932070 0.3301203
## 12             GBM_grid_1_AutoML_2_20251011_183255_model_2 0.6927315 0.3209874
## 13                          DRF_1_AutoML_2_20251011_183255 0.6905913 0.4240308
## 14                          GLM_1_AutoML_2_20251011_183255 0.6873811 0.3186461
## 15                          XRT_1_AutoML_2_20251011_183255 0.6807229 0.3775919
## 16    DeepLearning_grid_2_AutoML_2_20251011_183255_model_1 0.6777108 0.4533509
## 17    DeepLearning_grid_3_AutoML_2_20251011_183255_model_1 0.6722416 0.3948481
## 18    DeepLearning_grid_1_AutoML_2_20251011_183255_model_1 0.6713697 0.5170865
## 19    StackedEnsemble_AllModels_1_AutoML_2_20251011_183255 0.6671687 0.3721049
## 20    DeepLearning_grid_2_AutoML_2_20251011_183255_model_2 0.6630866 0.8983242
## 21    DeepLearning_grid_3_AutoML_2_20251011_183255_model_2 0.6612238 0.6877595
## 22    DeepLearning_grid_1_AutoML_2_20251011_183255_model_2 0.6484623 0.8641396
##        aucpr mean_per_class_error      rmse        mse training_time_ms
## 1  0.9565417            0.4868421 0.2945397 0.08675365             1468
## 2  0.9522173            0.4270767 0.2952365 0.08716458              126
## 3  0.9519246            0.5000000 0.2972149 0.08833672               79
## 4  0.9501891            0.5000000 0.3054933 0.09332617              178
## 5  0.9497658            0.5000000 0.3000993 0.09005960              149
## 6  0.9453798            0.4387286 0.2948400 0.08693061              130
## 7  0.9506859            0.4270767 0.2954323 0.08728023              126
## 8  0.9481461            0.5000000 0.2977178 0.08863589              132
## 9  0.9448205            0.5000000 0.2988058 0.08928489              206
## 10 0.9414377            0.4037730 0.2968646 0.08812859              140
## 11 0.9464142            0.4868421 0.2959362 0.08757824              145
## 12 0.9445046            0.5000000 0.2968006 0.08809061              128
## 13 0.9446372            0.5000000 0.2993358 0.08960192              344
## 14 0.9486930            0.5000000 0.3008084 0.09048568               75
## 15 0.9419124            0.4635384 0.3012470 0.09074976              414
## 16 0.9455646            0.5000000 0.3178608 0.10103551            12241
## 17 0.9443273            0.5000000 0.3129565 0.09794174            14406
## 18 0.9442389            0.5000000 0.3373033 0.11377354            10338
## 19 0.9395425            0.5000000 0.3085790 0.09522099             1075
## 20 0.9421426            0.5000000 0.3756124 0.14108465             6890
## 21 0.9384233            0.5000000 0.3595837 0.12930044            18847
## 22 0.9393107            0.5000000 0.3737734 0.13970656             8669
##    predict_time_per_row_ms            algo
## 1                 0.056464 StackedEnsemble
## 2                 0.021168             GBM
## 3                 0.017100             GBM
## 4                 0.017208    DeepLearning
## 5                 0.022432             GBM
## 6                 0.023116             GBM
## 7                 0.021131             GBM
## 8                 0.015421             GBM
## 9                 0.022992             GBM
## 10                0.022384             GBM
## 11                0.020750             GBM
## 12                0.018048             GBM
## 13                0.026251             DRF
## 14                0.010720             GLM
## 15                0.027443             DRF
## 16                0.034029    DeepLearning
## 17                0.042069    DeepLearning
## 18                0.022774    DeepLearning
## 19                0.054624 StackedEnsemble
## 20                0.026879    DeepLearning
## 21                0.036121    DeepLearning
## 22                0.028200    DeepLearning
## 
## [22 rows x 10 columns]
# Best classification model
best_model_clf <- aml_clf@leader
print(best_model_clf)
## Model Details:
## ==============
## 
## H2OBinomialModel: stackedensemble
## Model ID:  StackedEnsemble_BestOfFamily_1_AutoML_2_20251011_183255 
## Model Summary for Stacked Ensemble: 
##                                          key            value
## 1                          Stacking strategy cross_validation
## 2       Number of base models (used / total)              4/5
## 3           # GBM base models (used / total)              1/1
## 4  # DeepLearning base models (used / total)              1/1
## 5           # DRF base models (used / total)              1/2
## 6           # GLM base models (used / total)              1/1
## 7                      Metalearner algorithm              GLM
## 8         Metalearner fold assignment scheme           Random
## 9                         Metalearner nfolds                5
## 10                   Metalearner fold_column               NA
## 11        Custom metalearner hyperparameters             None
## 
## 
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  0.0719238
## RMSE:  0.2681861
## LogLoss:  0.2499077
## Mean Per-Class Error:  0.3510954
## AUC:  0.913632
## AUCPR:  0.989563
## Gini:  0.827264
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         0   1    Error     Rate
## 0      27  61 0.693182   =61/88
## 1       7 770 0.009009   =7/777
## Totals 34 831 0.078613  =68/865
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.696123   0.957711 370
## 2                       max f2  0.669396   0.979251 382
## 3                 max f0point5  0.820134   0.948207 301
## 4                 max accuracy  0.696123   0.921387 370
## 5                max precision  0.983946   1.000000   0
## 6                   max recall  0.573735   1.000000 395
## 7              max specificity  0.983946   1.000000   0
## 8             max absolute_mcc  0.790070   0.481104 321
## 9   max min_per_class_accuracy  0.883536   0.829545 243
## 10 max mean_per_class_accuracy  0.898598   0.858978 214
## 11                     max tns  0.983946  88.000000   0
## 12                     max fns  0.983946 775.000000   0
## 13                     max fps  0.551082  88.000000 399
## 14                     max tps  0.573735 777.000000 395
## 15                     max tnr  0.983946   1.000000   0
## 16                     max fnr  0.983946   0.997426   0
## 17                     max fpr  0.551082   1.000000 399
## 18                     max tpr  0.573735   1.000000 395
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.08593275
## RMSE:  0.2931429
## LogLoss:  0.3140038
## Mean Per-Class Error:  0.5
## AUC:  0.6706666
## AUCPR:  0.9259375
## Gini:  0.3413332
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        0   1    Error     Rate
## 0      0  88 1.000000   =88/88
## 1      0 777 0.000000   =0/777
## Totals 0 865 0.101734  =88/865
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.295440   0.946407 399
## 2                       max f2  0.295440   0.977850 399
## 3                 max f0point5  0.820032   0.927472 311
## 4                 max accuracy  0.295440   0.898266 399
## 5                max precision  0.953846   0.942308  58
## 6                   max recall  0.295440   1.000000 399
## 7              max specificity  0.999134   0.988636   0
## 8             max absolute_mcc  0.820032   0.294436 311
## 9   max min_per_class_accuracy  0.908920   0.613636 185
## 10 max mean_per_class_accuracy  0.880175   0.676955 253
## 11                     max tns  0.999134  87.000000   0
## 12                     max fns  0.999134 777.000000   0
## 13                     max fps  0.631417  88.000000 393
## 14                     max tps  0.295440 777.000000 399
## 15                     max tnr  0.999134   0.988636   0
## 16                     max fnr  0.999134   1.000000   0
## 17                     max fpr  0.631417   1.000000 393
## 18                     max tpr  0.295440   1.000000 399
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                mean       sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy   0.904066 0.024688   0.898876   0.947674   0.891429   0.888235
## auc        0.699835 0.110445   0.760443   0.852761   0.670041   0.652492
## err        0.095934 0.024688   0.101124   0.052326   0.108571   0.111765
## err_count 16.600000 4.277850  18.000000   9.000000  19.000000  19.000000
## f0point5   0.924564 0.020414   0.930657   0.957697   0.911215   0.908544
##           cv_5_valid
## accuracy    0.894118
## auc         0.563439
## err         0.105882
## err_count  18.000000
## f0point5    0.914709
## 
## ---
##                         mean        sd cv_1_valid cv_2_valid cv_3_valid
## precision           0.909199  0.025205   0.921687   0.947674   0.891429
## r2                  0.051139  0.040307   0.103258   0.069426   0.043097
## recall              0.992329  0.013713   0.968354   1.000000   1.000000
## residual_deviance 105.856520 23.547125 110.006320  65.839100 112.519400
## rmse                0.291197  0.044320   0.299059   0.214814   0.304323
## specificity         0.108095  0.158397   0.350000   0.000000   0.000000
##                   cv_4_valid cv_5_valid
## precision           0.888235   0.896970
## r2                  0.046777  -0.006863
## recall              1.000000   0.993289
## residual_deviance 112.446350 128.471420
## rmse                0.307619   0.330171
## specificity         0.000000   0.190476
# Get model performance on test set
perf_test <- h2o.performance(best_model_clf, test_clf)
print(perf_test)
## H2OBinomialMetrics: stackedensemble
## 
## MSE:  0.08675365
## RMSE:  0.2945397
## LogLoss:  0.3036463
## Mean Per-Class Error:  0.4868421
## AUC:  0.7458782
## AUCPR:  0.9565417
## Gini:  0.4917565
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        0   1    Error     Rate
## 0      1  37 0.973684   =37/38
## 1      0 332 0.000000   =0/332
## Totals 1 369 0.100000  =37/370
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.612512   0.947218 365
## 2                       max f2  0.612512   0.978197 365
## 3                 max f0point5  0.698370   0.921946 355
## 4                 max accuracy  0.612512   0.900000 365
## 5                max precision  0.980705   1.000000   0
## 6                   max recall  0.612512   1.000000 365
## 7              max specificity  0.980705   1.000000   0
## 8             max absolute_mcc  0.905335   0.287909 245
## 9   max min_per_class_accuracy  0.905335   0.710843 245
## 10 max mean_per_class_accuracy  0.905335   0.723843 245
## 11                     max tns  0.980705  38.000000   0
## 12                     max fns  0.980705 331.000000   0
## 13                     max fps  0.600712  38.000000 366
## 14                     max tps  0.612512 332.000000 365
## 15                     max tnr  0.980705   1.000000   0
## 16                     max fnr  0.980705   0.996988   0
## 17                     max fpr  0.600712   1.000000 366
## 18                     max tpr  0.612512   1.000000 365
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Generate confusion matrix
cm <- h2o.confusionMatrix(perf_test)
print(cm)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.61251175891771:
##        0   1    Error     Rate
## 0      1  37 0.973684   =37/38
## 1      0 332 0.000000   =0/332
## Totals 1 369 0.100000  =37/370
# Make predictions on test set
predictions_clf <- h2o.predict(best_model_clf, test_clf)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
print(head(predictions_clf))
##   predict         p0        p1
## 1       1 0.05208713 0.9479129
## 2       1 0.05152928 0.9484707
## 3       1 0.05041690 0.9495831
## 4       1 0.04322418 0.9567758
## 5       1 0.05718886 0.9428111
## 6       1 0.08632103 0.9136790
# Create a proper confusion matrix using R's table function
# Convert actual and predicted values to vectors
actual <- as.vector(test_clf[, "pfoa_detected"])
predicted <- as.vector(h2o.predict(best_model_clf, test_clf)$predict)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Convert to factors with the same levels
actual <- factor(as.character(actual))
predicted <- factor(as.character(predicted), levels = levels(actual))

# Create confusion matrix
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
conf_matrix <- table(Predicted = predicted, Actual = actual)
print(conf_matrix)
##          Actual
## Predicted   0   1
##         0   0   0
##         1  38 332
TP <- as.numeric(conf_matrix["1", "1"])
TN <- as.numeric(conf_matrix["0", "0"])
FP <- as.numeric(conf_matrix["1", "0"])
FN <- as.numeric(conf_matrix["0", "1"])


# Calculate all metrics
total <- sum(conf_matrix)
accuracy <- (TP + TN) / total
sensitivity <- TP / (TP + FN)   # Recall or True Positive Rate
specificity <- TN / (TN + FP)   # True Negative Rate
precision <- TP / (TP + FP)     # Positive Predictive Value
npv <- TN / (TN + FN)          # Negative Predictive Value
f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)
error_rate <- 1 - accuracy
false_positive_rate <- FP / (FP + TN)
false_negative_rate <- FN / (TP + FN)

# Get AUC from H2O
perf_test <- h2o.performance(best_model_clf, test_clf)
auc_value <- h2o.auc(perf_test)

# Print detailed metrics
cat("\n=== DETAILED CLASSIFICATION METRICS ===\n")
## 
## === DETAILED CLASSIFICATION METRICS ===
cat("True Positives (TP) :", TP, "\n")
## True Positives (TP) : 332
cat("True Negatives (TN) :", TN, "\n")
## True Negatives (TN) : 0
cat("False Positives (FP):", FP, "\n")
## False Positives (FP): 38
cat("False Negatives (FN):", FN, "\n")
## False Negatives (FN): 0
cat("Total Samples       :", total, "\n")
## Total Samples       : 370
cat("\n=== PERFORMANCE METRICS ===\n")
## 
## === PERFORMANCE METRICS ===
cat("Accuracy            :", round(accuracy, 4), "\n")
## Accuracy            : 0.8973
cat("Sensitivity (Recall):", round(sensitivity, 4), "\n")
## Sensitivity (Recall): 1
cat("Specificity         :", round(specificity, 4), "\n")
## Specificity         : 0
cat("Precision (PPV)     :", round(precision, 4), "\n")
## Precision (PPV)     : 0.8973
cat("Negative Pred Value :", round(npv, 4), "\n")
## Negative Pred Value : NaN
cat("F1 Score            :", round(f1_score, 4), "\n")
## F1 Score            : 0.9459
cat("Error Rate          :", round(error_rate, 4), "\n")
## Error Rate          : 0.1027
cat("False Positive Rate :", round(false_positive_rate, 4), "\n")
## False Positive Rate : 1
cat("False Negative Rate :", round(false_negative_rate, 4), "\n")
## False Negative Rate : 0
cat("AUC                 :", round(auc_value, 4), "\n")
## AUC                 : 0.7459
# Summary table
metrics_df <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Precision", 
             "NPV", "F1-Score", "Error Rate", "AUC"),
  Value = c(round(accuracy, 4), round(sensitivity, 4), round(specificity, 4), 
            round(precision, 4), round(npv, 4), round(f1_score, 4), 
            round(error_rate, 4), round(auc_value, 4))
)
print(metrics_df)
##        Metric  Value
## 1    Accuracy 0.8973
## 2 Sensitivity 1.0000
## 3 Specificity 0.0000
## 4   Precision 0.8973
## 5         NPV    NaN
## 6    F1-Score 0.9459
## 7  Error Rate 0.1027
## 8         AUC 0.7459