##EXPLORATORY ANALYSIS
# Load required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(corrplot)
## corrplot 0.95 loaded
library(RColorBrewer)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(broom)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(effsize)
## Warning: package 'effsize' was built under R version 4.4.3
library(agricolae)
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite
library(maps)
## Warning: package 'maps' was built under R version 4.4.3
##
## Attaching package: 'maps'
## The following object is masked from 'package:viridis':
##
## unemp
library(stringr)
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:agricolae':
##
## kurtosis, skewness
# Load the final dataset
data <- read.csv("final_data_clean.csv")
#Make a copy of the dataset
df<-data
#Check missing values
sum(is.na(df))
## [1] 0
# See the structure
str(df)
## 'data.frame': 1238 obs. of 22 variables:
## $ pfoa_val : num 4 4 4 3.7 4 3.7 4 3.7 4 3.7 ...
## $ pfoa_detected : int 1 1 1 1 1 1 1 1 1 1 ...
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ water_use : chr "Production" "Water supply, other" "Domestic" "Domestic" ...
## $ tritium_tu : num -0.04 -0.03 -0.2 1.96 1.4 2.12 -0.07 0.19 2.4 0.55 ...
## $ well_depth : num 152.4 141.7 77.7 54 64 ...
## $ tritium_category : chr "PreModern" "PreModern" "PreModern" "Modern" ...
## $ month : chr "August" "June" "June" "July" ...
## $ median_income : int 50536 50536 50536 50536 50536 50536 50536 50536 50536 50536 ...
## $ poverty_rate : num 16.7 16.7 16.7 16.7 16.7 ...
## $ unemployment_rate : num 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 ...
## $ pct_white : num 68.1 68.1 68.1 68.1 68.1 ...
## $ pct_black : num 26.6 26.6 26.6 26.6 26.6 ...
## $ pct_hispanic : num 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 ...
## $ pct_bachelor_plus : num 23.9 23.9 23.9 23.9 23.9 ...
## $ pct_children : num 22.5 22.5 22.5 22.5 22.5 ...
## $ pct_adults : num 61 61 61 61 61 ...
## $ pct_older : num 16.5 16.5 16.5 16.5 16.5 ...
## $ pop_density_per_sq_mile: num 96.3 96.3 96.3 96.3 96.3 ...
## $ longitude : num -86.8 -86.8 -86.8 -86.8 -86.8 ...
## $ latitude : num 32.8 32.8 32.8 32.8 32.8 ...
## $ cancer_rate : num 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
#Convert incorrect data classes to their appropriate classes
df$pfoa_detected <- as.factor(df$pfoa_detected)
df$state <- as.factor(df$state)
df$water_use <- as.factor(df$water_use)
df$tritium_category <- as.factor(df$tritium_category)
df$month <- as.factor(df$month)
df$median_income <- as.numeric(df$median_income)
str(df)
## 'data.frame': 1238 obs. of 22 variables:
## $ pfoa_val : num 4 4 4 3.7 4 3.7 4 3.7 4 3.7 ...
## $ pfoa_detected : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ state : Factor w/ 32 levels "Alabama","California",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ water_use : Factor w/ 6 levels "Domestic","Irrigation",..: 4 6 1 1 1 1 1 1 6 1 ...
## $ tritium_tu : num -0.04 -0.03 -0.2 1.96 1.4 2.12 -0.07 0.19 2.4 0.55 ...
## $ well_depth : num 152.4 141.7 77.7 54 64 ...
## $ tritium_category : Factor w/ 4 levels "Mixed","Modern",..: 3 3 3 2 2 2 3 1 2 1 ...
## $ month : Factor w/ 12 levels "April","August",..: 2 7 7 6 6 6 6 6 6 6 ...
## $ median_income : num 50536 50536 50536 50536 50536 ...
## $ poverty_rate : num 16.7 16.7 16.7 16.7 16.7 ...
## $ unemployment_rate : num 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 ...
## $ pct_white : num 68.1 68.1 68.1 68.1 68.1 ...
## $ pct_black : num 26.6 26.6 26.6 26.6 26.6 ...
## $ pct_hispanic : num 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 ...
## $ pct_bachelor_plus : num 23.9 23.9 23.9 23.9 23.9 ...
## $ pct_children : num 22.5 22.5 22.5 22.5 22.5 ...
## $ pct_adults : num 61 61 61 61 61 ...
## $ pct_older : num 16.5 16.5 16.5 16.5 16.5 ...
## $ pop_density_per_sq_mile: num 96.3 96.3 96.3 96.3 96.3 ...
## $ longitude : num -86.8 -86.8 -86.8 -86.8 -86.8 ...
## $ latitude : num 32.8 32.8 32.8 32.8 32.8 ...
## $ cancer_rate : num 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
# Set theme for better looking plots
theme_set(theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 11),
axis.text.x = element_text(angle = 45, hjust = 1)))
#Distribution of the Target Variable(s)
# For pfoa_val (continuous concentration)
ggplot(df, aes(x = pfoa_val)) +
geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
labs(title = "Distribution of PFOA Concentration (pfoa_val)",
x = "PFOA Concentration (pfoa_val)",
y = "Frequency")
# PFOA concentrations is largely right skewed, using log-transformation to improve
df$log_pfoa <- log(df$pfoa_val + 1)
ggplot(df, aes(x = log_pfoa)) +
geom_histogram(binwidth = 0.2, fill = "darkgreen", color = "black") +
labs(title = "Distribution of Log-transformed PFOA Concentration",
x = "Log(PFOA Concentration)",
y = "Frequency")
# For pfoa_detected (binary detection status)
ggplot(df, aes(x = pfoa_detected, fill = pfoa_detected)) +
geom_bar() +
labs(title = "PFOA Detection Status",
x = "Detected (1) / Not Detected (0)",
y = "Count") +
scale_fill_viridis_d()
# Original skewness
skewness(df$pfoa_val)
## [1] 33.9364
# Log-transformed skewness
skewness(log(df$pfoa_val))
## [1] 2.461264
# QQ plot for raw pfoa_val
qq_pfoa <- ggplot(df, aes(sample = pfoa_val)) +
stat_qq(color = "steelblue") +
stat_qq_line(color = "red", linetype = "dashed") +
labs(title = "Q-Q Plot of PFOA Concentration")
# QQ plot for log-transformed pfoa_val
qq_log_pfoa <- ggplot(df, aes(sample = log_pfoa)) +
stat_qq(color = "darkgreen") +
stat_qq_line(color = "red", linetype = "dashed") +
labs(title = "Q-Q Plot of Log_pfoa")
# Display side by side
gridExtra::grid.arrange(qq_pfoa, qq_log_pfoa, ncol = 2)
#Compare log(PFOA) by detection status
ggplot(df, aes(x = factor(pfoa_detected), y = log_pfoa, fill = factor(pfoa_detected))) +
geom_boxplot() +
labs(title = "Log(PFOA) by Detection Status",
x = "Detected (0 = No, 1 = Yes)",
y = "Log(PFOA Concentration)") +
scale_fill_manual(values = c("red", "blue"))
df %>%
group_by(state) %>%
summarise(
total_pfoa = sum(pfoa_val, na.rm = TRUE),
avg_pfoa = mean(pfoa_val, na.rm = TRUE),
n_samples = n()
) %>%
arrange(desc(total_pfoa))
## # A tibble: 32 × 4
## state total_pfoa avg_pfoa n_samples
## <fct> <dbl> <dbl> <int>
## 1 California 1900. 3.28 580
## 2 West Virginia 1704. 284. 6
## 3 Ohio 439. 4.93 89
## 4 Florida 361. 4.76 76
## 5 Massachusetts 335. 8.38 40
## 6 Tennessee 152. 2.77 55
## 7 Alabama 144. 3.06 47
## 8 Louisiana 143. 2.75 52
## 9 Utah 104. 2.73 38
## 10 Mississippi 103. 3.23 32
## # ℹ 22 more rows
# Debug: Check unique states in your data
cat("Unique states in your data:\n")
## Unique states in your data:
unique_states <- unique(df$state)
print(sort(unique_states))
## [1] Alabama California Colorado Delaware Florida
## [6] Georgia Illinois Indiana Kansas Kentucky
## [11] Louisiana Maine Maryland Massachusetts Mississippi
## [16] New Hampshire New Jersey New Mexico New York North Carolina
## [21] Ohio Oklahoma Oregon Pennsylvania Rhode Island
## [26] South Carolina Tennessee Texas Utah Virginia
## [31] West Virginia Wisconsin
## 32 Levels: Alabama California Colorado Delaware Florida Georgia ... Wisconsin
# Create comprehensive state abbreviation lookup
state_abbrev <- data.frame(
state = c("alabama", "california", "colorado", "delaware", "florida", "georgia",
"illinois", "indiana", "kansas", "kentucky", "louisiana",
"maine", "maryland", "massachusetts", "mississippi", "new hampshire",
"new jersey", "new mexico", "new york", "north carolina",
"ohio", "oklahoma", "oregon", "pennsylvania", "south carolina",
"tennessee", "texas", "utah", "virginia", "washington",
"west virginia", "wisconsin", "wyoming"),
abbrev = c("AL", "CA", "CO", "DE", "FL", "GA", "IL", "IN", "KS", "KY", "LA",
"ME", "MD", "MA", "MS", "NH", "NJ", "NM", "NY", "NC",
"OH", "OK", "OR", "PA", "SC", "TN", "TX", "UT", "VA", "WA",
"WV", "WI", "WY")
)
# Convert state names to lowercase
df$state <- tolower(df$state)
# Debug: Compute average PFOA concentration and check results
df_avg <- df %>%
group_by(state) %>%
summarise(
avg_pfoa = mean(pfoa_val, na.rm = TRUE),
n_samples = n(),
.groups = "drop"
) %>%
arrange(desc(avg_pfoa))
cat("\nAverage PFOA by state (top 10):\n")
##
## Average PFOA by state (top 10):
print(head(df_avg, 10))
## # A tibble: 10 × 3
## state avg_pfoa n_samples
## <chr> <dbl> <int>
## 1 west virginia 284. 6
## 2 massachusetts 8.38 40
## 3 georgia 5.33 11
## 4 rhode island 5.29 3
## 5 ohio 4.93 89
## 6 florida 4.76 76
## 7 kentucky 4.50 7
## 8 new hampshire 4.37 11
## 9 new mexico 4 1
## 10 oregon 4 3
# Check if West Virginia is in the averages
wv_data <- df_avg %>% filter(state == "west virginia")
cat("\nWest Virginia data:\n")
##
## West Virginia data:
print(wv_data)
## # A tibble: 1 × 3
## state avg_pfoa n_samples
## <chr> <dbl> <int>
## 1 west virginia 284. 6
# Load US map data
us_states <- map_data("state")
# Debug: Check if West Virginia is in map data
wv_map_check <- us_states %>% filter(region == "west virginia")
cat("\nWest Virginia in map data:", nrow(wv_map_check), "rows\n")
##
## West Virginia in map data: 373 rows
# Merge map data with averages
map_df <- left_join(us_states, df_avg, by = c("region" = "state"))
# Debug: Check merged data for West Virginia
wv_merged <- map_df %>% filter(region == "west virginia")
cat("West Virginia in merged data:", nrow(wv_merged), "rows\n")
## West Virginia in merged data: 373 rows
# Compute centroids for label placement
centroids <- map_df %>%
filter(!is.na(avg_pfoa)) %>%
group_by(region) %>%
summarise(
lon = mean(long, na.rm = TRUE),
lat = mean(lat, na.rm = TRUE),
avg_pfoa = first(avg_pfoa),
.groups = "drop"
) %>%
left_join(state_abbrev, by = c("region" = "state")) %>%
filter(!is.na(abbrev)) %>%
mutate(
label = paste0(abbrev, " ", sprintf("%.2f", avg_pfoa))
)
cat("\nNumber of centroids calculated:", nrow(centroids), "\n")
##
## Number of centroids calculated: 31
print(sort(centroids$region))
## [1] "alabama" "california" "colorado" "delaware"
## [5] "florida" "georgia" "illinois" "indiana"
## [9] "kansas" "kentucky" "louisiana" "maine"
## [13] "maryland" "massachusetts" "mississippi" "new hampshire"
## [17] "new jersey" "new mexico" "new york" "north carolina"
## [21] "ohio" "oklahoma" "oregon" "pennsylvania"
## [25] "south carolina" "tennessee" "texas" "utah"
## [29] "virginia" "west virginia" "wisconsin"
# Check if West Virginia made it to centroids
wv_centroid <- centroids %>% filter(region == "west virginia")
cat("\nWest Virginia centroid data:\n")
##
## West Virginia centroid data:
print(wv_centroid)
## # A tibble: 1 × 6
## region lon lat avg_pfoa abbrev label
## <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 west virginia -80.3 38.8 284. WV WV 284.04
# Plot
ggplot(map_df, aes(long, lat, group = group, fill = avg_pfoa)) +
geom_polygon(color = "gray90", size = 0.3) +
geom_text(data = centroids, aes(x = lon, y = lat, label = label),
inherit.aes = FALSE, color = "black", size = 2.5, fontface = "bold",
check_overlap = TRUE) +
scale_fill_gradient(low = "#c6dbef", high = "#08306b", na.value = "white", name = "Avg PFOA") +
coord_fixed(1.3) +
theme_void() +
labs(title = "Average PFOA Concentrations by State") +
theme(
plot.title = element_text(hjust = 0.5, face = "bold")
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
df_summary <- df %>%
group_by(state) %>%
summarise(
total_pfoa = sum(pfoa_val, na.rm = TRUE),
avg_pfoa = mean(pfoa_val, na.rm = TRUE),
n_samples = n()
) %>%
arrange(desc(total_pfoa))
df_summary
## # A tibble: 32 × 4
## state total_pfoa avg_pfoa n_samples
## <chr> <dbl> <dbl> <int>
## 1 california 1900. 3.28 580
## 2 west virginia 1704. 284. 6
## 3 ohio 439. 4.93 89
## 4 florida 361. 4.76 76
## 5 massachusetts 335. 8.38 40
## 6 tennessee 152. 2.77 55
## 7 alabama 144. 3.06 47
## 8 louisiana 143. 2.75 52
## 9 utah 104. 2.73 38
## 10 mississippi 103. 3.23 32
## # ℹ 22 more rows
# Fix 1: Add Rhode Island to state abbreviation lookup
state_abbrev <- data.frame(
state = c("alabama", "california", "colorado", "delaware", "florida", "georgia", "illinois", "indiana", "kansas", "kentucky", "louisiana",
"maine", "maryland", "massachusetts", "mississippi", "new hampshire",
"new jersey", "new mexico", "new york", "north carolina", "ohio", "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina",
"tennessee", "texas", "utah", "virginia", "washington", "west virginia", "wisconsin", "wyoming"),
abbrev = c("AL", "CA", "CO", "DE", "FL", "GA", "IL", "IN", "KS", "KY", "LA", "ME", "MD", "MA", "MS", "NH", "NJ", "NM", "NY", "NC", "OH", "OK", "OR", "PA", "RI", "SC", "TN", "TX", "UT", "VA", "WA", "WV", "WI", "WY")
)
# Fix 2: Recalculate centroids with the complete lookup
centroids <- map_df %>%
filter(!is.na(avg_pfoa)) %>% # Only states with data
group_by(region) %>%
summarise(
lon = mean(long, na.rm = TRUE),
lat = mean(lat, na.rm = TRUE),
avg_pfoa = first(avg_pfoa),
.groups = "drop"
) %>%
left_join(state_abbrev, by = c("region" = "state")) %>%
filter(!is.na(abbrev)) %>%
mutate(
label = paste0(abbrev, " ", sprintf("%.2f", avg_pfoa))
)
cat("\nNumber of centroids after fix:", nrow(centroids), "\n")
##
## Number of centroids after fix: 32
print(sort(centroids$region))
## [1] "alabama" "california" "colorado" "delaware"
## [5] "florida" "georgia" "illinois" "indiana"
## [9] "kansas" "kentucky" "louisiana" "maine"
## [13] "maryland" "massachusetts" "mississippi" "new hampshire"
## [17] "new jersey" "new mexico" "new york" "north carolina"
## [21] "ohio" "oklahoma" "oregon" "pennsylvania"
## [25] "rhode island" "south carolina" "tennessee" "texas"
## [29] "utah" "virginia" "west virginia" "wisconsin"
# Fix 3: Create plot without check_overlap to show all labels
# Solution 1: Smaller font size with check_overlap
p3 <- ggplot(map_df, aes(long, lat, group = group, fill = avg_pfoa)) +
geom_polygon(color = "gray90", size = 0.3) +
geom_text(data = centroids, aes(x = lon, y = lat, label = label),
inherit.aes = FALSE, color = "black", size = 2.0, fontface = "bold",
check_overlap = TRUE) +
scale_fill_gradient(low = "#c6dbef", high = "#08306b", na.value = "white", name = "Avg PFOA") +
coord_fixed(1.3) +
theme_void() +
labs(title = "Average PFOA Concentrations by State (Smaller Font)") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# Fix 4: Alternative with check_overlap for cleaner look
# First identify which states are getting hidden
all_states <- centroids$region
visible_states_clean <- c() # You'd need to manually identify from your clean plot
# Create manual adjustments for small northeastern states
centroids_adjusted <- centroids %>%
mutate(
lon_adj = case_when(
region == "rhode island" ~ lon + 1.5, # Move RI label east
region == "new hampshire" ~ lon + 1.0, # Move NH label east
region == "massachusetts" ~ lon + 1.0, # Move MA label east
region == "new jersey" ~ lon + 0.8, # Move NJ label east
region == "delaware" ~ lon + 1.2, # Move DE label east
TRUE ~ lon
),
lat_adj = case_when(
region == "rhode island" ~ lat - 0.5, # Move RI label south
region == "new hampshire" ~ lat + 0.5, # Move NH label north
region == "massachusetts" ~ lat - 0.8, # Move MA label south
TRUE ~ lat
)
)
centroids_sized <- centroids %>%
mutate(
is_northeast = region %in% c("rhode island", "massachusetts", "new hampshire",
"new jersey", "delaware", "maryland"),
label_size = ifelse(is_northeast, 1.8, 2.3)
)
# Show plot
print(p3)
# Fix 5: Create a summary table of all states with data
summary_table <- df_avg %>%
left_join(state_abbrev, by = c("state" = "state")) %>%
arrange(desc(avg_pfoa)) %>%
select(state, abbrev, avg_pfoa, n_samples) %>%
mutate(across(where(is.numeric), round, 2))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.numeric), round, 2)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
cat("\nComplete summary of all", nrow(summary_table), "states with PFOA data:\n")
##
## Complete summary of all 32 states with PFOA data:
print(summary_table)
## # A tibble: 32 × 4
## state abbrev avg_pfoa n_samples
## <chr> <chr> <dbl> <dbl>
## 1 west virginia WV 284. 6
## 2 massachusetts MA 8.38 40
## 3 georgia GA 5.33 11
## 4 rhode island RI 5.29 3
## 5 ohio OH 4.93 89
## 6 florida FL 4.76 76
## 7 kentucky KY 4.5 7
## 8 new hampshire NH 4.37 11
## 9 new mexico NM 4 1
## 10 oregon OR 4 3
## # ℹ 22 more rows
# solution: Remove check_overlap entirely and use smaller, uniform font
p_all_visible <- ggplot(map_df, aes(long, lat, group = group, fill = avg_pfoa)) +
geom_polygon(color = "gray90", size = 0.3) +
geom_text(data = centroids, aes(x = lon, y = lat, label = label),
inherit.aes = FALSE, color = "black", size = 1.9, fontface = "bold") +
scale_fill_gradient(low = "#c6dbef", high = "#08306b", na.value = "white", name = "Avg PFOA") +
coord_fixed(1.3) +
theme_void() +
labs(title = "Average PFOA Concentrations by State (32 States)") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
print(p_all_visible)
# Count labels to verify all 32 are showing
cat("Total number of labels that should appear:", nrow(centroids), "\n")
## Total number of labels that should appear: 32
# Show the exact positions of the missing states
missing_likely <- centroids %>%
filter(region %in% c("rhode island", "indiana")) %>%
select(region, abbrev, lon, lat, avg_pfoa, label)
print(missing_likely)
## # A tibble: 2 × 6
## region abbrev lon lat avg_pfoa label
## <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 indiana IN -86.7 38.8 2.89 IN 2.89
## 2 rhode island RI -71.5 41.6 5.29 RI 5.29
# Create focused water use categories
df_filtered <- df[df$water_use != "Unknown", ]
# Recode water use categories to focus on 4 main categories
df_filtered$water_use_grouped <- as.character(df_filtered$water_use)
df_filtered$water_use_grouped[df_filtered$water_use_grouped == "Water supply, other"] <- "Water Supply"
# Keep only the 4 main categories (excluding Unknown and Observation)
df_filtered <- df_filtered[df_filtered$water_use_grouped %in% c("Production", "Domestic", "Irrigation", "Water Supply"), ]
# Calculate mean PFOA levels by category
pfoa_summary <- aggregate(pfoa_val ~ water_use_grouped, data = df_filtered,
FUN = function(x) c(mean_pfoa = round(mean(x), 2),
count = length(x),
se = round(sd(x)/sqrt(length(x)), 2)))
pfoa_summary_df <- data.frame(
category = pfoa_summary$water_use_grouped,
mean_pfoa = pfoa_summary$pfoa_val[,1],
count = pfoa_summary$pfoa_val[,2],
se = pfoa_summary$pfoa_val[,3]
)
# Create bar chart with ggplot2
library(ggplot2)
p3 <- ggplot(pfoa_summary_df, aes(x = category, y = mean_pfoa, fill = category)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste("Category:", category, "\nMean PFOA:", mean_pfoa)),
vjust = 0.5, color = "white", size = 3, fontface = "bold") +
scale_fill_manual(values = c('#2E86AB', '#A23B72', '#F18F01', '#4CAF50')) +
labs(title = "Mean PFOA Contamination by Water Use Category",
x = "Water Use Category",
y = "Mean PFOA Level") +
theme_minimal() +
theme(legend.position = "none")
print(p3)
p4 <- df %>%
filter(!is.na(tritium_category) & tritium_category != "unknown") %>%
group_by(tritium_category) %>%
summarise(
mean_pfoa = mean(pfoa_val, na.rm = TRUE),
mean_well_depth = mean(well_depth, na.rm = TRUE),
mean_cancer_rate = mean(cancer_rate, na.rm = TRUE),
sample_count = n(),
.groups = 'drop'
)
# Define consistent colors for each category
category_colors <- c("Mixed" = "#66C2A5", "PreModern" = "#FC8D62", "Unknown" = "#8DA0CB", "Modern" = "#E78AC3")
# Create separate plots for each metric - much clearer!
plot_pfoa <- p4 %>%
ggplot(aes(x = reorder(tritium_category, mean_pfoa), y = mean_pfoa, fill = tritium_category)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = round(mean_pfoa, 1)), vjust = -0.5, hjust = 0.5,
fontface = "bold", size = 3.5, color = "black") +
scale_fill_manual(values = category_colors) +
scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +
labs(title = "Mean PFOA Levels", x = "", y = "PFOA Level") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
plot.title = element_text(size = 14, face = "bold"))
plot_depth <- p4 %>%
ggplot(aes(x = reorder(tritium_category, mean_well_depth), y = mean_well_depth, fill = tritium_category)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = round(mean_well_depth, 0)), vjust = -0.5, hjust = 0.5,
fontface = "bold", size = 3.5, color = "black") +
scale_fill_manual(values = category_colors) + # Use consistent colors
scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +
labs(title = "Mean Well Depth", x = "", y = "Depth (feet)") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
plot.title = element_text(size = 14, face = "bold"))
plot_cancer <- p4 %>%
ggplot(aes(x = reorder(tritium_category, mean_cancer_rate), y = mean_cancer_rate, fill = tritium_category)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = paste0(round(mean_cancer_rate * 100, 1))), vjust = -0.5, hjust = 0.5,
fontface = "bold", size = 3.5, color = "black") +
scale_fill_manual(values = category_colors) +
scale_y_continuous(expand = expansion(mult = c(0, 0.25)),
labels = scales::percent_format()) +
labs(title = "Mean Cancer Rate", x = "Water Age Category", y = "Cancer Rate (%)") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
plot.title = element_text(size = 14, face = "bold"))
# Arrange in a 1x3 grid (since we removed the count plot)
grid.arrange(plot_pfoa, plot_depth, plot_cancer, ncol = 3, nrow = 1)
# Print the summary table for reference
print(p4)
## # A tibble: 4 × 5
## tritium_category mean_pfoa mean_well_depth mean_cancer_rate sample_count
## <fct> <dbl> <dbl> <dbl> <int>
## 1 Mixed 2.95 116. 0.495 283
## 2 Modern 6.51 51.7 0.559 654
## 3 PreModern 3.02 125. 0.542 184
## 4 Unknown 3.54 80.6 0.553 117
library(gridExtra)
# Plot: Mean PFOA
plot_pfoa <- p4 %>%
filter(tritium_category != "Unknown") %>%
ggplot(aes(x = reorder(tritium_category, mean_pfoa), y = mean_pfoa, fill = tritium_category)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = round(mean_pfoa, 1)), vjust = -0.5, hjust = 0.5,
fontface = "bold", size = 3.5, color = "black") +
scale_fill_manual(values = category_colors) +
scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +
labs(title = "Mean PFOA Levels", x = "", y = "PFOA Level") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
plot.title = element_text(size = 14, face = "bold"))
# Plot: Mean Well Depth
plot_depth <- p4 %>%
filter(tritium_category != "Unknown") %>%
ggplot(aes(x = reorder(tritium_category, mean_well_depth), y = mean_well_depth, fill = tritium_category)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = round(mean_well_depth, 0)), vjust = -0.5, hjust = 0.5,
fontface = "bold", size = 3.5, color = "black") +
scale_fill_manual(values = category_colors) +
scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +
labs(title = "Mean Well Depth", x = "", y = "Depth (feet)") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
plot.title = element_text(size = 14, face = "bold"))
# Plot: Mean Cancer Rate
plot_cancer <- p4 %>%
filter(tritium_category != "Unknown") %>%
ggplot(aes(x = reorder(tritium_category, mean_cancer_rate), y = mean_cancer_rate, fill = tritium_category)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = paste0(round(mean_cancer_rate * 100, 1))), vjust = -0.5, hjust = 0.5,
fontface = "bold", size = 3.5, color = "black") +
scale_fill_manual(values = category_colors) +
scale_y_continuous(expand = expansion(mult = c(0, 0.25)),
labels = scales::percent_format()) +
labs(title = "Mean Cancer Rate", x = "Water Age Category", y = "Cancer Rate (%)") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
plot.title = element_text(size = 14, face = "bold"))
# Arrange the plots in one row
grid.arrange(plot_pfoa, plot_depth, plot_cancer, ncol = 3, nrow = 1)
# Print the summary table for reference
print(p4)
## # A tibble: 4 × 5
## tritium_category mean_pfoa mean_well_depth mean_cancer_rate sample_count
## <fct> <dbl> <dbl> <dbl> <int>
## 1 Mixed 2.95 116. 0.495 283
## 2 Modern 6.51 51.7 0.559 654
## 3 PreModern 3.02 125. 0.542 184
## 4 Unknown 3.54 80.6 0.553 117
# PLOT 5: Seasonal Patterns and Water Use Analysis
# Monthly patterns
monthly_summary <- df %>%
group_by(month, water_use) %>%
summarise(
avg_pfoa = mean(pfoa_val, na.rm = TRUE),
count = n(),
.groups = 'drop'
) %>%
filter(count >= 5) # Only include combinations with sufficient data
# Convert month to factor with proper ordering
month_order <- c("January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December")
monthly_summary$month <- factor(monthly_summary$month, levels = month_order)
p4 <- monthly_summary %>%
ggplot(aes(x = month, y = avg_pfoa, color = water_use, group = water_use)) +
geom_line(linewidth = 1.2, alpha = 0.8) +
geom_point(size = 2.5) +
scale_color_brewer(type = "qual", palette = "Dark2", name = "Water Use Type") +
labs(
title = "Seasonal PFOA Contamination Patterns by Water Use",
x = "Month",
y = "Average PFOA Level"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p4)
# PLOT 5: Seasonal Patterns and Water Use Analysis (Filtered)
# Monthly patterns - excluding Unknown category
monthly_summary <- df %>%
filter(water_use != "Unknown") %>% # Filter out Unknown category
group_by(month, water_use) %>%
summarise(
avg_pfoa = mean(pfoa_val, na.rm = TRUE),
count = n(),
.groups = 'drop'
) %>%
filter(count >= 5) # Only include combinations with sufficient data
# Convert month to factor with proper ordering
month_order <- c("January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December")
monthly_summary$month <- factor(monthly_summary$month, levels = month_order)
# Create the plot without Unknown category
p4 <- monthly_summary %>%
ggplot(aes(x = month, y = avg_pfoa, color = water_use, group = water_use)) +
geom_line(linewidth = 1.2, alpha = 0.8) +
geom_point(size = 2.5) +
scale_color_brewer(type = "qual", palette = "Dark2", name = "Water Use Type") +
labs(
title = "Monthly Average PFOA Levels by Water Use Category (ng/L, 2019-2022)",
x = "Month",
y = "Average PFOA Level"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p4)
# Optional: Check what categories remain after filtering
cat("Remaining water use categories after filtering:\n")
## Remaining water use categories after filtering:
remaining_categories <- monthly_summary %>%
distinct(water_use) %>%
pull(water_use)
print(sort(remaining_categories))
## [1] Domestic Irrigation Observation Production
## 6 Levels: Domestic Irrigation Observation Production ... Water supply, other
df$state <- as.factor(df$state)
str(df)
## 'data.frame': 1238 obs. of 23 variables:
## $ pfoa_val : num 4 4 4 3.7 4 3.7 4 3.7 4 3.7 ...
## $ pfoa_detected : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ state : Factor w/ 32 levels "alabama","california",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ water_use : Factor w/ 6 levels "Domestic","Irrigation",..: 4 6 1 1 1 1 1 1 6 1 ...
## $ tritium_tu : num -0.04 -0.03 -0.2 1.96 1.4 2.12 -0.07 0.19 2.4 0.55 ...
## $ well_depth : num 152.4 141.7 77.7 54 64 ...
## $ tritium_category : Factor w/ 4 levels "Mixed","Modern",..: 3 3 3 2 2 2 3 1 2 1 ...
## $ month : Factor w/ 12 levels "April","August",..: 2 7 7 6 6 6 6 6 6 6 ...
## $ median_income : num 50536 50536 50536 50536 50536 ...
## $ poverty_rate : num 16.7 16.7 16.7 16.7 16.7 ...
## $ unemployment_rate : num 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 5.89 ...
## $ pct_white : num 68.1 68.1 68.1 68.1 68.1 ...
## $ pct_black : num 26.6 26.6 26.6 26.6 26.6 ...
## $ pct_hispanic : num 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 4.28 ...
## $ pct_bachelor_plus : num 23.9 23.9 23.9 23.9 23.9 ...
## $ pct_children : num 22.5 22.5 22.5 22.5 22.5 ...
## $ pct_adults : num 61 61 61 61 61 ...
## $ pct_older : num 16.5 16.5 16.5 16.5 16.5 ...
## $ pop_density_per_sq_mile: num 96.3 96.3 96.3 96.3 96.3 ...
## $ longitude : num -86.8 -86.8 -86.8 -86.8 -86.8 ...
## $ latitude : num 32.8 32.8 32.8 32.8 32.8 ...
## $ cancer_rate : num 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
## $ log_pfoa : num 1.61 1.61 1.61 1.55 1.61 ...
library(corrplot)
library(ggplot2)
library(dplyr)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:maps':
##
## map
## The following object is masked from 'package:car':
##
## some
# DUAL CORRELATION ANALYSIS
# 1. Select variables (excluding pfoa_val to avoid redundancy with log_pfoa)
numeric_vars <- df %>%
select(log_pfoa, tritium_tu, well_depth, median_income,
poverty_rate, unemployment_rate, pct_white, pct_black, pct_hispanic, pct_bachelor_plus, pct_children, pct_adults, pct_older,
pop_density_per_sq_mile, cancer_rate) %>%
na.omit()
# 2. ANALYSIS 1: log_PFOA as Response Variable
# Extract correlations with log_pfoa
log_pfoa_correlations <- cor(numeric_vars, use = "complete.obs")[,"log_pfoa"] %>%
as.data.frame() %>%
rename(correlation = 1) %>%
mutate(variable = rownames(.)) %>%
filter(variable != "log_pfoa") %>%
arrange(desc(abs(correlation)))
# Create bar plot for log_PFOA correlations
p_log_pfoa <- log_pfoa_correlations %>%
mutate(
direction = ifelse(correlation > 0, "Positive", "Negative"),
variable = reorder(variable, correlation),
abs_corr = abs(correlation)
) %>%
ggplot(aes(x = variable, y = correlation, fill = direction)) +
geom_col(alpha = 0.8) +
scale_fill_manual(values = c("Negative" = "#d73027", "Positive" = "#1a9850")) +
coord_flip() +
labs(
title = "Correlations with log(PFOA) - Response Variable Analysis",
x = "Predictor Variables",
y = "Correlation Coefficient",
fill = "Direction"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "bottom"
) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5)
print(p_log_pfoa)
# Print top correlations with log_PFOA
cat("=== LOG_PFOA AS RESPONSE VARIABLE ===\n")
## === LOG_PFOA AS RESPONSE VARIABLE ===
cat("Top 5 Positive Correlations with log_PFOA:\n")
## Top 5 Positive Correlations with log_PFOA:
positive_log <- log_pfoa_correlations %>%
filter(correlation > 0) %>%
head(5) %>%
mutate(correlation = round(correlation, 4))
print(positive_log)
## correlation variable
## pct_white 0.2264 pct_white
## tritium_tu 0.2119 tritium_tu
## pct_older 0.1623 pct_older
## pop_density_per_sq_mile 0.1347 pop_density_per_sq_mile
## poverty_rate 0.0867 poverty_rate
cat("\nTop 5 Negative Correlations with log_PFOA:\n")
##
## Top 5 Negative Correlations with log_PFOA:
negative_log <- log_pfoa_correlations %>%
filter(correlation < 0) %>%
head(5) %>%
mutate(correlation = round(correlation, 4))
print(negative_log)
## correlation variable
## median_income -0.1922 median_income
## pct_children -0.1692 pct_children
## pct_hispanic -0.1144 pct_hispanic
## well_depth -0.0917 well_depth
## pct_bachelor_plus -0.0838 pct_bachelor_plus
# 3. ANALYSIS 2: Predictor-Only Correlation Matrix
# Remove log_pfoa to focus on predictors only
predictors_only <- numeric_vars %>%
select(-log_pfoa)
predictor_cor_matrix <- cor(predictors_only, use = "complete.obs")
# Create correlation plot for predictors only
corrplot(predictor_cor_matrix, method = "color", type = "upper",
order = "hclust", tl.cex = 0.7, tl.col = "black", tl.srt = 45,
col = colorRampPalette(c("#053061", "#2166ac", "#4393c3", "#92c5de",
"#d1e5f0", "#ffffff", "#fddbc7", "#f4a582",
"#d6604d", "#b2182b", "#67001f"))(200),
title = "Predictor Variables Correlation Matrix", mar = c(0,0,2,0))
# 4. Find strongest predictor-predictor correlations
predictor_pairs <- which(abs(predictor_cor_matrix) > 0.7 &
predictor_cor_matrix != 1, arr.ind = TRUE)
if(nrow(predictor_pairs) > 0) {
strong_correlations <- data.frame(
var1 = rownames(predictor_cor_matrix)[predictor_pairs[,1]],
var2 = colnames(predictor_cor_matrix)[predictor_pairs[,2]],
correlation = predictor_cor_matrix[predictor_pairs]
) %>%
arrange(desc(abs(correlation))) %>%
mutate(correlation = round(correlation, 4))
cat("\n=== PREDICTOR-PREDICTOR RELATIONSHIPS ===\n")
cat("Strong Correlations (|r| > 0.7) Among Predictors:\n")
print(strong_correlations)
} else {
cat("\n=== PREDICTOR-PREDICTOR RELATIONSHIPS ===\n")
cat("No correlations > 0.7 found among predictors\n")
}
##
## === PREDICTOR-PREDICTOR RELATIONSHIPS ===
## Strong Correlations (|r| > 0.7) Among Predictors:
## var1 var2 correlation
## 1 pct_bachelor_plus median_income 0.8618
## 2 median_income pct_bachelor_plus 0.8618
## 3 pct_bachelor_plus poverty_rate -0.8576
## 4 poverty_rate pct_bachelor_plus -0.8576
## 5 pct_white unemployment_rate -0.7858
## 6 unemployment_rate pct_white -0.7858
## 7 pct_older pct_children -0.7716
## 8 pct_children pct_older -0.7716
## 9 pct_hispanic pct_white -0.7494
## 10 pct_white pct_hispanic -0.7494
## 11 pct_black poverty_rate 0.7374
## 12 poverty_rate pct_black 0.7374
## 13 pct_adults median_income 0.7344
## 14 median_income pct_adults 0.7344
## 15 poverty_rate median_income -0.7261
## 16 median_income poverty_rate -0.7261
## 17 pct_hispanic median_income 0.7195
## 18 median_income pct_hispanic 0.7195
# 5. Moderate correlations among predictors (0.5 to 0.7)
moderate_pairs <- which(abs(predictor_cor_matrix) > 0.5 &
abs(predictor_cor_matrix) <= 0.7 &
predictor_cor_matrix != 1, arr.ind = TRUE)
if(nrow(moderate_pairs) > 0) {
moderate_correlations <- data.frame(
var1 = rownames(predictor_cor_matrix)[moderate_pairs[,1]],
var2 = colnames(predictor_cor_matrix)[moderate_pairs[,2]],
correlation = predictor_cor_matrix[moderate_pairs]
) %>%
arrange(desc(abs(correlation))) %>%
mutate(correlation = round(correlation, 4))
cat("\nModerate Correlations (0.5 < |r| ≤ 0.7) Among Predictors:\n")
print(head(moderate_correlations, 10))
}
##
## Moderate Correlations (0.5 < |r| ≤ 0.7) Among Predictors:
## var1 var2 correlation
## 1 pct_adults pct_hispanic 0.6564
## 2 pct_hispanic pct_adults 0.6564
## 3 pct_adults pct_bachelor_plus 0.6494
## 4 pct_bachelor_plus pct_adults 0.6494
## 5 pct_black median_income -0.6430
## 6 median_income pct_black -0.6430
## 7 pct_hispanic pct_black -0.6316
## 8 pct_black pct_hispanic -0.6316
## 9 pct_white median_income -0.6176
## 10 median_income pct_white -0.6176
# 6. Statistical significance for log_PFOA correlations
log_pfoa_sig_tests <- predictors_only %>%
map_dfr(~{
test_result <- cor.test(numeric_vars$log_pfoa, .x)
data.frame(
correlation = test_result$estimate,
p_value = test_result$p.value,
significant = test_result$p.value < 0.05,
conf_low = test_result$conf.int[1],
conf_high = test_result$conf.int[2]
)
}, .id = "variable") %>%
arrange(desc(abs(correlation))) %>%
mutate(across(where(is.numeric), round, 4))
cat("\n=== STATISTICAL SIGNIFICANCE ===\n")
##
## === STATISTICAL SIGNIFICANCE ===
cat("Statistically Significant Correlations with log_PFOA (p < 0.05):\n")
## Statistically Significant Correlations with log_PFOA (p < 0.05):
significant_log <- log_pfoa_sig_tests %>%
filter(significant == TRUE)
print(significant_log)
## variable correlation p_value significant conf_low
## cor...1 pct_white 0.2264 0.0000 TRUE 0.1729
## cor...2 tritium_tu 0.2119 0.0000 TRUE 0.1581
## cor...3 median_income -0.1922 0.0000 TRUE -0.2453
## cor...4 pct_children -0.1692 0.0000 TRUE -0.2229
## cor...5 pct_older 0.1623 0.0000 TRUE 0.1076
## cor...6 pop_density_per_sq_mile 0.1347 0.0000 TRUE 0.0795
## cor...7 pct_hispanic -0.1144 0.0001 TRUE -0.1690
## cor...8 well_depth -0.0917 0.0012 TRUE -0.1467
## cor...9 poverty_rate 0.0867 0.0023 TRUE 0.0312
## cor...10 pct_bachelor_plus -0.0838 0.0032 TRUE -0.1388
## cor...11 cancer_rate 0.0729 0.0102 TRUE 0.0173
## conf_high
## cor...1 0.2786
## cor...2 0.2645
## cor...3 -0.1379
## cor...4 -0.1146
## cor...5 0.2161
## cor...6 0.1890
## cor...7 -0.0591
## cor...8 -0.0362
## cor...9 0.1418
## cor...10 -0.0282
## cor...11 0.1281
# Select only numeric columns and exclude unwanted variables
numeric_data <- df %>%
select(where(is.numeric)) %>%
select(-pfoa_val, -latitude, -longitude) # Remove these specific columns
# Calculate correlation matrix
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Plot correlation heatmap
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.cex = 0.8,
col = colorRampPalette(c("blue", "white", "red"))(200),
title = "Correlation Matrix with log_pfoa", mar = c(0,0,1,0))
#Filter out states with less than equal to 3 observations
df_filtered <- df %>%
group_by(state) %>%
filter(n() >= 3) %>%
ungroup()
# Count how many unique states remain after filtering
states_remaining <- df_filtered %>%
distinct(state) %>%
nrow()
# Display the result
cat("Number of states remaining after filtering:", states_remaining, "\n")
## Number of states remaining after filtering: 30
# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks maps::map()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
# 1. OVERALL ANALYSIS: PFOA Detection vs Average Cancer Rate
# Summary statistics by PFOA detection status
# Note: cancer_rate is already a calculated average for each area
summary_stats <- df_filtered %>%
group_by(pfoa_detected) %>%
summarise(
n_observations = n(),
mean_cancer_rate = mean(cancer_rate, na.rm = TRUE),
sd_cancer_rate = sd(cancer_rate, na.rm = TRUE),
median_cancer_rate = median(cancer_rate, na.rm = TRUE),
min_cancer_rate = min(cancer_rate, na.rm = TRUE),
max_cancer_rate = max(cancer_rate, na.rm = TRUE),
.groups = 'drop'
)
print("Summary Statistics - Cancer Rates by PFOA Detection:")
## [1] "Summary Statistics - Cancer Rates by PFOA Detection:"
print(summary_stats)
## # A tibble: 2 × 7
## pfoa_detected n_observations mean_cancer_rate sd_cancer_rate
## <fct> <int> <dbl> <dbl>
## 1 0 126 0.570 0.181
## 2 1 1109 0.538 0.187
## # ℹ 3 more variables: median_cancer_rate <dbl>, min_cancer_rate <dbl>,
## # max_cancer_rate <dbl>
# T-test for difference in cancer rates between detected vs not detected areas
t_test_result <- t.test(cancer_rate ~ pfoa_detected, data = df_filtered)
print("\nT-test Results - PFOA Detection vs Cancer Rates:")
## [1] "\nT-test Results - PFOA Detection vs Cancer Rates:"
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: cancer_rate by pfoa_detected
## t = 1.85, df = 156.86, p-value = 0.0662
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.002140528 0.065390274
## sample estimates:
## mean in group 0 mean in group 1
## 0.5699206 0.5382958
# Calculate effect size (Cohen's d)
library(effectsize)
## Warning: package 'effectsize' was built under R version 4.4.3
cohens_d <- cohens_d(cancer_rate ~ pfoa_detected, data = df_filtered)
print("\nEffect Size (Cohen's d):")
## [1] "\nEffect Size (Cohen's d):"
print(cohens_d)
## Cohen's d | 95% CI
## -------------------------
## 0.17 | [-0.01, 0.35]
##
## - Estimated using pooled SD.
p-value = 0.0662 (not statistically significant at α = 0.05) The difference in cancer rates between PFOA detected vs not detected groups is not statistically significant
To examine if there is a statistical significant relationship between log_pfoa and cancer_rate?
# OVERALL ANALYSIS: Log PFOA vs Cancer Rate
# Basic correlation
correlation_result <- cor.test(df_filtered$log_pfoa, df_filtered$cancer_rate, method = "pearson")
print("Correlation between log_pfoa and cancer_rate:")
## [1] "Correlation between log_pfoa and cancer_rate:"
print(correlation_result)
##
## Pearson's product-moment correlation
##
## data: df_filtered$log_pfoa and df_filtered$cancer_rate
## t = 2.5551, df = 1233, p-value = 0.01074
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01685972 0.12783734
## sample estimates:
## cor
## 0.07257316
# Simple linear regression
lm_simple <- lm(cancer_rate ~ log_pfoa, data = df_filtered)
print(summary(lm_simple))
##
## Call:
## lm(formula = cancer_rate ~ log_pfoa, data = df_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18955 -0.08401 -0.07536 0.04333 1.03328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.50337 0.01584 31.771 <2e-16 ***
## log_pfoa 0.02789 0.01092 2.555 0.0107 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1863 on 1233 degrees of freedom
## Multiple R-squared: 0.005267, Adjusted R-squared: 0.00446
## F-statistic: 6.528 on 1 and 1233 DF, p-value: 0.01074
Higher PFOA levels are associated with higher cancer rates This is a dose-response relationship - more PFOA exposure correlates with more cancer
# Load required libraries
library(gt)
## Warning: package 'gt' was built under R version 4.4.3
library(tidyverse)
# Extract results from your linear regression
lm_simple <- lm(cancer_rate ~ log_pfoa, data = df_filtered)
summary_results <- summary(lm_simple)
# Create a clean results table
regression_table <- tibble(
Variable = c("Intercept", "Log PFOA"),
Coefficient = c(summary_results$coefficients[1,1], summary_results$coefficients[2,1]),
`Std Error` = c(summary_results$coefficients[1,2], summary_results$coefficients[2,2]),
`t-value` = c(summary_results$coefficients[1,3], summary_results$coefficients[2,3]),
`p-value` = c(summary_results$coefficients[1,4], summary_results$coefficients[2,4]),
Significance = case_when(
`p-value` < 0.001 ~ "***",
`p-value` < 0.01 ~ "**",
`p-value` < 0.05 ~ "*",
`p-value` < 0.1 ~ ".",
TRUE ~ ""
)
)
# Create model statistics table
model_stats <- tibble(
Statistic = c("R-squared", "Adjusted R-squared", "F-statistic", "p-value (model)", "Sample Size"),
Value = c(
summary_results$r.squared,
summary_results$adj.r.squared,
summary_results$fstatistic[1],
pf(summary_results$fstatistic[1], summary_results$fstatistic[2], summary_results$fstatistic[3], lower.tail = FALSE),
nrow(df_filtered)
)
)
# Create the main regression results table
gt_table1 <- regression_table %>%
gt() %>%
tab_header(
title = "Linear Regression Results: Cancer Rate vs Log PFOA",
subtitle = "Simple linear regression analysis (n = 1,235)"
) %>%
fmt_number(
columns = c(Coefficient, `Std Error`),
decimals = 5
) %>%
fmt_number(
columns = `t-value`,
decimals = 3
) %>%
fmt_scientific(
columns = `p-value`,
decimals = 3
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(
columns = everything(),
rows = Variable == "Log PFOA"
)
) %>%
tab_style(
style = cell_fill(color = "lightblue"),
locations = cells_body(
columns = everything(),
rows = Variable == "Log PFOA"
)
) %>%
tab_footnote(
footnote = "*** p < 0.001, ** p < 0.01, * p < 0.05, . p < 0.1",
locations = cells_column_labels(columns = Significance)
) %>%
tab_source_note(
source_note = "Data: Water sampling locations across 30 states (2019-2022)"
)
print(gt_table1)
## <div id="uwjpqaiwyw" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
## <style>#uwjpqaiwyw table {
## font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
## -webkit-font-smoothing: antialiased;
## -moz-osx-font-smoothing: grayscale;
## }
##
## #uwjpqaiwyw thead, #uwjpqaiwyw tbody, #uwjpqaiwyw tfoot, #uwjpqaiwyw tr, #uwjpqaiwyw td, #uwjpqaiwyw th {
## border-style: none;
## }
##
## #uwjpqaiwyw p {
## margin: 0;
## padding: 0;
## }
##
## #uwjpqaiwyw .gt_table {
## display: table;
## border-collapse: collapse;
## line-height: normal;
## margin-left: auto;
## margin-right: auto;
## color: #333333;
## font-size: 16px;
## font-weight: normal;
## font-style: normal;
## background-color: #FFFFFF;
## width: auto;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #A8A8A8;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #A8A8A8;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_caption {
## padding-top: 4px;
## padding-bottom: 4px;
## }
##
## #uwjpqaiwyw .gt_title {
## color: #333333;
## font-size: 125%;
## font-weight: initial;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-color: #FFFFFF;
## border-bottom-width: 0;
## }
##
## #uwjpqaiwyw .gt_subtitle {
## color: #333333;
## font-size: 85%;
## font-weight: initial;
## padding-top: 3px;
## padding-bottom: 5px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-color: #FFFFFF;
## border-top-width: 0;
## }
##
## #uwjpqaiwyw .gt_heading {
## background-color: #FFFFFF;
## text-align: center;
## border-bottom-color: #FFFFFF;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_bottom_border {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_col_headings {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_col_heading {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 6px;
## padding-left: 5px;
## padding-right: 5px;
## overflow-x: hidden;
## }
##
## #uwjpqaiwyw .gt_column_spanner_outer {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## padding-top: 0;
## padding-bottom: 0;
## padding-left: 4px;
## padding-right: 4px;
## }
##
## #uwjpqaiwyw .gt_column_spanner_outer:first-child {
## padding-left: 0;
## }
##
## #uwjpqaiwyw .gt_column_spanner_outer:last-child {
## padding-right: 0;
## }
##
## #uwjpqaiwyw .gt_column_spanner {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 5px;
## overflow-x: hidden;
## display: inline-block;
## width: 100%;
## }
##
## #uwjpqaiwyw .gt_spanner_row {
## border-bottom-style: hidden;
## }
##
## #uwjpqaiwyw .gt_group_heading {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## text-align: left;
## }
##
## #uwjpqaiwyw .gt_empty_group_heading {
## padding: 0.5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: middle;
## }
##
## #uwjpqaiwyw .gt_from_md > :first-child {
## margin-top: 0;
## }
##
## #uwjpqaiwyw .gt_from_md > :last-child {
## margin-bottom: 0;
## }
##
## #uwjpqaiwyw .gt_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## margin: 10px;
## border-top-style: solid;
## border-top-width: 1px;
## border-top-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## overflow-x: hidden;
## }
##
## #uwjpqaiwyw .gt_stub {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #uwjpqaiwyw .gt_stub_row_group {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## vertical-align: top;
## }
##
## #uwjpqaiwyw .gt_row_group_first td {
## border-top-width: 2px;
## }
##
## #uwjpqaiwyw .gt_row_group_first th {
## border-top-width: 2px;
## }
##
## #uwjpqaiwyw .gt_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #uwjpqaiwyw .gt_first_summary_row {
## border-top-style: solid;
## border-top-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_first_summary_row.thick {
## border-top-width: 2px;
## }
##
## #uwjpqaiwyw .gt_last_summary_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_grand_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #uwjpqaiwyw .gt_first_grand_summary_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-style: double;
## border-top-width: 6px;
## border-top-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_last_grand_summary_row_top {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: double;
## border-bottom-width: 6px;
## border-bottom-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_striped {
## background-color: rgba(128, 128, 128, 0.05);
## }
##
## #uwjpqaiwyw .gt_table_body {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_footnotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_footnote {
## margin: 0px;
## font-size: 90%;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #uwjpqaiwyw .gt_sourcenotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #uwjpqaiwyw .gt_sourcenote {
## font-size: 90%;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #uwjpqaiwyw .gt_left {
## text-align: left;
## }
##
## #uwjpqaiwyw .gt_center {
## text-align: center;
## }
##
## #uwjpqaiwyw .gt_right {
## text-align: right;
## font-variant-numeric: tabular-nums;
## }
##
## #uwjpqaiwyw .gt_font_normal {
## font-weight: normal;
## }
##
## #uwjpqaiwyw .gt_font_bold {
## font-weight: bold;
## }
##
## #uwjpqaiwyw .gt_font_italic {
## font-style: italic;
## }
##
## #uwjpqaiwyw .gt_super {
## font-size: 65%;
## }
##
## #uwjpqaiwyw .gt_footnote_marks {
## font-size: 75%;
## vertical-align: 0.4em;
## position: initial;
## }
##
## #uwjpqaiwyw .gt_asterisk {
## font-size: 100%;
## vertical-align: 0;
## }
##
## #uwjpqaiwyw .gt_indent_1 {
## text-indent: 5px;
## }
##
## #uwjpqaiwyw .gt_indent_2 {
## text-indent: 10px;
## }
##
## #uwjpqaiwyw .gt_indent_3 {
## text-indent: 15px;
## }
##
## #uwjpqaiwyw .gt_indent_4 {
## text-indent: 20px;
## }
##
## #uwjpqaiwyw .gt_indent_5 {
## text-indent: 25px;
## }
##
## #uwjpqaiwyw .katex-display {
## display: inline-flex !important;
## margin-bottom: 0.75em !important;
## }
##
## #uwjpqaiwyw div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
## height: 0px !important;
## }
## </style>
## <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
## <thead>
## <tr class="gt_heading">
## <td colspan="6" class="gt_heading gt_title gt_font_normal" style>Linear Regression Results: Cancer Rate vs Log PFOA</td>
## </tr>
## <tr class="gt_heading">
## <td colspan="6" class="gt_heading gt_subtitle gt_font_normal gt_bottom_border" style>Simple linear regression analysis (n = 1,235)</td>
## </tr>
## <tr class="gt_col_headings">
## <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="Variable">Variable</th>
## <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="Coefficient">Coefficient</th>
## <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="Std-Error">Std Error</th>
## <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="t-value">t-value</th>
## <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="p-value">p-value</th>
## <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="Significance">Significance<span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span></th>
## </tr>
## </thead>
## <tbody class="gt_table_body">
## <tr><td headers="Variable" class="gt_row gt_left">Intercept</td>
## <td headers="Coefficient" class="gt_row gt_right">0.50337</td>
## <td headers="Std Error" class="gt_row gt_right">0.01584</td>
## <td headers="t-value" class="gt_row gt_right">31.771</td>
## <td headers="p-value" class="gt_row gt_right">2.462 × 10<sup style='font-size: 65%;'>−162</sup></td>
## <td headers="Significance" class="gt_row gt_right">***</td></tr>
## <tr><td headers="Variable" class="gt_row gt_left" style="font-weight: bold; background-color: #ADD8E6;">Log PFOA</td>
## <td headers="Coefficient" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">0.02789</td>
## <td headers="Std Error" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">0.01092</td>
## <td headers="t-value" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">2.555</td>
## <td headers="p-value" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">1.074 × 10<sup style='font-size: 65%;'>−2</sup></td>
## <td headers="Significance" class="gt_row gt_right" style="font-weight: bold; background-color: #ADD8E6;">*</td></tr>
## </tbody>
## <tfoot class="gt_sourcenotes">
## <tr>
## <td class="gt_sourcenote" colspan="6">Data: Water sampling locations across 30 states (2019-2022)</td>
## </tr>
## </tfoot>
## <tfoot class="gt_footnotes">
## <tr>
## <td class="gt_footnote" colspan="6"><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span> *** p < 0.001, ** p < 0.01, * p < 0.05, . p < 0.1</td>
## </tr>
## </tfoot>
## </table>
## </div>
# Create model statistics table
gt_table2 <- model_stats %>%
gt() %>%
tab_header(
title = "Model Performance Statistics"
) %>%
fmt_number(
columns = Value,
decimals = 4,
rows = 1:3
) %>%
fmt_scientific(
columns = Value,
decimals = 3,
rows = 4
) %>%
fmt_number(
columns = Value,
decimals = 0,
rows = 5
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(
rows = Statistic %in% c("p-value (model)", "R-squared")
)
)
print(gt_table2)
## <div id="cccxryjpbj" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
## <style>#cccxryjpbj table {
## font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
## -webkit-font-smoothing: antialiased;
## -moz-osx-font-smoothing: grayscale;
## }
##
## #cccxryjpbj thead, #cccxryjpbj tbody, #cccxryjpbj tfoot, #cccxryjpbj tr, #cccxryjpbj td, #cccxryjpbj th {
## border-style: none;
## }
##
## #cccxryjpbj p {
## margin: 0;
## padding: 0;
## }
##
## #cccxryjpbj .gt_table {
## display: table;
## border-collapse: collapse;
## line-height: normal;
## margin-left: auto;
## margin-right: auto;
## color: #333333;
## font-size: 16px;
## font-weight: normal;
## font-style: normal;
## background-color: #FFFFFF;
## width: auto;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #A8A8A8;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #A8A8A8;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_caption {
## padding-top: 4px;
## padding-bottom: 4px;
## }
##
## #cccxryjpbj .gt_title {
## color: #333333;
## font-size: 125%;
## font-weight: initial;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-color: #FFFFFF;
## border-bottom-width: 0;
## }
##
## #cccxryjpbj .gt_subtitle {
## color: #333333;
## font-size: 85%;
## font-weight: initial;
## padding-top: 3px;
## padding-bottom: 5px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-color: #FFFFFF;
## border-top-width: 0;
## }
##
## #cccxryjpbj .gt_heading {
## background-color: #FFFFFF;
## text-align: center;
## border-bottom-color: #FFFFFF;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_bottom_border {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_col_headings {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_col_heading {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 6px;
## padding-left: 5px;
## padding-right: 5px;
## overflow-x: hidden;
## }
##
## #cccxryjpbj .gt_column_spanner_outer {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## padding-top: 0;
## padding-bottom: 0;
## padding-left: 4px;
## padding-right: 4px;
## }
##
## #cccxryjpbj .gt_column_spanner_outer:first-child {
## padding-left: 0;
## }
##
## #cccxryjpbj .gt_column_spanner_outer:last-child {
## padding-right: 0;
## }
##
## #cccxryjpbj .gt_column_spanner {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 5px;
## overflow-x: hidden;
## display: inline-block;
## width: 100%;
## }
##
## #cccxryjpbj .gt_spanner_row {
## border-bottom-style: hidden;
## }
##
## #cccxryjpbj .gt_group_heading {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## text-align: left;
## }
##
## #cccxryjpbj .gt_empty_group_heading {
## padding: 0.5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: middle;
## }
##
## #cccxryjpbj .gt_from_md > :first-child {
## margin-top: 0;
## }
##
## #cccxryjpbj .gt_from_md > :last-child {
## margin-bottom: 0;
## }
##
## #cccxryjpbj .gt_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## margin: 10px;
## border-top-style: solid;
## border-top-width: 1px;
## border-top-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## overflow-x: hidden;
## }
##
## #cccxryjpbj .gt_stub {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cccxryjpbj .gt_stub_row_group {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## vertical-align: top;
## }
##
## #cccxryjpbj .gt_row_group_first td {
## border-top-width: 2px;
## }
##
## #cccxryjpbj .gt_row_group_first th {
## border-top-width: 2px;
## }
##
## #cccxryjpbj .gt_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cccxryjpbj .gt_first_summary_row {
## border-top-style: solid;
## border-top-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_first_summary_row.thick {
## border-top-width: 2px;
## }
##
## #cccxryjpbj .gt_last_summary_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_grand_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cccxryjpbj .gt_first_grand_summary_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-style: double;
## border-top-width: 6px;
## border-top-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_last_grand_summary_row_top {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: double;
## border-bottom-width: 6px;
## border-bottom-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_striped {
## background-color: rgba(128, 128, 128, 0.05);
## }
##
## #cccxryjpbj .gt_table_body {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_footnotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_footnote {
## margin: 0px;
## font-size: 90%;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cccxryjpbj .gt_sourcenotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #cccxryjpbj .gt_sourcenote {
## font-size: 90%;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cccxryjpbj .gt_left {
## text-align: left;
## }
##
## #cccxryjpbj .gt_center {
## text-align: center;
## }
##
## #cccxryjpbj .gt_right {
## text-align: right;
## font-variant-numeric: tabular-nums;
## }
##
## #cccxryjpbj .gt_font_normal {
## font-weight: normal;
## }
##
## #cccxryjpbj .gt_font_bold {
## font-weight: bold;
## }
##
## #cccxryjpbj .gt_font_italic {
## font-style: italic;
## }
##
## #cccxryjpbj .gt_super {
## font-size: 65%;
## }
##
## #cccxryjpbj .gt_footnote_marks {
## font-size: 75%;
## vertical-align: 0.4em;
## position: initial;
## }
##
## #cccxryjpbj .gt_asterisk {
## font-size: 100%;
## vertical-align: 0;
## }
##
## #cccxryjpbj .gt_indent_1 {
## text-indent: 5px;
## }
##
## #cccxryjpbj .gt_indent_2 {
## text-indent: 10px;
## }
##
## #cccxryjpbj .gt_indent_3 {
## text-indent: 15px;
## }
##
## #cccxryjpbj .gt_indent_4 {
## text-indent: 20px;
## }
##
## #cccxryjpbj .gt_indent_5 {
## text-indent: 25px;
## }
##
## #cccxryjpbj .katex-display {
## display: inline-flex !important;
## margin-bottom: 0.75em !important;
## }
##
## #cccxryjpbj div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
## height: 0px !important;
## }
## </style>
## <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
## <thead>
## <tr class="gt_heading">
## <td colspan="2" class="gt_heading gt_title gt_font_normal gt_bottom_border" style>Model Performance Statistics</td>
## </tr>
##
## <tr class="gt_col_headings">
## <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="Statistic">Statistic</th>
## <th class="gt_col_heading gt_columns_bottom_border gt_right" rowspan="1" colspan="1" scope="col" id="Value">Value</th>
## </tr>
## </thead>
## <tbody class="gt_table_body">
## <tr><td headers="Statistic" class="gt_row gt_left" style="font-weight: bold;">R-squared</td>
## <td headers="Value" class="gt_row gt_right" style="font-weight: bold;">0.0053</td></tr>
## <tr><td headers="Statistic" class="gt_row gt_left">Adjusted R-squared</td>
## <td headers="Value" class="gt_row gt_right">0.0045</td></tr>
## <tr><td headers="Statistic" class="gt_row gt_left">F-statistic</td>
## <td headers="Value" class="gt_row gt_right">6.5284</td></tr>
## <tr><td headers="Statistic" class="gt_row gt_left" style="font-weight: bold;">p-value (model)</td>
## <td headers="Value" class="gt_row gt_right" style="font-weight: bold;">1.074 × 10<sup style='font-size: 65%;'>−2</sup></td></tr>
## <tr><td headers="Statistic" class="gt_row gt_left">Sample Size</td>
## <td headers="Value" class="gt_row gt_right">1,235</td></tr>
## </tbody>
##
##
## </table>
## </div>
# Create interpretation summary table
interpretation_table <- tibble(
`Key Finding` = c(
"Statistical Significance",
"Direction of Relationship",
"Effect Size",
"Practical Interpretation",
"Model Fit"
),
`Result` = c(
"p = 0.011 (statistically significant)",
"Positive association (β = 0.02789)",
"Small but consistent effect",
"1-unit increase in log PFOA → 0.028 increase in cancer rate",
"R² = 0.53% (small effect, large sample)"
)
)
gt_table3 <- interpretation_table %>%
gt() %>%
tab_header(
title = "Key Findings Summary",
subtitle = "Log PFOA levels and cancer rate relationship"
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = `Key Finding`)
) %>%
tab_style(
style = cell_fill(color = "lightyellow"),
locations = cells_body(rows = `Key Finding` == "Statistical Significance")
)
print(gt_table3)
## <div id="wmedzglkcm" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
## <style>#wmedzglkcm table {
## font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
## -webkit-font-smoothing: antialiased;
## -moz-osx-font-smoothing: grayscale;
## }
##
## #wmedzglkcm thead, #wmedzglkcm tbody, #wmedzglkcm tfoot, #wmedzglkcm tr, #wmedzglkcm td, #wmedzglkcm th {
## border-style: none;
## }
##
## #wmedzglkcm p {
## margin: 0;
## padding: 0;
## }
##
## #wmedzglkcm .gt_table {
## display: table;
## border-collapse: collapse;
## line-height: normal;
## margin-left: auto;
## margin-right: auto;
## color: #333333;
## font-size: 16px;
## font-weight: normal;
## font-style: normal;
## background-color: #FFFFFF;
## width: auto;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #A8A8A8;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #A8A8A8;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_caption {
## padding-top: 4px;
## padding-bottom: 4px;
## }
##
## #wmedzglkcm .gt_title {
## color: #333333;
## font-size: 125%;
## font-weight: initial;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-color: #FFFFFF;
## border-bottom-width: 0;
## }
##
## #wmedzglkcm .gt_subtitle {
## color: #333333;
## font-size: 85%;
## font-weight: initial;
## padding-top: 3px;
## padding-bottom: 5px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-color: #FFFFFF;
## border-top-width: 0;
## }
##
## #wmedzglkcm .gt_heading {
## background-color: #FFFFFF;
## text-align: center;
## border-bottom-color: #FFFFFF;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_bottom_border {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_col_headings {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_col_heading {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 6px;
## padding-left: 5px;
## padding-right: 5px;
## overflow-x: hidden;
## }
##
## #wmedzglkcm .gt_column_spanner_outer {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## padding-top: 0;
## padding-bottom: 0;
## padding-left: 4px;
## padding-right: 4px;
## }
##
## #wmedzglkcm .gt_column_spanner_outer:first-child {
## padding-left: 0;
## }
##
## #wmedzglkcm .gt_column_spanner_outer:last-child {
## padding-right: 0;
## }
##
## #wmedzglkcm .gt_column_spanner {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 5px;
## overflow-x: hidden;
## display: inline-block;
## width: 100%;
## }
##
## #wmedzglkcm .gt_spanner_row {
## border-bottom-style: hidden;
## }
##
## #wmedzglkcm .gt_group_heading {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## text-align: left;
## }
##
## #wmedzglkcm .gt_empty_group_heading {
## padding: 0.5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: middle;
## }
##
## #wmedzglkcm .gt_from_md > :first-child {
## margin-top: 0;
## }
##
## #wmedzglkcm .gt_from_md > :last-child {
## margin-bottom: 0;
## }
##
## #wmedzglkcm .gt_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## margin: 10px;
## border-top-style: solid;
## border-top-width: 1px;
## border-top-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## overflow-x: hidden;
## }
##
## #wmedzglkcm .gt_stub {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #wmedzglkcm .gt_stub_row_group {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## vertical-align: top;
## }
##
## #wmedzglkcm .gt_row_group_first td {
## border-top-width: 2px;
## }
##
## #wmedzglkcm .gt_row_group_first th {
## border-top-width: 2px;
## }
##
## #wmedzglkcm .gt_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #wmedzglkcm .gt_first_summary_row {
## border-top-style: solid;
## border-top-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_first_summary_row.thick {
## border-top-width: 2px;
## }
##
## #wmedzglkcm .gt_last_summary_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_grand_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #wmedzglkcm .gt_first_grand_summary_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-style: double;
## border-top-width: 6px;
## border-top-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_last_grand_summary_row_top {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: double;
## border-bottom-width: 6px;
## border-bottom-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_striped {
## background-color: rgba(128, 128, 128, 0.05);
## }
##
## #wmedzglkcm .gt_table_body {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_footnotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_footnote {
## margin: 0px;
## font-size: 90%;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #wmedzglkcm .gt_sourcenotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #wmedzglkcm .gt_sourcenote {
## font-size: 90%;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #wmedzglkcm .gt_left {
## text-align: left;
## }
##
## #wmedzglkcm .gt_center {
## text-align: center;
## }
##
## #wmedzglkcm .gt_right {
## text-align: right;
## font-variant-numeric: tabular-nums;
## }
##
## #wmedzglkcm .gt_font_normal {
## font-weight: normal;
## }
##
## #wmedzglkcm .gt_font_bold {
## font-weight: bold;
## }
##
## #wmedzglkcm .gt_font_italic {
## font-style: italic;
## }
##
## #wmedzglkcm .gt_super {
## font-size: 65%;
## }
##
## #wmedzglkcm .gt_footnote_marks {
## font-size: 75%;
## vertical-align: 0.4em;
## position: initial;
## }
##
## #wmedzglkcm .gt_asterisk {
## font-size: 100%;
## vertical-align: 0;
## }
##
## #wmedzglkcm .gt_indent_1 {
## text-indent: 5px;
## }
##
## #wmedzglkcm .gt_indent_2 {
## text-indent: 10px;
## }
##
## #wmedzglkcm .gt_indent_3 {
## text-indent: 15px;
## }
##
## #wmedzglkcm .gt_indent_4 {
## text-indent: 20px;
## }
##
## #wmedzglkcm .gt_indent_5 {
## text-indent: 25px;
## }
##
## #wmedzglkcm .katex-display {
## display: inline-flex !important;
## margin-bottom: 0.75em !important;
## }
##
## #wmedzglkcm div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
## height: 0px !important;
## }
## </style>
## <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
## <thead>
## <tr class="gt_heading">
## <td colspan="2" class="gt_heading gt_title gt_font_normal" style>Key Findings Summary</td>
## </tr>
## <tr class="gt_heading">
## <td colspan="2" class="gt_heading gt_subtitle gt_font_normal gt_bottom_border" style>Log PFOA levels and cancer rate relationship</td>
## </tr>
## <tr class="gt_col_headings">
## <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="Key-Finding">Key Finding</th>
## <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="Result">Result</th>
## </tr>
## </thead>
## <tbody class="gt_table_body">
## <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold; background-color: #FFFFE0;">Statistical Significance</td>
## <td headers="Result" class="gt_row gt_left" style="background-color: #FFFFE0;">p = 0.011 (statistically significant)</td></tr>
## <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold;">Direction of Relationship</td>
## <td headers="Result" class="gt_row gt_left">Positive association (β = 0.02789)</td></tr>
## <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold;">Effect Size</td>
## <td headers="Result" class="gt_row gt_left">Small but consistent effect</td></tr>
## <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold;">Practical Interpretation</td>
## <td headers="Result" class="gt_row gt_left">1-unit increase in log PFOA → 0.028 increase in cancer rate</td></tr>
## <tr><td headers="Key Finding" class="gt_row gt_left" style="font-weight: bold;">Model Fit</td>
## <td headers="Result" class="gt_row gt_left">R² = 0.53% (small effect, large sample)</td></tr>
## </tbody>
##
##
## </table>
## </div>
# Print code to save tables for PowerPoint
cat("\n# To save tables for PowerPoint, run:\n")
##
## # To save tables for PowerPoint, run:
cat("gtsave(gt_table1, 'regression_results.png', vwidth = 800, vheight = 400)\n")
## gtsave(gt_table1, 'regression_results.png', vwidth = 800, vheight = 400)
cat("gtsave(gt_table2, 'model_stats.png', vwidth = 600, vheight = 300)\n")
## gtsave(gt_table2, 'model_stats.png', vwidth = 600, vheight = 300)
cat("gtsave(gt_table3, 'interpretation.png', vwidth = 800, vheight = 350)\n")
## gtsave(gt_table3, 'interpretation.png', vwidth = 800, vheight = 350)
# Extract results from your linear regression
lm_simple <- lm(cancer_rate ~ log_pfoa, data = df_filtered)
summary_results <- summary(lm_simple)
# Create a clean results dataframe
regression_results <- data.frame(
Variable = c("Intercept", "Log PFOA"),
Coefficient = round(summary_results$coefficients[,1], 5),
Std_Error = round(summary_results$coefficients[,2], 5),
t_value = round(summary_results$coefficients[,3], 3),
p_value = round(summary_results$coefficients[,4], 4),
Significance = c(
ifelse(summary_results$coefficients[1,4] < 0.001, "***",
ifelse(summary_results$coefficients[1,4] < 0.01, "**",
ifelse(summary_results$coefficients[1,4] < 0.05, "*", ""))),
ifelse(summary_results$coefficients[2,4] < 0.001, "***",
ifelse(summary_results$coefficients[2,4] < 0.01, "**",
ifelse(summary_results$coefficients[2,4] < 0.05, "*", "")))
)
)
# Print the table
cat("Linear Regression: Cancer Rate ~ Log PFOA\n")
## Linear Regression: Cancer Rate ~ Log PFOA
cat("==========================================\n")
## ==========================================
print(regression_results, row.names = FALSE)
## Variable Coefficient Std_Error t_value p_value Significance
## Intercept 0.50337 0.01584 31.771 0.0000 ***
## Log PFOA 0.02789 0.01092 2.555 0.0107 *
# Add model statistics
cat("\nModel Statistics:\n")
##
## Model Statistics:
cat("=================\n")
## =================
cat("R-squared:", round(summary_results$r.squared, 6), "\n")
## R-squared: 0.005267
cat("Adjusted R-squared:", round(summary_results$adj.r.squared, 6), "\n")
## Adjusted R-squared: 0.00446
cat("F-statistic:", round(summary_results$fstatistic[1], 3),
"on", summary_results$fstatistic[2], "and", summary_results$fstatistic[3], "DF\n")
## F-statistic: 6.528 on 1 and 1233 DF
cat("p-value:", round(pf(summary_results$fstatistic[1],
summary_results$fstatistic[2],
summary_results$fstatistic[3],
lower.tail = FALSE), 4), "\n")
## p-value: 0.0107
cat("Sample size:", nrow(df_filtered), "\n")
## Sample size: 1235
# Create a copy-paste friendly version
cat("\n\nCopy-Paste Friendly Table:\n")
##
##
## Copy-Paste Friendly Table:
cat("==========================\n")
## ==========================
cat("Variable\tCoefficient\tStd Error\tt-value\tp-value\tSignificance\n")
## Variable Coefficient Std Error t-value p-value Significance
cat("Intercept\t0.50337\t0.01584\t31.771\t<0.0001\t***\n")
## Intercept 0.50337 0.01584 31.771 <0.0001 ***
cat("Log PFOA\t0.02789\t0.01092\t2.555\t0.0107\t*\n")
## Log PFOA 0.02789 0.01092 2.555 0.0107 *
cat("\n")
cat("R² = 0.005267, F = 6.528, p = 0.0107, n = 1,235\n")
## R² = 0.005267, F = 6.528, p = 0.0107, n = 1,235
# Step 1: Fit a full model with all predictors
full_model <- lm(log_pfoa ~ .-pfoa_val-pfoa_detected-tritium_category-month-state-latitude-longitude, data = df_filtered)
# Step 2: Fit a null model with only the intercept
null_model <- lm(log_pfoa ~ 1, data = df_filtered)
# Step 3: Perform stepwise regression (both directions)
step_model <- step(null_model,
scope = list(lower = null_model, upper = full_model),
direction = "both",
trace = TRUE)
## Start: AIC=-1782.11
## log_pfoa ~ 1
##
## Df Sum of Sq RSS AIC
## + pct_white 1 15.1094 276.15 -1845.9
## + tritium_tu 1 13.0066 278.25 -1836.5
## + median_income 1 10.6492 280.61 -1826.1
## + pct_children 1 8.4056 282.85 -1816.3
## + pct_older 1 7.5947 283.66 -1812.7
## + pop_density_per_sq_mile 1 5.2520 286.00 -1802.6
## + pct_hispanic 1 3.8754 287.38 -1796.7
## + well_depth 1 2.4368 288.82 -1790.5
## + water_use 5 4.2522 287.00 -1790.3
## + poverty_rate 1 2.0759 289.18 -1789.0
## + pct_bachelor_plus 1 1.9224 289.33 -1788.3
## + cancer_rate 1 1.5340 289.72 -1786.6
## + unemployment_rate 1 0.8110 290.44 -1783.6
## <none> 291.25 -1782.1
## + pct_adults 1 0.0950 291.16 -1780.5
## + pct_black 1 0.0199 291.24 -1780.2
##
## Step: AIC=-1845.9
## log_pfoa ~ pct_white
##
## Df Sum of Sq RSS AIC
## + unemployment_rate 1 12.2045 263.94 -1899.7
## + pct_children 1 7.2066 268.94 -1876.6
## + pct_adults 1 6.8976 269.25 -1875.1
## + water_use 5 6.9649 269.18 -1867.5
## + pop_density_per_sq_mile 1 4.9470 271.20 -1866.2
## + tritium_tu 1 4.9158 271.23 -1866.1
## + poverty_rate 1 2.4262 273.72 -1854.8
## + pct_hispanic 1 2.1002 274.05 -1853.3
## + pct_older 1 1.3170 274.83 -1849.8
## + median_income 1 1.1880 274.96 -1849.2
## + cancer_rate 1 0.5576 275.59 -1846.4
## <none> 276.15 -1845.9
## + pct_bachelor_plus 1 0.0660 276.08 -1844.2
## + pct_black 1 0.0406 276.11 -1844.1
## + well_depth 1 0.0002 276.14 -1843.9
## - pct_white 1 15.1094 291.25 -1782.1
##
## Step: AIC=-1899.73
## log_pfoa ~ pct_white + unemployment_rate
##
## Df Sum of Sq RSS AIC
## + pct_adults 1 9.3076 254.63 -1942.1
## + water_use 5 7.4193 256.52 -1924.9
## + pop_density_per_sq_mile 1 4.8511 259.09 -1920.6
## + tritium_tu 1 4.5882 259.35 -1919.4
## + pct_bachelor_plus 1 4.5610 259.38 -1919.3
## + pct_hispanic 1 3.4454 260.50 -1914.0
## + pct_children 1 3.3646 260.58 -1913.6
## + cancer_rate 1 1.6806 262.26 -1905.6
## + median_income 1 1.3161 262.62 -1903.9
## + pct_black 1 1.1865 262.75 -1903.3
## + poverty_rate 1 0.9063 263.04 -1902.0
## + well_depth 1 0.4402 263.50 -1899.8
## <none> 263.94 -1899.7
## + pct_older 1 0.0005 263.94 -1897.7
## - unemployment_rate 1 12.2045 276.15 -1845.9
## - pct_white 1 26.5029 290.44 -1783.6
##
## Step: AIC=-1942.07
## log_pfoa ~ pct_white + unemployment_rate + pct_adults
##
## Df Sum of Sq RSS AIC
## + tritium_tu 1 3.095 251.54 -1955.2
## + water_use 5 4.695 249.94 -1955.0
## + pop_density_per_sq_mile 1 1.121 253.51 -1945.5
## + cancer_rate 1 1.001 253.63 -1944.9
## + pct_older 1 0.953 253.68 -1944.7
## + pct_children 1 0.944 253.69 -1944.7
## + median_income 1 0.923 253.71 -1944.5
## + poverty_rate 1 0.732 253.90 -1943.6
## + pct_hispanic 1 0.581 254.05 -1942.9
## + pct_black 1 0.580 254.05 -1942.9
## <none> 254.63 -1942.1
## + well_depth 1 0.185 254.45 -1941.0
## + pct_bachelor_plus 1 0.007 254.63 -1940.1
## - pct_adults 1 9.308 263.94 -1899.7
## - unemployment_rate 1 14.615 269.25 -1875.1
## - pct_white 1 35.804 290.44 -1781.6
##
## Step: AIC=-1955.17
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu
##
## Df Sum of Sq RSS AIC
## + water_use 5 5.9181 245.62 -1974.6
## + median_income 1 1.3642 250.17 -1959.9
## + pct_older 1 1.3142 250.22 -1959.6
## + pct_children 1 1.3037 250.24 -1959.6
## + pop_density_per_sq_mile 1 1.2969 250.24 -1959.5
## + poverty_rate 1 1.0151 250.52 -1958.2
## + pct_black 1 0.8009 250.74 -1957.1
## + cancer_rate 1 0.7938 250.74 -1957.1
## + pct_hispanic 1 0.7356 250.80 -1956.8
## + well_depth 1 0.4848 251.05 -1955.5
## <none> 251.54 -1955.2
## + pct_bachelor_plus 1 0.0044 251.53 -1953.2
## - tritium_tu 1 3.0954 254.63 -1942.1
## - pct_adults 1 7.8148 259.35 -1919.4
## - unemployment_rate 1 14.1031 265.64 -1889.8
## - pct_white 1 26.6251 278.16 -1832.9
##
## Step: AIC=-1974.57
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu +
## water_use
##
## Df Sum of Sq RSS AIC
## + median_income 1 1.7237 243.90 -1981.3
## + poverty_rate 1 1.5606 244.06 -1980.5
## + pop_density_per_sq_mile 1 1.4992 244.12 -1980.1
## + pct_black 1 1.2272 244.39 -1978.8
## + pct_older 1 1.0279 244.59 -1977.8
## + pct_children 1 1.0195 244.60 -1977.7
## + cancer_rate 1 0.5733 245.05 -1975.5
## <none> 245.62 -1974.6
## + pct_hispanic 1 0.3589 245.26 -1974.4
## + well_depth 1 0.0441 245.58 -1972.8
## + pct_bachelor_plus 1 0.0001 245.62 -1972.6
## - water_use 5 5.9181 251.54 -1955.2
## - tritium_tu 1 4.3186 249.94 -1955.0
## - pct_adults 1 4.8119 250.43 -1952.6
## - unemployment_rate 1 13.8328 259.45 -1908.9
## - pct_white 1 26.6354 272.25 -1849.4
##
## Step: AIC=-1981.27
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu +
## water_use + median_income
##
## Df Sum of Sq RSS AIC
## + pop_density_per_sq_mile 1 3.5326 240.36 -1997.3
## + pct_bachelor_plus 1 2.5194 241.38 -1992.1
## + pct_hispanic 1 1.7058 242.19 -1987.9
## + pct_older 1 0.8769 243.02 -1983.7
## + pct_children 1 0.8698 243.03 -1983.7
## + cancer_rate 1 0.8366 243.06 -1983.5
## <none> 243.90 -1981.3
## + pct_black 1 0.0892 243.81 -1979.7
## + poverty_rate 1 0.0330 243.86 -1979.4
## + well_depth 1 0.0001 243.90 -1979.3
## - median_income 1 1.7237 245.62 -1974.6
## - unemployment_rate 1 4.3902 248.29 -1961.2
## - water_use 5 6.2776 250.17 -1959.9
## - tritium_tu 1 4.8903 248.79 -1958.8
## - pct_adults 1 6.4641 250.36 -1951.0
## - pct_white 1 9.1652 253.06 -1937.7
##
## Step: AIC=-1997.29
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu +
## water_use + median_income + pop_density_per_sq_mile
##
## Df Sum of Sq RSS AIC
## + pct_hispanic 1 2.5639 237.80 -2008.5
## + cancer_rate 1 1.1646 239.20 -2001.3
## + pct_black 1 0.3914 239.97 -1997.3
## <none> 240.36 -1997.3
## + pct_bachelor_plus 1 0.2840 240.08 -1996.8
## + poverty_rate 1 0.1358 240.23 -1996.0
## + well_depth 1 0.1345 240.23 -1996.0
## + pct_children 1 0.0002 240.36 -1995.3
## + pct_older 1 0.0001 240.36 -1995.3
## - unemployment_rate 1 1.9103 242.27 -1989.5
## - pop_density_per_sq_mile 1 3.5326 243.90 -1981.3
## - median_income 1 3.7572 244.12 -1980.1
## - pct_white 1 3.8675 244.23 -1979.6
## - pct_adults 1 4.8767 245.24 -1974.5
## - water_use 5 6.7945 247.16 -1972.9
## - tritium_tu 1 5.8518 246.22 -1969.6
##
## Step: AIC=-2008.54
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu +
## water_use + median_income + pop_density_per_sq_mile + pct_hispanic
##
## Df Sum of Sq RSS AIC
## + pct_black 1 2.8489 234.95 -2021.4
## + poverty_rate 1 2.1023 235.70 -2017.5
## + cancer_rate 1 0.9552 236.84 -2011.5
## <none> 237.80 -2008.5
## + pct_bachelor_plus 1 0.1382 237.66 -2007.2
## + well_depth 1 0.0811 237.72 -2007.0
## + pct_children 1 0.0002 237.80 -2006.5
## + pct_older 1 0.0001 237.80 -2006.5
## - unemployment_rate 1 0.8735 238.67 -2006.0
## - pct_hispanic 1 2.5639 240.36 -1997.3
## - pct_white 1 4.0455 241.84 -1989.7
## - water_use 5 5.9454 243.75 -1988.0
## - pop_density_per_sq_mile 1 4.3907 242.19 -1987.9
## - pct_adults 1 4.5531 242.35 -1987.1
## - median_income 1 6.0274 243.83 -1979.6
## - tritium_tu 1 6.5592 244.36 -1976.9
##
## Step: AIC=-2021.42
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu +
## water_use + median_income + pop_density_per_sq_mile + pct_hispanic +
## pct_black
##
## Df Sum of Sq RSS AIC
## + poverty_rate 1 2.2031 232.75 -2031.0
## - median_income 1 0.0176 234.97 -2023.3
## + cancer_rate 1 0.4757 234.47 -2021.9
## - pop_density_per_sq_mile 1 0.3349 235.29 -2021.7
## <none> 234.95 -2021.4
## + pct_bachelor_plus 1 0.1364 234.81 -2020.1
## + well_depth 1 0.1243 234.83 -2020.1
## + pct_children 1 0.0168 234.93 -2019.5
## + pct_older 1 0.0158 234.94 -2019.5
## - unemployment_rate 1 2.6586 237.61 -2009.5
## - pct_black 1 2.8489 237.80 -2008.5
## - water_use 5 5.1680 240.12 -2004.5
## - pct_hispanic 1 5.0214 239.97 -1997.3
## - pct_white 1 5.8468 240.80 -1993.1
## - pct_adults 1 6.6193 241.57 -1989.1
## - tritium_tu 1 7.5882 242.54 -1984.2
##
## Step: AIC=-2031.05
## log_pfoa ~ pct_white + unemployment_rate + pct_adults + tritium_tu +
## water_use + median_income + pop_density_per_sq_mile + pct_hispanic +
## pct_black + poverty_rate
##
## Df Sum of Sq RSS AIC
## <none> 232.75 -2031.0
## + cancer_rate 1 0.2280 232.52 -2030.3
## - unemployment_rate 1 0.5662 233.31 -2030.0
## + pct_bachelor_plus 1 0.1418 232.61 -2029.8
## - pop_density_per_sq_mile 1 0.6296 233.38 -2029.7
## + well_depth 1 0.0311 232.72 -2029.2
## + pct_older 1 0.0160 232.73 -2029.1
## + pct_children 1 0.0154 232.73 -2029.1
## - median_income 1 0.8614 233.61 -2028.5
## - poverty_rate 1 2.2031 234.95 -2021.4
## - pct_black 1 2.9496 235.70 -2017.5
## - water_use 5 5.6428 238.39 -2011.5
## - pct_adults 1 5.2936 238.04 -2005.3
## - pct_hispanic 1 6.6701 239.42 -1998.2
## - pct_white 1 7.6787 240.43 -1993.0
## - tritium_tu 1 8.0059 240.75 -1991.3
# Step 4: View the final selected model
summary(step_model)
##
## Call:
## lm(formula = log_pfoa ~ pct_white + unemployment_rate + pct_adults +
## tritium_tu + water_use + median_income + pop_density_per_sq_mile +
## pct_hispanic + pct_black + poverty_rate, data = df_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1277 -0.2191 -0.0722 0.1459 5.2367
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.085e+01 1.684e+00 -6.442 1.70e-10 ***
## pct_white 4.426e-02 6.976e-03 6.344 3.14e-10 ***
## unemployment_rate 7.965e-02 4.623e-02 1.723 0.08517 .
## pct_adults 1.004e-01 1.905e-02 5.268 1.63e-07 ***
## tritium_tu 2.880e-02 4.447e-03 6.478 1.34e-10 ***
## water_useIrrigation 9.020e-02 8.866e-02 1.017 0.30914
## water_useObservation -1.132e-01 4.328e-02 -2.616 0.00901 **
## water_useProduction 7.993e-02 3.111e-02 2.569 0.01030 *
## water_useUnknown -5.609e-02 9.292e-02 -0.604 0.54619
## water_useWater supply, other 4.219e-02 1.817e-01 0.232 0.81643
## median_income 1.128e-05 5.309e-06 2.125 0.03380 *
## pop_density_per_sq_mile 2.049e-04 1.128e-04 1.817 0.06951 .
## pct_hispanic 2.015e-02 3.408e-03 5.913 4.36e-09 ***
## pct_black 2.309e-02 5.873e-03 3.932 8.90e-05 ***
## poverty_rate 7.759e-02 2.283e-02 3.398 0.00070 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4368 on 1220 degrees of freedom
## Multiple R-squared: 0.2009, Adjusted R-squared: 0.1917
## F-statistic: 21.91 on 14 and 1220 DF, p-value: < 2.2e-16
library(car)
vif(step_model)
## GVIF Df GVIF^(1/(2*Df))
## pct_white 46.032167 1 6.784701
## unemployment_rate 8.854461 1 2.975645
## pct_adults 3.308788 1 1.819007
## tritium_tu 1.329555 1 1.153063
## water_use 1.767752 5 1.058625
## median_income 32.056732 1 5.661866
## pop_density_per_sq_mile 2.547550 1 1.596105
## pct_hispanic 18.252410 1 4.272284
## pct_black 17.484098 1 4.181399
## poverty_rate 17.068816 1 4.131442
model = lm(log_pfoa ~ pct_adults +
tritium_tu + water_use + median_income + pop_density_per_sq_mile +
pct_black, data = df_filtered)
summary(model)
##
## Call:
## lm(formula = log_pfoa ~ pct_adults + tritium_tu + water_use +
## median_income + pop_density_per_sq_mile + pct_black, data = df_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1687 -0.2436 -0.0625 0.1433 5.4976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.456e+00 9.922e-01 -2.475 0.01346 *
## pct_adults 8.177e-02 1.687e-02 4.848 1.41e-06 ***
## tritium_tu 2.692e-02 4.260e-03 6.319 3.69e-10 ***
## water_useIrrigation 8.407e-02 9.004e-02 0.934 0.35060
## water_useObservation -1.148e-01 4.225e-02 -2.716 0.00670 **
## water_useProduction 9.130e-02 3.067e-02 2.977 0.00297 **
## water_useUnknown 5.840e-03 9.363e-02 0.062 0.95027
## water_useWater supply, other 2.048e-02 1.844e-01 0.111 0.91159
## median_income -1.897e-05 1.675e-06 -11.322 < 2e-16 ***
## pop_density_per_sq_mile 5.663e-04 7.961e-05 7.114 1.91e-12 ***
## pct_black -8.156e-03 1.967e-03 -4.146 3.61e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4438 on 1224 degrees of freedom
## Multiple R-squared: 0.1722, Adjusted R-squared: 0.1655
## F-statistic: 25.47 on 10 and 1224 DF, p-value: < 2.2e-16
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## pct_adults 2.511911 1 1.584901
## tritium_tu 1.181861 1 1.087134
## water_use 1.554752 5 1.045120
## median_income 3.091922 1 1.758386
## pop_density_per_sq_mile 1.229472 1 1.108816
## pct_black 1.899671 1 1.378286
m=lm(formula = log_pfoa ~ pct_adults +
tritium_tu + water_use + median_income + pop_density_per_sq_mile +
pct_black, data = df_filtered)
summary(m)
##
## Call:
## lm(formula = log_pfoa ~ pct_adults + tritium_tu + water_use +
## median_income + pop_density_per_sq_mile + pct_black, data = df_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1687 -0.2436 -0.0625 0.1433 5.4976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.456e+00 9.922e-01 -2.475 0.01346 *
## pct_adults 8.177e-02 1.687e-02 4.848 1.41e-06 ***
## tritium_tu 2.692e-02 4.260e-03 6.319 3.69e-10 ***
## water_useIrrigation 8.407e-02 9.004e-02 0.934 0.35060
## water_useObservation -1.148e-01 4.225e-02 -2.716 0.00670 **
## water_useProduction 9.130e-02 3.067e-02 2.977 0.00297 **
## water_useUnknown 5.840e-03 9.363e-02 0.062 0.95027
## water_useWater supply, other 2.048e-02 1.844e-01 0.111 0.91159
## median_income -1.897e-05 1.675e-06 -11.322 < 2e-16 ***
## pop_density_per_sq_mile 5.663e-04 7.961e-05 7.114 1.91e-12 ***
## pct_black -8.156e-03 1.967e-03 -4.146 3.61e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4438 on 1224 degrees of freedom
## Multiple R-squared: 0.1722, Adjusted R-squared: 0.1655
## F-statistic: 25.47 on 10 and 1224 DF, p-value: < 2.2e-16
vif(m)
## GVIF Df GVIF^(1/(2*Df))
## pct_adults 2.511911 1 1.584901
## tritium_tu 1.181861 1 1.087134
## water_use 1.554752 5 1.045120
## median_income 3.091922 1 1.758386
## pop_density_per_sq_mile 1.229472 1 1.108816
## pct_black 1.899671 1 1.378286
# Load libraries
library(dplyr)
library(broom)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(car)
# Fit the model
m <- lm(log_pfoa ~ pct_adults + tritium_tu + water_use +
median_income + pop_density_per_sq_mile + pct_black,
data = df_filtered)
# Create tidy regression table with significance
reg_table <- tidy(m) %>%
mutate(Significance = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
p.value < 0.1 ~ ".",
TRUE ~ ""
),
is_significant = p.value < 0.01) # for coloring
# Create and display the styled regression table
reg_table %>%
select(term, estimate, std.error, statistic, p.value, Significance) %>%
kable(
digits = 4,
caption = "Regression Results",
col.names = c("Predictor", "Estimate", "Std. Error", "t value", "p-value", "Sig")
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(which(reg_table$is_significant), bold = TRUE, color = "black", background = "#dff0d8")
Predictor | Estimate | Std. Error | t value | p-value | Sig |
---|---|---|---|---|---|
(Intercept) | -2.4555 | 0.9922 | -2.4749 | 0.0135 |
|
pct_adults | 0.0818 | 0.0169 | 4.8478 | 0.0000 | *** |
tritium_tu | 0.0269 | 0.0043 | 6.3186 | 0.0000 | *** |
water_useIrrigation | 0.0841 | 0.0900 | 0.9338 | 0.3506 | |
water_useObservation | -0.1148 | 0.0423 | -2.7159 | 0.0067 | ** |
water_useProduction | 0.0913 | 0.0307 | 2.9773 | 0.0030 | ** |
water_useUnknown | 0.0058 | 0.0936 | 0.0624 | 0.9503 | |
water_useWater supply, other | 0.0205 | 0.1844 | 0.1111 | 0.9116 | |
median_income | 0.0000 | 0.0000 | -11.3216 | 0.0000 | *** |
pop_density_per_sq_mile | 0.0006 | 0.0001 | 7.1143 | 0.0000 | *** |
pct_black | -0.0082 | 0.0020 | -4.1464 | 0.0000 | *** |
# Load libraries
library(dplyr)
library(broom)
library(kableExtra)
library(car)
# Fit the model
m <- lm(log_pfoa ~ pct_adults + tritium_tu + water_use +
median_income + pop_density_per_sq_mile + pct_black,
data = df_filtered)
# Create tidy regression table with significance
reg_table <- tidy(m) %>%
mutate(Significance = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
p.value < 0.1 ~ ".",
TRUE ~ ""
),
is_significant = p.value < 0.01) # for coloring
# Filter for only the specific variables of interest
key_variables <- c("pct_adults", "tritium_tu", "water_useProduction",
"median_income", "pop_density_per_sq_mile")
# Create filtered table first
filtered_table <- reg_table %>%
filter(term %in% key_variables)
# Create and display the styled regression table
filtered_table %>%
select(term, estimate, std.error, statistic, p.value, Significance) %>%
kable(
digits = 4,
caption = "Key PFOA Contamination Predictors - Regression Results",
col.names = c("Predictor", "Estimate", "Std. Error", "t value", "p-value", "Sig")
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(which(filtered_table$is_significant),
bold = TRUE, color = "black", background = "#dff0d8")
Predictor | Estimate | Std. Error | t value | p-value | Sig |
---|---|---|---|---|---|
pct_adults | 0.0818 | 0.0169 | 4.8478 | 0.000 | *** |
tritium_tu | 0.0269 | 0.0043 | 6.3186 | 0.000 | *** |
water_useProduction | 0.0913 | 0.0307 | 2.9773 | 0.003 | ** |
median_income | 0.0000 | 0.0000 | -11.3216 | 0.000 | *** |
pop_density_per_sq_mile | 0.0006 | 0.0001 | 7.1143 | 0.000 | *** |
#AutoML for log_pfoa (Regression Task)
# Load the h2o library
library(h2o)
## Warning: package 'h2o' was built under R version 4.4.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Start H2O
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\doris\AppData\Local\Temp\RtmpQNrnuW\file5df85c16132/h2o_doris_started_from_r.out
## C:\Users\doris\AppData\Local\Temp\RtmpQNrnuW\file5df82ab1a74/h2o_doris_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 172 milliseconds
## H2O cluster timezone: America/Chicago
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 9 months and 21 days
## H2O cluster name: H2O_started_from_R_doris_oru630
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.88 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 9 months and 21 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert your dataframe to an H2OFrame
df_h2o <- as.h2o(df_filtered)
## | | | 0% | |======================================================================| 100%
# Define response and predictors
variables_to_remove <- c("latitude", "longitude", "tritium_category", "pfoa_val", "pfoa_detected", "log_pfoa", "month")
x_reg <- setdiff(names(df_filtered), variables_to_remove)
y_reg <- "log_pfoa"
# Print predictor information
cat("Predictors used:", paste(x_reg, collapse = ", "), "\n")
## Predictors used: state, water_use, tritium_tu, well_depth, median_income, poverty_rate, unemployment_rate, pct_white, pct_black, pct_hispanic, pct_bachelor_plus, pct_children, pct_adults, pct_older, pop_density_per_sq_mile, cancer_rate
cat("Number of predictors:", length(x_reg), "\n")
## Number of predictors: 16
# Split the data
splits_reg <- h2o.splitFrame(data = df_h2o, ratios = 0.7, seed = 123)
train_reg <- splits_reg[[1]]
test_reg <- splits_reg[[2]]
# Run AutoML
aml_reg <- h2o.automl(
x = x_reg,
y = y_reg,
training_frame = train_reg,
leaderboard_frame = test_reg,
max_models = 20,
seed = 1234
)
## | | | 0% | |== | 3%
## 18:23:38.394: AutoML: XGBoost is not available; skipping it. | |========== | 15% | |=================== | 26% | |=============================== | 44% | |===================================== | 53% | |=========================================== | 62% | |====================================================== | 76% | |======================================================================| 100%
# Show full leaderboard
lb_reg <- h2o.get_leaderboard(aml_reg, extra_columns = "ALL")
print(lb_reg, n = nrow(lb_reg))
## model_id rmse mse
## 1 GBM_grid_1_AutoML_1_20251011_182338_model_2 0.4091900 0.1674365
## 2 GBM_grid_1_AutoML_1_20251011_182338_model_1 0.4118630 0.1696311
## 3 GBM_1_AutoML_1_20251011_182338 0.4132166 0.1707479
## 4 GBM_grid_1_AutoML_1_20251011_182338_model_3 0.4133137 0.1708282
## 5 GBM_2_AutoML_1_20251011_182338 0.4134107 0.1709084
## 6 DeepLearning_grid_3_AutoML_1_20251011_182338_model_1 0.4142181 0.1715767
## 7 StackedEnsemble_BestOfFamily_1_AutoML_1_20251011_182338 0.4177212 0.1744910
## 8 GBM_4_AutoML_1_20251011_182338 0.4185395 0.1751753
## 9 GBM_3_AutoML_1_20251011_182338 0.4186187 0.1752416
## 10 GBM_grid_1_AutoML_1_20251011_182338_model_4 0.4211605 0.1773762
## 11 GBM_grid_1_AutoML_1_20251011_182338_model_5 0.4222271 0.1782757
## 12 DeepLearning_grid_1_AutoML_1_20251011_182338_model_1 0.4245086 0.1802075
## 13 GLM_1_AutoML_1_20251011_182338 0.4257319 0.1812476
## 14 DeepLearning_grid_2_AutoML_1_20251011_182338_model_1 0.4278609 0.1830650
## 15 DRF_1_AutoML_1_20251011_182338 0.4337441 0.1881339
## 16 DeepLearning_1_AutoML_1_20251011_182338 0.4343779 0.1886842
## 17 XRT_1_AutoML_1_20251011_182338 0.4409525 0.1944391
## 18 DeepLearning_grid_1_AutoML_1_20251011_182338_model_2 0.4459299 0.1988535
## 19 StackedEnsemble_AllModels_1_AutoML_1_20251011_182338 0.4509164 0.2033256
## 20 DeepLearning_grid_3_AutoML_1_20251011_182338_model_2 0.4685734 0.2195610
## 21 GBM_5_AutoML_1_20251011_182338 0.4752991 0.2259092
## 22 DeepLearning_grid_2_AutoML_1_20251011_182338_model_2 0.5343135 0.2854909
## mae rmsle mean_residual_deviance training_time_ms
## 1 0.2412755 0.1458620 0.1674365 80
## 2 0.2373114 0.1467636 0.1696311 133
## 3 0.2452379 0.1476060 0.1707479 131
## 4 0.2478323 0.1477924 0.1708282 57
## 5 0.2411996 0.1468352 0.1709084 221
## 6 0.2447922 0.1477984 0.1715767 12219
## 7 0.2411486 0.1466883 0.1744910 1161
## 8 0.2437177 0.1490390 0.1751753 92
## 9 0.2459783 0.1491281 0.1752416 404
## 10 0.2391034 0.1487703 0.1773762 83
## 11 0.2480273 0.1491586 0.1782757 52
## 12 0.2354055 0.1486766 0.1802075 11498
## 13 0.2566747 0.1515693 0.1812476 18
## 14 0.2768064 0.1559760 0.1830650 13157
## 15 0.2373061 0.1491110 0.1881339 191
## 16 0.2541884 0.1536553 0.1886842 53
## 17 0.2367766 0.1500611 0.1944391 192
## 18 0.2427991 0.1535918 0.1988535 10247
## 19 0.2516250 0.1532961 0.2033256 767
## 20 0.2592057 0.1579481 0.2195610 15294
## 21 0.2519053 0.1545553 0.2259092 55
## 22 0.3149483 0.1774324 0.2854909 15515
## predict_time_per_row_ms algo
## 1 0.018810 GBM
## 2 0.022039 GBM
## 3 0.017213 GBM
## 4 0.013082 GBM
## 5 0.016750 GBM
## 6 0.051639 DeepLearning
## 7 0.094652 StackedEnsemble
## 8 0.022533 GBM
## 9 0.018551 GBM
## 10 0.018444 GBM
## 11 0.011734 GBM
## 12 0.013001 DeepLearning
## 13 0.004852 GLM
## 14 0.028543 DeepLearning
## 15 0.031277 DRF
## 16 0.008921 DeepLearning
## 17 0.016480 DRF
## 18 0.011057 DeepLearning
## 19 0.049441 StackedEnsemble
## 20 0.022150 DeepLearning
## 21 0.011370 GBM
## 22 0.015444 DeepLearning
##
## [22 rows x 9 columns]
# Best model
best_model_reg <- aml_reg@leader
print(best_model_reg)
## Model Details:
## ==============
##
## H2ORegressionModel: gbm
## Model ID: GBM_grid_1_AutoML_1_20251011_182338_model_2
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 30 30 7132 7
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 7 7.00000 9 18 14.30000
##
##
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.1620572
## RMSE: 0.4025633
## MAE: 0.2327796
## RMSLE: 0.1365644
## Mean Residual Deviance : 0.1620572
##
##
##
## H2ORegressionMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.1913423
## RMSE: 0.4374269
## MAE: 0.2554557
## RMSLE: 0.1507954
## Mean Residual Deviance : 0.1913423
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## mae 0.254589 0.014197 0.251779 0.242526 0.274830
## mean_residual_deviance 0.192532 0.073301 0.193840 0.137706 0.159939
## mse 0.192532 0.073301 0.193840 0.137706 0.159939
## r2 0.220505 0.093034 0.186806 0.254176 0.128440
## residual_deviance 0.192532 0.073301 0.193840 0.137706 0.159939
## rmse 0.433284 0.077437 0.440273 0.371087 0.399924
## rmsle 0.151173 0.009947 0.154119 0.146017 0.152461
## cv_4_valid cv_5_valid
## mae 0.241211 0.262599
## mean_residual_deviance 0.152778 0.318397
## mse 0.152778 0.318397
## r2 0.365635 0.167467
## residual_deviance 0.152778 0.318397
## rmse 0.390868 0.564267
## rmsle 0.138246 0.165020
# Evaluate model on test set
perf_test <- h2o.performance(best_model_reg, newdata = test_reg)
print(perf_test)
## H2ORegressionMetrics: gbm
##
## MSE: 0.1674365
## RMSE: 0.40919
## MAE: 0.2412755
## RMSLE: 0.145862
## Mean Residual Deviance : 0.1674365
# Predict on test set
predictions_reg <- h2o.predict(best_model_reg, test_reg)
## | | | 0% | |======================================================================| 100%
print(head(predictions_reg))
## predict
## 1 1.627676
## 2 1.604784
## 3 1.584260
## 4 1.576299
## 5 1.592688
## 6 1.588860
# Compare actual vs predicted
actual_vs_predicted <- h2o.cbind(test_reg[[y_reg]], predictions_reg)
colnames(actual_vs_predicted) <- c("actual_log_pfoa", "predicted_log_pfoa")
print(head(actual_vs_predicted, 10))
## actual_log_pfoa predicted_log_pfoa
## 1 1.5475625 1.627676
## 2 1.6094379 1.604784
## 3 1.5475625 1.584260
## 4 1.6486586 1.576299
## 5 1.6094379 1.592688
## 6 1.6094379 1.588860
## 7 1.6094379 1.591966
## 8 1.0647107 1.151556
## 9 1.0647107 1.148198
## 10 0.8329091 1.252979
# Variable importance (if available)
if ("variable_importances" %in% names(best_model_reg@model)) {
var_imp <- h2o.varimp(best_model_reg)
print(head(var_imp, 10))
} else {
cat("Variable importance is not available for this model type.\n")
}
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 state 135.455292 1.000000 0.366447
## 2 tritium_tu 44.184357 0.326191 0.119532
## 3 median_income 42.411461 0.313103 0.114736
## 4 well_depth 26.667093 0.196870 0.072142
## 5 water_use 22.160738 0.163602 0.059951
## 6 pct_adults 19.215893 0.141862 0.051985
## 7 poverty_rate 14.598371 0.107773 0.039493
## 8 pct_older 12.807136 0.094549 0.034647
## 9 pct_bachelor_plus 11.924089 0.088030 0.032258
## 10 pct_children 10.011117 0.073907 0.027083
#AutoML for pfoa_detected (Binary Classification)
# Load the h2o library
library(h2o)
# Start H2O
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 9 minutes 16 seconds
## H2O cluster timezone: America/Chicago
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 9 months and 21 days
## H2O cluster name: H2O_started_from_R_doris_oru630
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.82 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 9 months and 21 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert your dataframe to an H2OFrame
df_h2o <- as.h2o(df_filtered)
## | | | 0% | |======================================================================| 100%
# Convert to factor (in the H2OFrame)
df_h2o[, "pfoa_detected"] <- as.factor(df_h2o[, "pfoa_detected"])
# Define response and predictors - REMOVING specified variables
variables_to_remove <- c("latitude", "longitude", "tritium_category", "log_pfoa", "pfoa_val", "pfoa_detected", "month")
x_clf <- setdiff(names(df_filtered), variables_to_remove)
y_clf <- "pfoa_detected"
print(paste("Predictors used:", paste(x_clf, collapse = ", ")))
## [1] "Predictors used: state, water_use, tritium_tu, well_depth, median_income, poverty_rate, unemployment_rate, pct_white, pct_black, pct_hispanic, pct_bachelor_plus, pct_children, pct_adults, pct_older, pop_density_per_sq_mile, cancer_rate"
print(paste("Number of predictors:", length(x_clf)))
## [1] "Number of predictors: 16"
# Split the data
splits_clf <- h2o.splitFrame(data = df_h2o, ratios = 0.7, seed = 123)
train_clf <- splits_clf[[1]]
test_clf <- splits_clf[[2]]
# Check class distribution before balancing
print(h2o.table(train_clf[, y_clf]))
## pfoa_detected Count
## 1 0 88
## 2 1 777
##
## [2 rows x 2 columns]
# Run AutoML (Classification) with class balancing
aml_clf <- h2o.automl(x = x_clf,
y = y_clf,
training_frame = train_clf,
leaderboard_frame = test_clf,
max_models = 20,
seed = 1234,
balance_classes = TRUE, # Enable class balancing
max_after_balance_size = 3.0)
## | | | 0% | |== | 3%
## 18:32:55.633: AutoML: XGBoost is not available; skipping it. | |============ | 18% | |=================== | 26% | |=============================== | 44% | |===================================== | 53% | |=========================================== | 62% | |====================================================== | 76% | |======================================================================| 100%
# Show leaderboard with all columns
lb_clf <- h2o.get_leaderboard(aml_clf, extra_columns = "ALL")
print(lb_clf, n = nrow(lb_clf))
## model_id auc logloss
## 1 StackedEnsemble_BestOfFamily_1_AutoML_2_20251011_183255 0.7458782 0.3036463
## 2 GBM_2_AutoML_2_20251011_183255 0.7296290 0.3126845
## 3 GBM_grid_1_AutoML_2_20251011_183255_model_3 0.7140536 0.3106991
## 4 DeepLearning_1_AutoML_2_20251011_183255 0.7120323 0.3206283
## 5 GBM_grid_1_AutoML_2_20251011_183255_model_5 0.7093373 0.3178046
## 6 GBM_3_AutoML_2_20251011_183255 0.7069990 0.3181845
## 7 GBM_5_AutoML_2_20251011_183255 0.7037096 0.3180435
## 8 GBM_1_AutoML_2_20251011_183255 0.6997464 0.3131314
## 9 GBM_grid_1_AutoML_2_20251011_183255_model_4 0.6955850 0.3360300
## 10 GBM_4_AutoML_2_20251011_183255 0.6933656 0.3275421
## 11 GBM_grid_1_AutoML_2_20251011_183255_model_1 0.6932070 0.3301203
## 12 GBM_grid_1_AutoML_2_20251011_183255_model_2 0.6927315 0.3209874
## 13 DRF_1_AutoML_2_20251011_183255 0.6905913 0.4240308
## 14 GLM_1_AutoML_2_20251011_183255 0.6873811 0.3186461
## 15 XRT_1_AutoML_2_20251011_183255 0.6807229 0.3775919
## 16 DeepLearning_grid_2_AutoML_2_20251011_183255_model_1 0.6777108 0.4533509
## 17 DeepLearning_grid_3_AutoML_2_20251011_183255_model_1 0.6722416 0.3948481
## 18 DeepLearning_grid_1_AutoML_2_20251011_183255_model_1 0.6713697 0.5170865
## 19 StackedEnsemble_AllModels_1_AutoML_2_20251011_183255 0.6671687 0.3721049
## 20 DeepLearning_grid_2_AutoML_2_20251011_183255_model_2 0.6630866 0.8983242
## 21 DeepLearning_grid_3_AutoML_2_20251011_183255_model_2 0.6612238 0.6877595
## 22 DeepLearning_grid_1_AutoML_2_20251011_183255_model_2 0.6484623 0.8641396
## aucpr mean_per_class_error rmse mse training_time_ms
## 1 0.9565417 0.4868421 0.2945397 0.08675365 1468
## 2 0.9522173 0.4270767 0.2952365 0.08716458 126
## 3 0.9519246 0.5000000 0.2972149 0.08833672 79
## 4 0.9501891 0.5000000 0.3054933 0.09332617 178
## 5 0.9497658 0.5000000 0.3000993 0.09005960 149
## 6 0.9453798 0.4387286 0.2948400 0.08693061 130
## 7 0.9506859 0.4270767 0.2954323 0.08728023 126
## 8 0.9481461 0.5000000 0.2977178 0.08863589 132
## 9 0.9448205 0.5000000 0.2988058 0.08928489 206
## 10 0.9414377 0.4037730 0.2968646 0.08812859 140
## 11 0.9464142 0.4868421 0.2959362 0.08757824 145
## 12 0.9445046 0.5000000 0.2968006 0.08809061 128
## 13 0.9446372 0.5000000 0.2993358 0.08960192 344
## 14 0.9486930 0.5000000 0.3008084 0.09048568 75
## 15 0.9419124 0.4635384 0.3012470 0.09074976 414
## 16 0.9455646 0.5000000 0.3178608 0.10103551 12241
## 17 0.9443273 0.5000000 0.3129565 0.09794174 14406
## 18 0.9442389 0.5000000 0.3373033 0.11377354 10338
## 19 0.9395425 0.5000000 0.3085790 0.09522099 1075
## 20 0.9421426 0.5000000 0.3756124 0.14108465 6890
## 21 0.9384233 0.5000000 0.3595837 0.12930044 18847
## 22 0.9393107 0.5000000 0.3737734 0.13970656 8669
## predict_time_per_row_ms algo
## 1 0.056464 StackedEnsemble
## 2 0.021168 GBM
## 3 0.017100 GBM
## 4 0.017208 DeepLearning
## 5 0.022432 GBM
## 6 0.023116 GBM
## 7 0.021131 GBM
## 8 0.015421 GBM
## 9 0.022992 GBM
## 10 0.022384 GBM
## 11 0.020750 GBM
## 12 0.018048 GBM
## 13 0.026251 DRF
## 14 0.010720 GLM
## 15 0.027443 DRF
## 16 0.034029 DeepLearning
## 17 0.042069 DeepLearning
## 18 0.022774 DeepLearning
## 19 0.054624 StackedEnsemble
## 20 0.026879 DeepLearning
## 21 0.036121 DeepLearning
## 22 0.028200 DeepLearning
##
## [22 rows x 10 columns]
# Best classification model
best_model_clf <- aml_clf@leader
print(best_model_clf)
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_1_AutoML_2_20251011_183255
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 4/5
## 3 # GBM base models (used / total) 1/1
## 4 # DeepLearning base models (used / total) 1/1
## 5 # DRF base models (used / total) 1/2
## 6 # GLM base models (used / total) 1/1
## 7 Metalearner algorithm GLM
## 8 Metalearner fold assignment scheme Random
## 9 Metalearner nfolds 5
## 10 Metalearner fold_column NA
## 11 Custom metalearner hyperparameters None
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.0719238
## RMSE: 0.2681861
## LogLoss: 0.2499077
## Mean Per-Class Error: 0.3510954
## AUC: 0.913632
## AUCPR: 0.989563
## Gini: 0.827264
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 27 61 0.693182 =61/88
## 1 7 770 0.009009 =7/777
## Totals 34 831 0.078613 =68/865
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.696123 0.957711 370
## 2 max f2 0.669396 0.979251 382
## 3 max f0point5 0.820134 0.948207 301
## 4 max accuracy 0.696123 0.921387 370
## 5 max precision 0.983946 1.000000 0
## 6 max recall 0.573735 1.000000 395
## 7 max specificity 0.983946 1.000000 0
## 8 max absolute_mcc 0.790070 0.481104 321
## 9 max min_per_class_accuracy 0.883536 0.829545 243
## 10 max mean_per_class_accuracy 0.898598 0.858978 214
## 11 max tns 0.983946 88.000000 0
## 12 max fns 0.983946 775.000000 0
## 13 max fps 0.551082 88.000000 399
## 14 max tps 0.573735 777.000000 395
## 15 max tnr 0.983946 1.000000 0
## 16 max fnr 0.983946 0.997426 0
## 17 max fpr 0.551082 1.000000 399
## 18 max tpr 0.573735 1.000000 395
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.08593275
## RMSE: 0.2931429
## LogLoss: 0.3140038
## Mean Per-Class Error: 0.5
## AUC: 0.6706666
## AUCPR: 0.9259375
## Gini: 0.3413332
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 0 88 1.000000 =88/88
## 1 0 777 0.000000 =0/777
## Totals 0 865 0.101734 =88/865
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.295440 0.946407 399
## 2 max f2 0.295440 0.977850 399
## 3 max f0point5 0.820032 0.927472 311
## 4 max accuracy 0.295440 0.898266 399
## 5 max precision 0.953846 0.942308 58
## 6 max recall 0.295440 1.000000 399
## 7 max specificity 0.999134 0.988636 0
## 8 max absolute_mcc 0.820032 0.294436 311
## 9 max min_per_class_accuracy 0.908920 0.613636 185
## 10 max mean_per_class_accuracy 0.880175 0.676955 253
## 11 max tns 0.999134 87.000000 0
## 12 max fns 0.999134 777.000000 0
## 13 max fps 0.631417 88.000000 393
## 14 max tps 0.295440 777.000000 399
## 15 max tnr 0.999134 0.988636 0
## 16 max fnr 0.999134 1.000000 0
## 17 max fpr 0.631417 1.000000 393
## 18 max tpr 0.295440 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 0.904066 0.024688 0.898876 0.947674 0.891429 0.888235
## auc 0.699835 0.110445 0.760443 0.852761 0.670041 0.652492
## err 0.095934 0.024688 0.101124 0.052326 0.108571 0.111765
## err_count 16.600000 4.277850 18.000000 9.000000 19.000000 19.000000
## f0point5 0.924564 0.020414 0.930657 0.957697 0.911215 0.908544
## cv_5_valid
## accuracy 0.894118
## auc 0.563439
## err 0.105882
## err_count 18.000000
## f0point5 0.914709
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## precision 0.909199 0.025205 0.921687 0.947674 0.891429
## r2 0.051139 0.040307 0.103258 0.069426 0.043097
## recall 0.992329 0.013713 0.968354 1.000000 1.000000
## residual_deviance 105.856520 23.547125 110.006320 65.839100 112.519400
## rmse 0.291197 0.044320 0.299059 0.214814 0.304323
## specificity 0.108095 0.158397 0.350000 0.000000 0.000000
## cv_4_valid cv_5_valid
## precision 0.888235 0.896970
## r2 0.046777 -0.006863
## recall 1.000000 0.993289
## residual_deviance 112.446350 128.471420
## rmse 0.307619 0.330171
## specificity 0.000000 0.190476
# Get model performance on test set
perf_test <- h2o.performance(best_model_clf, test_clf)
print(perf_test)
## H2OBinomialMetrics: stackedensemble
##
## MSE: 0.08675365
## RMSE: 0.2945397
## LogLoss: 0.3036463
## Mean Per-Class Error: 0.4868421
## AUC: 0.7458782
## AUCPR: 0.9565417
## Gini: 0.4917565
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 1 37 0.973684 =37/38
## 1 0 332 0.000000 =0/332
## Totals 1 369 0.100000 =37/370
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.612512 0.947218 365
## 2 max f2 0.612512 0.978197 365
## 3 max f0point5 0.698370 0.921946 355
## 4 max accuracy 0.612512 0.900000 365
## 5 max precision 0.980705 1.000000 0
## 6 max recall 0.612512 1.000000 365
## 7 max specificity 0.980705 1.000000 0
## 8 max absolute_mcc 0.905335 0.287909 245
## 9 max min_per_class_accuracy 0.905335 0.710843 245
## 10 max mean_per_class_accuracy 0.905335 0.723843 245
## 11 max tns 0.980705 38.000000 0
## 12 max fns 0.980705 331.000000 0
## 13 max fps 0.600712 38.000000 366
## 14 max tps 0.612512 332.000000 365
## 15 max tnr 0.980705 1.000000 0
## 16 max fnr 0.980705 0.996988 0
## 17 max fpr 0.600712 1.000000 366
## 18 max tpr 0.612512 1.000000 365
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Generate confusion matrix
cm <- h2o.confusionMatrix(perf_test)
print(cm)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.61251175891771:
## 0 1 Error Rate
## 0 1 37 0.973684 =37/38
## 1 0 332 0.000000 =0/332
## Totals 1 369 0.100000 =37/370
# Make predictions on test set
predictions_clf <- h2o.predict(best_model_clf, test_clf)
## | | | 0% | |======================================================================| 100%
print(head(predictions_clf))
## predict p0 p1
## 1 1 0.05208713 0.9479129
## 2 1 0.05152928 0.9484707
## 3 1 0.05041690 0.9495831
## 4 1 0.04322418 0.9567758
## 5 1 0.05718886 0.9428111
## 6 1 0.08632103 0.9136790
# Create a proper confusion matrix using R's table function
# Convert actual and predicted values to vectors
actual <- as.vector(test_clf[, "pfoa_detected"])
predicted <- as.vector(h2o.predict(best_model_clf, test_clf)$predict)
## | | | 0% | |======================================================================| 100%
# Convert to factors with the same levels
actual <- factor(as.character(actual))
predicted <- factor(as.character(predicted), levels = levels(actual))
# Create confusion matrix
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
conf_matrix <- table(Predicted = predicted, Actual = actual)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 0 0
## 1 38 332
TP <- as.numeric(conf_matrix["1", "1"])
TN <- as.numeric(conf_matrix["0", "0"])
FP <- as.numeric(conf_matrix["1", "0"])
FN <- as.numeric(conf_matrix["0", "1"])
# Calculate all metrics
total <- sum(conf_matrix)
accuracy <- (TP + TN) / total
sensitivity <- TP / (TP + FN) # Recall or True Positive Rate
specificity <- TN / (TN + FP) # True Negative Rate
precision <- TP / (TP + FP) # Positive Predictive Value
npv <- TN / (TN + FN) # Negative Predictive Value
f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)
error_rate <- 1 - accuracy
false_positive_rate <- FP / (FP + TN)
false_negative_rate <- FN / (TP + FN)
# Get AUC from H2O
perf_test <- h2o.performance(best_model_clf, test_clf)
auc_value <- h2o.auc(perf_test)
# Print detailed metrics
cat("\n=== DETAILED CLASSIFICATION METRICS ===\n")
##
## === DETAILED CLASSIFICATION METRICS ===
cat("True Positives (TP) :", TP, "\n")
## True Positives (TP) : 332
cat("True Negatives (TN) :", TN, "\n")
## True Negatives (TN) : 0
cat("False Positives (FP):", FP, "\n")
## False Positives (FP): 38
cat("False Negatives (FN):", FN, "\n")
## False Negatives (FN): 0
cat("Total Samples :", total, "\n")
## Total Samples : 370
cat("\n=== PERFORMANCE METRICS ===\n")
##
## === PERFORMANCE METRICS ===
cat("Accuracy :", round(accuracy, 4), "\n")
## Accuracy : 0.8973
cat("Sensitivity (Recall):", round(sensitivity, 4), "\n")
## Sensitivity (Recall): 1
cat("Specificity :", round(specificity, 4), "\n")
## Specificity : 0
cat("Precision (PPV) :", round(precision, 4), "\n")
## Precision (PPV) : 0.8973
cat("Negative Pred Value :", round(npv, 4), "\n")
## Negative Pred Value : NaN
cat("F1 Score :", round(f1_score, 4), "\n")
## F1 Score : 0.9459
cat("Error Rate :", round(error_rate, 4), "\n")
## Error Rate : 0.1027
cat("False Positive Rate :", round(false_positive_rate, 4), "\n")
## False Positive Rate : 1
cat("False Negative Rate :", round(false_negative_rate, 4), "\n")
## False Negative Rate : 0
cat("AUC :", round(auc_value, 4), "\n")
## AUC : 0.7459
# Summary table
metrics_df <- data.frame(
Metric = c("Accuracy", "Sensitivity", "Specificity", "Precision",
"NPV", "F1-Score", "Error Rate", "AUC"),
Value = c(round(accuracy, 4), round(sensitivity, 4), round(specificity, 4),
round(precision, 4), round(npv, 4), round(f1_score, 4),
round(error_rate, 4), round(auc_value, 4))
)
print(metrics_df)
## Metric Value
## 1 Accuracy 0.8973
## 2 Sensitivity 1.0000
## 3 Specificity 0.0000
## 4 Precision 0.8973
## 5 NPV NaN
## 6 F1-Score 0.9459
## 7 Error Rate 0.1027
## 8 AUC 0.7459