This analysis explores EDDMapS (Early Detection & Distribution Mapping System) data as part of developing an automated invasive species risk assessment system using knowledge graphs.
Research Goal: Extract spatial-temporal patterns, species traits, and invasion dynamics from occurrence data to feed into a knowledge graph that predicts species at risk of invasion.
Key Questions: - What species are most frequently reported? - Where are invasion hotspots? - How do invasions spread over time? - What patterns emerge that could inform risk assessment?
library(readr)
library(dplyr)
library(stringr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(scales)
library(viridis)
library(knitr)
library(sf)
library(maps)
# Set theme
theme_set(theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 11, color = "gray30"),
legend.position = "bottom"))
# Load EDDMapS data
# UPDATE THIS PATH to match your file location
eddmaps <- read_csv("C:/Users/annasokol/Downloads/EDDMapSPoints_view_6818701269405066160.csv",
show_col_types = FALSE)
cat("✓ Data loaded!\n")
✓ Data loaded!
cat("Dimensions:", nrow(eddmaps), "rows ×", ncol(eddmaps), "columns\n")
Dimensions: 5630454 rows × 18 columns
# Show column names and types
glimpse(eddmaps)
Rows: 5,630,454
Columns: 18
$ TableID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, …
$ Reporter_First <chr> "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "…
$ Reporter_Last <chr> "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "…
$ State_Name <chr> "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georg…
$ County_Name <chr> "Tift County", "Tift County", "Montgomery County", "Berrien County", "Berrien County", "Tift County", "Tift County", "Berrien C…
$ ObservationDate <chr> "4/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "3/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "4…
$ Date_Updated <chr> "4/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "3/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "4…
$ SUB_ID <dbl> 2425, 3035, 3045, 2779, 2779, 3035, 3039, 3035, 3009, 3039, 3039, 2425, 3049, 3035, 3083, 2425, 3049, 3035, 3039, 3039, 3035, 3…
$ SUB_name <chr> "kudzu", "Chinese privet", "Japanese climbing fern", "alligatorweed", "alligatorweed", "Chinese privet", "Japanese honeysuckle"…
$ SUB_genus <chr> "Pueraria", "Ligustrum", "Lygodium", "Alternanthera", "Alternanthera", "Ligustrum", "Lonicera", "Ligustrum", "Arundo", "Lonicer…
$ SUB_species <chr> "montana", "sinense", "japonicum", "philoxeroides", "philoxeroides", "sinense", "japonica", "sinense", "donax", "japonica", "ja…
$ SUB_variety <chr> "var. lobata", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "var. lobata", NA, NA, NA, "var. lobata", NA, NA, NA, NA, NA, NA, NA, NA…
$ Scientific_Name <chr> "Pueraria montana var. lobata", "Ligustrum sinense", "Lygodium japonicum", "Alternanthera philoxeroides", "Alternanthera philox…
$ year <dbl> 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2…
$ StateFIPS <dbl> 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,…
$ InfestedAreaAcres <dbl> 3.000000000, 0.000344352, 3.000000000, 0.009182720, 0.004591360, NA, 0.000344352, 0.000229568, NA, 0.000688704, 0.045913600, 1.…
$ x <dbl> 3155857, 3155857, 3224207, 3172772, 3173432, 3153715, 3153715, 3172772, 3152477, 3152477, 3157085, 3157411, 3155444, 3149938, 3…
$ y <dbl> 3838672, 3838672, 3931809, 3828005, 3826923, 3838166, 3838166, 3828005, 3836609, 3836609, 3832718, 3832783, 3838497, 3834044, 3…
head(eddmaps, 20)
# Understand what each column contains
data.frame(
Column = names(eddmaps),
Type = sapply(eddmaps, class),
Sample_Value = sapply(eddmaps, function(x) as.character(x[1])),
Non_Missing = sapply(eddmaps, function(x) sum(!is.na(x))),
Percent_Complete = round(sapply(eddmaps, function(x) sum(!is.na(x))/length(x)*100), 1)
) %>%
arrange(desc(Percent_Complete))
# Clean and prepare data
eddmaps_clean <- eddmaps %>%
# Parse dates properly
mutate(
ObservationDate = as.Date(ObservationDate, format = "%m/%d/%Y"),
Date_Updated = as.Date(Date_Updated, format = "%m/%d/%Y"),
year = year(ObservationDate),
month = month(ObservationDate, label = TRUE),
# Handle infested area
InfestedAreaAcres = as.numeric(InfestedAreaAcres),
# Clean species names
Scientific_Name = str_trim(Scientific_Name),
SUB_name = str_trim(SUB_name)
) %>%
# Remove records without dates or locations
filter(!is.na(ObservationDate), !is.na(x), !is.na(y))
cat("✓ Data cleaned!\n")
cat("Records after cleaning:", nrow(eddmaps_clean), "\n")
cat("Date range:", min(eddmaps_clean$ObservationDate, na.rm = TRUE), "to",
max(eddmaps_clean$ObservationDate, na.rm = TRUE), "\n")
n_species <- length(unique(eddmaps_clean$Scientific_Name))
n_genera <- length(unique(eddmaps_clean$SUB_genus))
n_families <- length(unique(eddmaps_clean$SUB_name))
cat("Total unique species:", n_species, "\n")
cat("Total genera:", n_genera, "\n")
cat("Total common names:", n_families, "\n")
top_species <- eddmaps_clean %>%
count(Scientific_Name, SUB_name, sort = TRUE) %>%
head(20)
top_species
eddmaps_clean %>%
count(Scientific_Name, SUB_name) %>%
arrange(desc(n)) %>%
head(20) %>%
mutate(
label = paste0(SUB_name, "\n(", Scientific_Name, ")"),
label = str_wrap(label, width = 30)
) %>%
ggplot(aes(x = reorder(label, n), y = n, fill = n)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = comma(n)), hjust = -0.2, size = 3.5) +
coord_flip() +
scale_fill_viridis_c(option = "plasma", direction = -1) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Top 20 Most Reported Invasive Species in EDDMapS",
subtitle = "Number of occurrence records",
x = NULL,
y = "Number of Reports"
) +
theme(panel.grid.major.y = element_blank())
state_summary <- eddmaps_clean %>%
count(State_Name, name = "reports") %>%
arrange(desc(reports))
state_summary %>%
head(20)
state_summary %>%
head(20) %>%
ggplot(aes(x = reorder(State_Name, reports), y = reports, fill = reports)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = comma(reports)), hjust = -0.2, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "#3B9AB2", high = "#E1AF00") +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Invasive Species Reports by State",
subtitle = "Top 20 states with most EDDMapS reports",
x = NULL,
y = "Number of Reports"
) +
theme(panel.grid.major.y = element_blank())
# Get US states map
us_states <- map_data("state")
# Create hexbin summary
eddmaps_map <- eddmaps_clean %>%
filter(!is.na(x), !is.na(y))
# Plot all occurrence points
ggplot() +
geom_polygon(data = us_states,
aes(x = long, y = lat, group = group),
fill = "white", color = "gray60", size = 0.3) +
geom_point(data = eddmaps_map,
aes(x = x, y = y),
color = "#D55E00", alpha = 0.3, size = 0.8) +
coord_map() +
labs(
title = "Geographic Distribution of Invasive Species Reports",
subtitle = paste("All", nrow(eddmaps_map), "occurrence records from EDDMapS"),
x = "Longitude",
y = "Latitude"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 11)
)
temporal_summary <- eddmaps_clean %>%
count(year) %>%
arrange(year)
temporal_summary
eddmaps_clean %>%
count(year) %>%
ggplot(aes(x = year, y = n)) +
geom_line(color = "#0072B2", size = 1.2) +
geom_point(color = "#D55E00", size = 3) +
scale_y_continuous(labels = comma) +
scale_x_continuous(breaks = seq(min(eddmaps_clean$year, na.rm = TRUE),
max(eddmaps_clean$year, na.rm = TRUE),
by = 2)) +
labs(
title = "Invasive Species Reports Over Time",
subtitle = "Annual trend in EDDMapS submissions",
x = "Year",
y = "Number of Reports"
)
eddmaps_clean %>%
count(month) %>%
ggplot(aes(x = month, y = n, fill = n)) +
geom_col(show.legend = FALSE) +
scale_fill_viridis_c(option = "magma", direction = -1) +
scale_y_continuous(labels = comma) +
labs(
title = "Seasonal Pattern of Invasive Species Reports",
subtitle = "When are most observations submitted?",
x = "Month",
y = "Number of Reports"
)
area_data <- eddmaps_clean %>%
filter(!is.na(InfestedAreaAcres), InfestedAreaAcres > 0)
cat("Records with area data:", nrow(area_data),
"out of", nrow(eddmaps_clean),
"(", round(100*nrow(area_data)/nrow(eddmaps_clean), 1), "%)\n")
if(nrow(area_data) > 0) {
area_summary <- area_data %>%
summarize(
min_acres = min(InfestedAreaAcres, na.rm = TRUE),
median_acres = median(InfestedAreaAcres, na.rm = TRUE),
mean_acres = mean(InfestedAreaAcres, na.rm = TRUE),
max_acres = max(InfestedAreaAcres, na.rm = TRUE),
total_acres = sum(InfestedAreaAcres, na.rm = TRUE)
)
area_summary
if(nrow(area_data) > 0) {
area_data %>%
filter(InfestedAreaAcres < quantile(InfestedAreaAcres, 0.95, na.rm = TRUE)) %>%
ggplot(aes(x = InfestedAreaAcres)) +
geom_histogram(bins = 50, fill = "#E69F00", color = "white") +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(
title = "Distribution of Infested Area Sizes",
subtitle = "Most reports (95th percentile shown to reduce outlier effect)",
x = "Infested Area (Acres)",
y = "Number of Reports"
)
}
if(nrow(area_data) > 0) {
area_data %>%
group_by(Scientific_Name, SUB_name) %>%
summarize(
total_acres = sum(InfestedAreaAcres, na.rm = TRUE),
max_single = max(InfestedAreaAcres, na.rm = TRUE),
n_reports = n(),
.groups = "drop"
) %>%
arrange(desc(total_acres)) %>%
head(15)
}
Let’s analyze the most reported species in detail:
# Get most reported species
focus_species_name <- top_species$Scientific_Name[1]
focus_common_name <- top_species$SUB_name[1]
cat("Analyzing:", focus_species_name, "(", focus_common_name, ")\n")
species_data <- eddmaps_clean %>%
filter(Scientific_Name == focus_species_name)
cat("Total reports:", nrow(species_data), "\n")
ggplot() +
geom_polygon(data = us_states,
aes(x = long, y = lat, group = group),
fill = "white", color = "gray60", size = 0.3) +
geom_point(data = species_data,
aes(x = x, y = y, color = year),
alpha = 0.6, size = 2) +
scale_color_viridis_c(option = "plasma") +
coord_map() +
labs(
title = paste("Distribution of", focus_common_name),
subtitle = paste(focus_species_name, "- All EDDMapS reports"),
x = "Longitude",
y = "Latitude",
color = "Year"
) +
theme_minimal()
top5_species <- top_species$Scientific_Name[1:5]
comparison_data <- eddmaps_clean %>%
filter(Scientific_Name %in% top5_species) %>%
count(year, Scientific_Name, SUB_name)
comparison_data %>%
ggplot(aes(x = year, y = n, color = SUB_name, group = SUB_name)) +
geom_line(size = 1.2) +
geom_point(size = 2.5) +
scale_color_viridis_d(option = "turbo") +
scale_y_continuous(labels = comma) +
labs(
title = "Temporal Trends: Top 5 Most Reported Species",
subtitle = "How have report frequencies changed over time?",
x = "Year",
y = "Number of Reports",
color = "Species"
)
eddmaps_clean %>%
filter(Scientific_Name %in% top5_species) %>%
count(State_Name, SUB_name) %>%
group_by(SUB_name) %>%
top_n(10, n) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(State_Name, n, SUB_name), y = n, fill = SUB_name)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_x_reordered() +
scale_fill_viridis_d(option = "turbo") +
facet_wrap(~SUB_name, scales = "free_y", ncol = 2) +
labs(
title = "Top States for Each of the 5 Most Reported Species",
x = NULL,
y = "Number of Reports"
) +
theme(strip.text = element_text(face = "bold"))
quality_metrics <- data.frame(
Field = c("Scientific Name", "Common Name", "Date", "Location (x,y)",
"State", "County", "Infested Area", "Genus", "Species"),
Records_With_Data = c(
sum(!is.na(eddmaps$Scientific_Name) & eddmaps$Scientific_Name != ""),
sum(!is.na(eddmaps$SUB_name) & eddmaps$SUB_name != ""),
sum(!is.na(eddmaps$ObservationDate)),
sum(!is.na(eddmaps$x) & !is.na(eddmaps$y)),
sum(!is.na(eddmaps$State_Name)),
sum(!is.na(eddmaps$County_Name)),
sum(!is.na(eddmaps$InfestedAreaAcres) & eddmaps$InfestedAreaAcres > 0),
sum(!is.na(eddmaps$SUB_genus)),
sum(!is.na(eddmaps$SUB_species))
)
) %>%
mutate(
Percent_Complete = round(100 * Records_With_Data / nrow(eddmaps), 1),
Quality_Rating = case_when(
Percent_Complete >= 95 ~ "Excellent",
Percent_Complete >= 80 ~ "Good",
Percent_Complete >= 50 ~ "Fair",
TRUE ~ "Poor"
)
)
quality_metrics %>%
arrange(desc(Percent_Complete))
eddmaps_clean %>%
mutate(
has_area = !is.na(InfestedAreaAcres) & InfestedAreaAcres > 0,
has_county = !is.na(County_Name)
) %>%
group_by(year) %>%
summarize(
total = n(),
with_area = sum(has_area),
with_county = sum(has_county),
.groups = "drop"
) %>%
pivot_longer(cols = c(with_area, with_county),
names_to = "metric", values_to = "count") %>%
mutate(
percent = 100 * count / total,
metric = recode(metric,
"with_area" = "Has Area Data",
"with_county" = "Has County Info")
) %>%
ggplot(aes(x = year, y = percent, color = metric, group = metric)) +
geom_line(size = 1.2) +
geom_point(size = 2.5) +
scale_color_brewer(palette = "Set1") +
labs(
title = "Data Quality Trends Over Time",
subtitle = "Percentage of records with additional information",
x = "Year",
y = "Percent of Records (%)",
color = "Data Type"
)
Based on this exploration, here are key patterns that could feed into your knowledge graph:
kg_metrics <- eddmaps_clean %>%
group_by(Scientific_Name, SUB_name) %>%
summarize(
n_reports = n(),
n_states = n_distinct(State_Name),
n_counties = n_distinct(County_Name),
years_active = max(year, na.rm = TRUE) - min(year, na.rm = TRUE) + 1,
first_year = min(year, na.rm = TRUE),
latest_year = max(year, na.rm = TRUE),
total_acres = sum(InfestedAreaAcres, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
spread_rate = n_states / pmax(years_active, 1),
geographic_breadth = n_states,
reporting_intensity = n_reports / pmax(years_active, 1)
) %>%
arrange(desc(spread_rate))
kg_metrics %>%
head(20) %>%
select(SUB_name, Scientific_Name, n_states, years_active, spread_rate,
reporting_intensity, total_acres)
Species with rapid spread and broad geographic range:
kg_metrics %>%
filter(n_reports >= 10, years_active >= 3) %>%
ggplot(aes(x = spread_rate, y = n_reports,
size = total_acres, color = years_active)) +
geom_point(alpha = 0.6) +
scale_color_viridis_c(option = "plasma") +
scale_size_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(
title = "High-Risk Species Profile",
subtitle = "Spread rate vs. detection frequency (bubble size = infested area)",
x = "Spread Rate (states/year)",
y = "Number of Reports",
size = "Total Acres",
color = "Years Active"
) +
theme(legend.position = "right")
variables_table <- data.frame(
Category = c("Species Identity", "Spatial", "Temporal", "Extent",
"Traits (to link)", "Risk Factors"),
Variables = c(
"Scientific name, genus, family, common name",
"x, y coordinates, state, county, region",
"First record, latest record, year, season",
"Infested acres, number of sites, geographic spread",
"Fecundity, dispersal, tolerance, pathways (from other sources)",
"Spread rate, persistence, area invaded, detection lag"
)
)
variables_table
cat("\n=== EDDMAPS DATA SUMMARY ===\n\n")
cat("Total Records:", nrow(eddmaps_clean), "\n")
cat("Unique Species:", n_species, "\n")
cat("States Covered:", n_distinct(eddmaps_clean$State_Name), "\n")
cat("Counties Covered:", n_distinct(eddmaps_clean$County_Name), "\n")
cat("Date Range:", format(min(eddmaps_clean$ObservationDate), "%Y-%m-%d"), "to",
format(max(eddmaps_clean$ObservationDate), "%Y-%m-%d"), "\n")
cat("Years of Data:", max(eddmaps_clean$year) - min(eddmaps_clean$year) + 1, "\n\n")
cat("Data Completeness:\n")
cat(" - With area data:",
round(100*sum(!is.na(eddmaps_clean$InfestedAreaAcres))/nrow(eddmaps_clean), 1), "%\n")
cat(" - With county info:",
round(100*sum(!is.na(eddmaps_clean$County_Name))/nrow(eddmaps_clean), 1), "%\n")
cat(" - With coordinates:",
round(100*sum(!is.na(eddmaps_clean$x) & !is.na(eddmaps_clean$y))/nrow(eddmaps_clean), 1), "%\n")