Project Context

This analysis explores EDDMapS (Early Detection & Distribution Mapping System) data as part of developing an automated invasive species risk assessment system using knowledge graphs.

Research Goal: Extract spatial-temporal patterns, species traits, and invasion dynamics from occurrence data to feed into a knowledge graph that predicts species at risk of invasion.

Key Questions: - What species are most frequently reported? - Where are invasion hotspots? - How do invasions spread over time? - What patterns emerge that could inform risk assessment?

Load Packages

library(readr)
library(dplyr)
library(stringr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(scales)
library(viridis)
library(knitr)
library(sf)
library(maps)

# Set theme
theme_set(theme_minimal(base_size = 12) +
            theme(plot.title = element_text(face = "bold", size = 14),
                  plot.subtitle = element_text(size = 11, color = "gray30"),
                  legend.position = "bottom"))

1. Load and Inspect Data

# Load EDDMapS data
# UPDATE THIS PATH to match your file location
eddmaps <- read_csv("C:/Users/annasokol/Downloads/EDDMapSPoints_view_6818701269405066160.csv",
                    show_col_types = FALSE)

cat("✓ Data loaded!\n")
✓ Data loaded!
cat("Dimensions:", nrow(eddmaps), "rows ×", ncol(eddmaps), "columns\n")
Dimensions: 5630454 rows × 18 columns

Data Structure

# Show column names and types
glimpse(eddmaps)
Rows: 5,630,454
Columns: 18
$ TableID           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, …
$ Reporter_First    <chr> "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "Chris", "…
$ Reporter_Last     <chr> "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "Evans", "…
$ State_Name        <chr> "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georgia", "Georg…
$ County_Name       <chr> "Tift County", "Tift County", "Montgomery County", "Berrien County", "Berrien County", "Tift County", "Tift County", "Berrien C…
$ ObservationDate   <chr> "4/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "3/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "4…
$ Date_Updated      <chr> "4/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "3/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "4/17/2006 12:00:00 AM", "4…
$ SUB_ID            <dbl> 2425, 3035, 3045, 2779, 2779, 3035, 3039, 3035, 3009, 3039, 3039, 2425, 3049, 3035, 3083, 2425, 3049, 3035, 3039, 3039, 3035, 3…
$ SUB_name          <chr> "kudzu", "Chinese privet", "Japanese climbing fern", "alligatorweed", "alligatorweed", "Chinese privet", "Japanese honeysuckle"…
$ SUB_genus         <chr> "Pueraria", "Ligustrum", "Lygodium", "Alternanthera", "Alternanthera", "Ligustrum", "Lonicera", "Ligustrum", "Arundo", "Lonicer…
$ SUB_species       <chr> "montana", "sinense", "japonicum", "philoxeroides", "philoxeroides", "sinense", "japonica", "sinense", "donax", "japonica", "ja…
$ SUB_variety       <chr> "var. lobata", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "var. lobata", NA, NA, NA, "var. lobata", NA, NA, NA, NA, NA, NA, NA, NA…
$ Scientific_Name   <chr> "Pueraria montana var. lobata", "Ligustrum sinense", "Lygodium japonicum", "Alternanthera philoxeroides", "Alternanthera philox…
$ year              <dbl> 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2…
$ StateFIPS         <dbl> 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,…
$ InfestedAreaAcres <dbl> 3.000000000, 0.000344352, 3.000000000, 0.009182720, 0.004591360, NA, 0.000344352, 0.000229568, NA, 0.000688704, 0.045913600, 1.…
$ x                 <dbl> 3155857, 3155857, 3224207, 3172772, 3173432, 3153715, 3153715, 3172772, 3152477, 3152477, 3157085, 3157411, 3155444, 3149938, 3…
$ y                 <dbl> 3838672, 3838672, 3931809, 3828005, 3826923, 3838166, 3838166, 3828005, 3836609, 3836609, 3832718, 3832783, 3838497, 3834044, 3…

First Look at the Data

head(eddmaps, 20)

Column Descriptions

# Understand what each column contains
data.frame(
  Column = names(eddmaps),
  Type = sapply(eddmaps, class),
  Sample_Value = sapply(eddmaps, function(x) as.character(x[1])),
  Non_Missing = sapply(eddmaps, function(x) sum(!is.na(x))),
  Percent_Complete = round(sapply(eddmaps, function(x) sum(!is.na(x))/length(x)*100), 1)
) %>%
  arrange(desc(Percent_Complete))

2. Data Cleaning & Preparation

# Clean and prepare data
eddmaps_clean <- eddmaps %>%
  # Parse dates properly
  mutate(
    ObservationDate = as.Date(ObservationDate, format = "%m/%d/%Y"),
    Date_Updated = as.Date(Date_Updated, format = "%m/%d/%Y"),
    year = year(ObservationDate),
    month = month(ObservationDate, label = TRUE),
    # Handle infested area
    InfestedAreaAcres = as.numeric(InfestedAreaAcres),
    # Clean species names
    Scientific_Name = str_trim(Scientific_Name),
    SUB_name = str_trim(SUB_name)
  ) %>%
  # Remove records without dates or locations
  filter(!is.na(ObservationDate), !is.na(x), !is.na(y))

cat("✓ Data cleaned!\n")
cat("Records after cleaning:", nrow(eddmaps_clean), "\n")
cat("Date range:", min(eddmaps_clean$ObservationDate, na.rm = TRUE), "to", 
    max(eddmaps_clean$ObservationDate, na.rm = TRUE), "\n")

3. Species Overview

How Many Species?

n_species <- length(unique(eddmaps_clean$Scientific_Name))
n_genera <- length(unique(eddmaps_clean$SUB_genus))
n_families <- length(unique(eddmaps_clean$SUB_name))

cat("Total unique species:", n_species, "\n")
cat("Total genera:", n_genera, "\n")
cat("Total common names:", n_families, "\n")

Top 20 Most Reported Species

top_species <- eddmaps_clean %>%
  count(Scientific_Name, SUB_name, sort = TRUE) %>%
  head(20)

top_species
eddmaps_clean %>%
  count(Scientific_Name, SUB_name) %>%
  arrange(desc(n)) %>%
  head(20) %>%
  mutate(
    label = paste0(SUB_name, "\n(", Scientific_Name, ")"),
    label = str_wrap(label, width = 30)
  ) %>%
  ggplot(aes(x = reorder(label, n), y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = comma(n)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_fill_viridis_c(option = "plasma", direction = -1) +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Top 20 Most Reported Invasive Species in EDDMapS",
    subtitle = "Number of occurrence records",
    x = NULL,
    y = "Number of Reports"
  ) +
  theme(panel.grid.major.y = element_blank())

4. Geographic Distribution

Reports by State

state_summary <- eddmaps_clean %>%
  count(State_Name, name = "reports") %>%
  arrange(desc(reports))

state_summary %>%
  head(20) 
state_summary %>%
  head(20) %>%
  ggplot(aes(x = reorder(State_Name, reports), y = reports, fill = reports)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = comma(reports)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "#3B9AB2", high = "#E1AF00") +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Invasive Species Reports by State",
    subtitle = "Top 20 states with most EDDMapS reports",
    x = NULL,
    y = "Number of Reports"
  ) +
  theme(panel.grid.major.y = element_blank())

Geographic Hotspots Map

# Get US states map
us_states <- map_data("state")

# Create hexbin summary
eddmaps_map <- eddmaps_clean %>%
  filter(!is.na(x), !is.na(y))

# Plot all occurrence points
ggplot() +
  geom_polygon(data = us_states, 
               aes(x = long, y = lat, group = group),
               fill = "white", color = "gray60", size = 0.3) +
  geom_point(data = eddmaps_map, 
             aes(x = x, y = y), 
             color = "#D55E00", alpha = 0.3, size = 0.8) +
  coord_map() +
  labs(
    title = "Geographic Distribution of Invasive Species Reports",
    subtitle = paste("All", nrow(eddmaps_map), "occurrence records from EDDMapS"),
    x = "Longitude",
    y = "Latitude"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11)
  )

5. Temporal Patterns

Reports Over Time

temporal_summary <- eddmaps_clean %>%
  count(year) %>%
  arrange(year)

temporal_summary
eddmaps_clean %>%
  count(year) %>%
  ggplot(aes(x = year, y = n)) +
  geom_line(color = "#0072B2", size = 1.2) +
  geom_point(color = "#D55E00", size = 3) +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(breaks = seq(min(eddmaps_clean$year, na.rm = TRUE),
                                  max(eddmaps_clean$year, na.rm = TRUE), 
                                  by = 2)) +
  labs(
    title = "Invasive Species Reports Over Time",
    subtitle = "Annual trend in EDDMapS submissions",
    x = "Year",
    y = "Number of Reports"
  )

Seasonal Patterns

eddmaps_clean %>%
  count(month) %>%
  ggplot(aes(x = month, y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_c(option = "magma", direction = -1) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Seasonal Pattern of Invasive Species Reports",
    subtitle = "When are most observations submitted?",
    x = "Month",
    y = "Number of Reports"
  )

6. Invasion Extent Analysis

Infested Area Distribution

area_data <- eddmaps_clean %>%
  filter(!is.na(InfestedAreaAcres), InfestedAreaAcres > 0)

cat("Records with area data:", nrow(area_data), 
    "out of", nrow(eddmaps_clean), 
    "(", round(100*nrow(area_data)/nrow(eddmaps_clean), 1), "%)\n")

if(nrow(area_data) > 0) {
  area_summary <- area_data %>%
    summarize(
      min_acres = min(InfestedAreaAcres, na.rm = TRUE),
      median_acres = median(InfestedAreaAcres, na.rm = TRUE),
      mean_acres = mean(InfestedAreaAcres, na.rm = TRUE),
      max_acres = max(InfestedAreaAcres, na.rm = TRUE),
      total_acres = sum(InfestedAreaAcres, na.rm = TRUE)
    )
  
  area_summary 
if(nrow(area_data) > 0) {
  area_data %>%
    filter(InfestedAreaAcres < quantile(InfestedAreaAcres, 0.95, na.rm = TRUE)) %>%
    ggplot(aes(x = InfestedAreaAcres)) +
    geom_histogram(bins = 50, fill = "#E69F00", color = "white") +
    scale_x_continuous(labels = comma) +
    scale_y_continuous(labels = comma) +
    labs(
      title = "Distribution of Infested Area Sizes",
      subtitle = "Most reports (95th percentile shown to reduce outlier effect)",
      x = "Infested Area (Acres)",
      y = "Number of Reports"
    )
}

Species with Largest Infestations

if(nrow(area_data) > 0) {
  area_data %>%
    group_by(Scientific_Name, SUB_name) %>%
    summarize(
      total_acres = sum(InfestedAreaAcres, na.rm = TRUE),
      max_single = max(InfestedAreaAcres, na.rm = TRUE),
      n_reports = n(),
      .groups = "drop"
    ) %>%
    arrange(desc(total_acres)) %>%
    head(15)
}

7. Species-Specific Deep Dives

Pick a Species to Analyze in Detail

Let’s analyze the most reported species in detail:

# Get most reported species
focus_species_name <- top_species$Scientific_Name[1]
focus_common_name <- top_species$SUB_name[1]

cat("Analyzing:", focus_species_name, "(", focus_common_name, ")\n")

species_data <- eddmaps_clean %>%
  filter(Scientific_Name == focus_species_name)

cat("Total reports:", nrow(species_data), "\n")

Geographic Distribution of corn earworm, tomato fruitworm

ggplot() +
  geom_polygon(data = us_states, 
               aes(x = long, y = lat, group = group),
               fill = "white", color = "gray60", size = 0.3) +
  geom_point(data = species_data, 
             aes(x = x, y = y, color = year), 
             alpha = 0.6, size = 2) +
  scale_color_viridis_c(option = "plasma") +
  coord_map() +
  labs(
    title = paste("Distribution of", focus_common_name),
    subtitle = paste(focus_species_name, "- All EDDMapS reports"),
    x = "Longitude",
    y = "Latitude",
    color = "Year"
  ) +
  theme_minimal()

8. Multi-Species Comparison

Compare Top 5 Species

top5_species <- top_species$Scientific_Name[1:5]

comparison_data <- eddmaps_clean %>%
  filter(Scientific_Name %in% top5_species) %>%
  count(year, Scientific_Name, SUB_name)

Geographic Spread

eddmaps_clean %>%
  filter(Scientific_Name %in% top5_species) %>%
  count(State_Name, SUB_name) %>%
  group_by(SUB_name) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(State_Name, n, SUB_name), y = n, fill = SUB_name)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +
  scale_fill_viridis_d(option = "turbo") +
  facet_wrap(~SUB_name, scales = "free_y", ncol = 2) +
  labs(
    title = "Top States for Each of the 5 Most Reported Species",
    x = NULL,
    y = "Number of Reports"
  ) +
  theme(strip.text = element_text(face = "bold"))

9. Data Quality Assessment

Completeness Analysis

quality_metrics <- data.frame(
  Field = c("Scientific Name", "Common Name", "Date", "Location (x,y)", 
            "State", "County", "Infested Area", "Genus", "Species"),
  Records_With_Data = c(
    sum(!is.na(eddmaps$Scientific_Name) & eddmaps$Scientific_Name != ""),
    sum(!is.na(eddmaps$SUB_name) & eddmaps$SUB_name != ""),
    sum(!is.na(eddmaps$ObservationDate)),
    sum(!is.na(eddmaps$x) & !is.na(eddmaps$y)),
    sum(!is.na(eddmaps$State_Name)),
    sum(!is.na(eddmaps$County_Name)),
    sum(!is.na(eddmaps$InfestedAreaAcres) & eddmaps$InfestedAreaAcres > 0),
    sum(!is.na(eddmaps$SUB_genus)),
    sum(!is.na(eddmaps$SUB_species))
  )
) %>%
  mutate(
    Percent_Complete = round(100 * Records_With_Data / nrow(eddmaps), 1),
    Quality_Rating = case_when(
      Percent_Complete >= 95 ~ "Excellent",
      Percent_Complete >= 80 ~ "Good",
      Percent_Complete >= 50 ~ "Fair",
      TRUE ~ "Poor"
    )
  )

quality_metrics %>%
  arrange(desc(Percent_Complete)) 

Temporal Data Quality

eddmaps_clean %>%
  mutate(
    has_area = !is.na(InfestedAreaAcres) & InfestedAreaAcres > 0,
    has_county = !is.na(County_Name)
  ) %>%
  group_by(year) %>%
  summarize(
    total = n(),
    with_area = sum(has_area),
    with_county = sum(has_county),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c(with_area, with_county),
               names_to = "metric", values_to = "count") %>%
  mutate(
    percent = 100 * count / total,
    metric = recode(metric,
                   "with_area" = "Has Area Data",
                   "with_county" = "Has County Info")
  ) %>%
  ggplot(aes(x = year, y = percent, color = metric, group = metric)) +
  geom_line(size = 1.2) +
  geom_point(size = 2.5) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Data Quality Trends Over Time",
    subtitle = "Percentage of records with additional information",
    x = "Year",
    y = "Percent of Records (%)",
    color = "Data Type"
  )

10. Knowledge Graph Insights

Patterns for Risk Assessment

Based on this exploration, here are key patterns that could feed into your knowledge graph:

Species Traits to Extract:

  • Spread rate: Year-over-year expansion in number of states
  • Geographic breadth: Number of unique states/counties
  • Infestation intensity: Acres infested per report
  • Detection timing: When first detected vs. when spreading
  • Seasonal activity: Peak reporting months (biological timing)

Spatial-Temporal Metrics:

kg_metrics <- eddmaps_clean %>%
  group_by(Scientific_Name, SUB_name) %>%
  summarize(
    n_reports = n(),
    n_states = n_distinct(State_Name),
    n_counties = n_distinct(County_Name),
    years_active = max(year, na.rm = TRUE) - min(year, na.rm = TRUE) + 1,
    first_year = min(year, na.rm = TRUE),
    latest_year = max(year, na.rm = TRUE),
    total_acres = sum(InfestedAreaAcres, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    spread_rate = n_states / pmax(years_active, 1),
    geographic_breadth = n_states,
    reporting_intensity = n_reports / pmax(years_active, 1)
  ) %>%
  arrange(desc(spread_rate))

kg_metrics %>%
  head(20) %>%
  select(SUB_name, Scientific_Name, n_states, years_active, spread_rate, 
         reporting_intensity, total_acres)

High-Risk Species Identification

Species with rapid spread and broad geographic range:

kg_metrics %>%
  filter(n_reports >= 10, years_active >= 3) %>%
  ggplot(aes(x = spread_rate, y = n_reports, 
             size = total_acres, color = years_active)) +
  geom_point(alpha = 0.6) +
  scale_color_viridis_c(option = "plasma") +
  scale_size_continuous(labels = comma) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "High-Risk Species Profile",
    subtitle = "Spread rate vs. detection frequency (bubble size = infested area)",
    x = "Spread Rate (states/year)",
    y = "Number of Reports",
    size = "Total Acres",
    color = "Years Active"
  ) +
  theme(legend.position = "right")

11. Connection to Research Framework

How This Data Informs Your Knowledge Graph

For Arrival Assessment:

  • First detection dates → When species entered regions
  • Geographic coordinates → Climate matching with native range
  • State/county patterns → Human-mediated pathways

For Establishment Assessment:

  • Years active → Persistence in new environments
  • Report frequency → Population stability indicator
  • Seasonal patterns → Life cycle completion

For Spread Assessment:

  • States invaded over time → Dispersal rate
  • Infested area growth → Local population expansion
  • County-level patterns → Within-state movement

For Impact Assessment:

  • Total infested acres → Ecological footprint
  • Report intensity → Management concern level
  • Multi-year presence → Eradication difficulty

12. Next Steps for Knowledge Graph

Data Integration Priorities:

  1. Match to RIIS database → Get establishment means, pathways
  2. Link to USGS NAS → Get narrative vectors, transport info
  3. Connect to trait databases → Add fecundity, dispersal ability
  4. Overlay climate data → Climate matching scores
  5. Link to literature → Mechanistic evidence

Key Variables to Extract:

variables_table <- data.frame(
  Category = c("Species Identity", "Spatial", "Temporal", "Extent", 
               "Traits (to link)", "Risk Factors"),
  Variables = c(
    "Scientific name, genus, family, common name",
    "x, y coordinates, state, county, region",
    "First record, latest record, year, season",
    "Infested acres, number of sites, geographic spread",
    "Fecundity, dispersal, tolerance, pathways (from other sources)",
    "Spread rate, persistence, area invaded, detection lag"
  )
)

variables_table

Summary Statistics

cat("\n=== EDDMAPS DATA SUMMARY ===\n\n")
cat("Total Records:", nrow(eddmaps_clean), "\n")
cat("Unique Species:", n_species, "\n")
cat("States Covered:", n_distinct(eddmaps_clean$State_Name), "\n")
cat("Counties Covered:", n_distinct(eddmaps_clean$County_Name), "\n")
cat("Date Range:", format(min(eddmaps_clean$ObservationDate), "%Y-%m-%d"), "to", 
    format(max(eddmaps_clean$ObservationDate), "%Y-%m-%d"), "\n")
cat("Years of Data:", max(eddmaps_clean$year) - min(eddmaps_clean$year) + 1, "\n\n")

cat("Data Completeness:\n")
cat("  - With area data:", 
    round(100*sum(!is.na(eddmaps_clean$InfestedAreaAcres))/nrow(eddmaps_clean), 1), "%\n")
cat("  - With county info:", 
    round(100*sum(!is.na(eddmaps_clean$County_Name))/nrow(eddmaps_clean), 1), "%\n")
cat("  - With coordinates:", 
    round(100*sum(!is.na(eddmaps_clean$x) & !is.na(eddmaps_clean$y))/nrow(eddmaps_clean), 1), "%\n")
---
title: "EDDMapS Invasive Species Data Exploration"
subtitle: "Understanding Occurrence Patterns for Knowledge Graph Development"
author: "Invasive Species Research"
date: "`r Sys.Date()`"
output: html_notebook
---

# Project Context

This analysis explores **EDDMapS** (Early Detection & Distribution Mapping System) data as part of developing an automated invasive species risk assessment system using knowledge graphs.

**Research Goal:** Extract spatial-temporal patterns, species traits, and invasion dynamics from occurrence data to feed into a knowledge graph that predicts species at risk of invasion.

**Key Questions:** - What species are most frequently reported? - Where are invasion hotspots? - How do invasions spread over time? - What patterns emerge that could inform risk assessment?

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,
  message = TRUE,
  warning = FALSE,
  fig.width = 12,
  fig.height = 7,
  dpi = 150
)
```

# Load Packages

```{r packages, message=FALSE, warning=FALSE}
library(readr)
library(dplyr)
library(stringr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(scales)
library(viridis)
library(knitr)
library(sf)
library(maps)

# Set theme
theme_set(theme_minimal(base_size = 12) +
            theme(plot.title = element_text(face = "bold", size = 14),
                  plot.subtitle = element_text(size = 11, color = "gray30"),
                  legend.position = "bottom"))
```

# 1. Load and Inspect Data

```{r load-data}
# Load EDDMapS data
# UPDATE THIS PATH to match your file location
eddmaps <- read_csv("C:/Users/annasokol/Downloads/EDDMapSPoints_view_6818701269405066160.csv",
                    show_col_types = FALSE)

cat("✓ Data loaded!\n")
cat("Dimensions:", nrow(eddmaps), "rows ×", ncol(eddmaps), "columns\n")
```

## Data Structure

```{r structure}
# Show column names and types
glimpse(eddmaps)
```

## First Look at the Data

```{r head}
head(eddmaps, 20)
```

## Column Descriptions

```{r column-info}
# Understand what each column contains
data.frame(
  Column = names(eddmaps),
  Type = sapply(eddmaps, class),
  Sample_Value = sapply(eddmaps, function(x) as.character(x[1])),
  Non_Missing = sapply(eddmaps, function(x) sum(!is.na(x))),
  Percent_Complete = round(sapply(eddmaps, function(x) sum(!is.na(x))/length(x)*100), 1)
) %>%
  arrange(desc(Percent_Complete))

```

------------------------------------------------------------------------

# 2. Data Cleaning & Preparation

```{r clean-data}
# Clean and prepare data
eddmaps_clean <- eddmaps %>%
  # Parse dates properly
  mutate(
    ObservationDate = as.Date(ObservationDate, format = "%m/%d/%Y"),
    Date_Updated = as.Date(Date_Updated, format = "%m/%d/%Y"),
    year = year(ObservationDate),
    month = month(ObservationDate, label = TRUE),
    # Handle infested area
    InfestedAreaAcres = as.numeric(InfestedAreaAcres),
    # Clean species names
    Scientific_Name = str_trim(Scientific_Name),
    SUB_name = str_trim(SUB_name)
  ) %>%
  # Remove records without dates or locations
  filter(!is.na(ObservationDate), !is.na(x), !is.na(y))

cat("✓ Data cleaned!\n")
cat("Records after cleaning:", nrow(eddmaps_clean), "\n")
cat("Date range:", min(eddmaps_clean$ObservationDate, na.rm = TRUE), "to", 
    max(eddmaps_clean$ObservationDate, na.rm = TRUE), "\n")
```

------------------------------------------------------------------------

# 3. Species Overview

## How Many Species?

```{r species-count}
n_species <- length(unique(eddmaps_clean$Scientific_Name))
n_genera <- length(unique(eddmaps_clean$SUB_genus))
n_families <- length(unique(eddmaps_clean$SUB_name))

cat("Total unique species:", n_species, "\n")
cat("Total genera:", n_genera, "\n")
cat("Total common names:", n_families, "\n")
```

## Top 20 Most Reported Species

```{r top-species-table}
top_species <- eddmaps_clean %>%
  count(Scientific_Name, SUB_name, sort = TRUE) %>%
  head(20)

top_species
```

```{r top-species-plot, fig.width=12, fig.height=8}
eddmaps_clean %>%
  count(Scientific_Name, SUB_name) %>%
  arrange(desc(n)) %>%
  head(20) %>%
  mutate(
    label = paste0(SUB_name, "\n(", Scientific_Name, ")"),
    label = str_wrap(label, width = 30)
  ) %>%
  ggplot(aes(x = reorder(label, n), y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = comma(n)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_fill_viridis_c(option = "plasma", direction = -1) +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Top 20 Most Reported Invasive Species in EDDMapS",
    subtitle = "Number of occurrence records",
    x = NULL,
    y = "Number of Reports"
  ) +
  theme(panel.grid.major.y = element_blank())
```

------------------------------------------------------------------------

# 4. Geographic Distribution

## Reports by State

```{r state-summary}
state_summary <- eddmaps_clean %>%
  count(State_Name, name = "reports") %>%
  arrange(desc(reports))

state_summary %>%
  head(20) 
```

```{r state-plot, fig.width=12, fig.height=8}
state_summary %>%
  head(20) %>%
  ggplot(aes(x = reorder(State_Name, reports), y = reports, fill = reports)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = comma(reports)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "#3B9AB2", high = "#E1AF00") +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Invasive Species Reports by State",
    subtitle = "Top 20 states with most EDDMapS reports",
    x = NULL,
    y = "Number of Reports"
  ) +
  theme(panel.grid.major.y = element_blank())
```

## Geographic Hotspots Map

```{r map-all, fig.width=14, fig.height=10}
# Get US states map
us_states <- map_data("state")

# Create hexbin summary
eddmaps_map <- eddmaps_clean %>%
  filter(!is.na(x), !is.na(y))

# Plot all occurrence points
ggplot() +
  geom_polygon(data = us_states, 
               aes(x = long, y = lat, group = group),
               fill = "white", color = "gray60", size = 0.3) +
  geom_point(data = eddmaps_map, 
             aes(x = x, y = y), 
             color = "#D55E00", alpha = 0.3, size = 0.8) +
  coord_map() +
  labs(
    title = "Geographic Distribution of Invasive Species Reports",
    subtitle = paste("All", nrow(eddmaps_map), "occurrence records from EDDMapS"),
    x = "Longitude",
    y = "Latitude"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11)
  )
```

------------------------------------------------------------------------

# 5. Temporal Patterns

## Reports Over Time

```{r temporal-overview}
temporal_summary <- eddmaps_clean %>%
  count(year) %>%
  arrange(year)

temporal_summary
```

```{r temporal-plot, fig.width=12, fig.height=6}
eddmaps_clean %>%
  count(year) %>%
  ggplot(aes(x = year, y = n)) +
  geom_line(color = "#0072B2", size = 1.2) +
  geom_point(color = "#D55E00", size = 3) +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(breaks = seq(min(eddmaps_clean$year, na.rm = TRUE),
                                  max(eddmaps_clean$year, na.rm = TRUE), 
                                  by = 2)) +
  labs(
    title = "Invasive Species Reports Over Time",
    subtitle = "Annual trend in EDDMapS submissions",
    x = "Year",
    y = "Number of Reports"
  )
```

## Seasonal Patterns

```{r seasonal-plot, fig.width=12, fig.height=6}
eddmaps_clean %>%
  count(month) %>%
  ggplot(aes(x = month, y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_c(option = "magma", direction = -1) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Seasonal Pattern of Invasive Species Reports",
    subtitle = "When are most observations submitted?",
    x = "Month",
    y = "Number of Reports"
  )
```

------------------------------------------------------------------------

# 6. Invasion Extent Analysis

## Infested Area Distribution

```{r infested-area}
area_data <- eddmaps_clean %>%
  filter(!is.na(InfestedAreaAcres), InfestedAreaAcres > 0)

cat("Records with area data:", nrow(area_data), 
    "out of", nrow(eddmaps_clean), 
    "(", round(100*nrow(area_data)/nrow(eddmaps_clean), 1), "%)\n")

if(nrow(area_data) > 0) {
  area_summary <- area_data %>%
    summarize(
      min_acres = min(InfestedAreaAcres, na.rm = TRUE),
      median_acres = median(InfestedAreaAcres, na.rm = TRUE),
      mean_acres = mean(InfestedAreaAcres, na.rm = TRUE),
      max_acres = max(InfestedAreaAcres, na.rm = TRUE),
      total_acres = sum(InfestedAreaAcres, na.rm = TRUE)
    )
  
  area_summary 
```

```{r infested-area-plot, fig.width=12, fig.height=6}
if(nrow(area_data) > 0) {
  area_data %>%
    filter(InfestedAreaAcres < quantile(InfestedAreaAcres, 0.95, na.rm = TRUE)) %>%
    ggplot(aes(x = InfestedAreaAcres)) +
    geom_histogram(bins = 50, fill = "#E69F00", color = "white") +
    scale_x_continuous(labels = comma) +
    scale_y_continuous(labels = comma) +
    labs(
      title = "Distribution of Infested Area Sizes",
      subtitle = "Most reports (95th percentile shown to reduce outlier effect)",
      x = "Infested Area (Acres)",
      y = "Number of Reports"
    )
}
```

## Species with Largest Infestations

```{r largest-infestations}
if(nrow(area_data) > 0) {
  area_data %>%
    group_by(Scientific_Name, SUB_name) %>%
    summarize(
      total_acres = sum(InfestedAreaAcres, na.rm = TRUE),
      max_single = max(InfestedAreaAcres, na.rm = TRUE),
      n_reports = n(),
      .groups = "drop"
    ) %>%
    arrange(desc(total_acres)) %>%
    head(15)
}
```

------------------------------------------------------------------------

# 7. Species-Specific Deep Dives

## Pick a Species to Analyze in Detail

Let's analyze the most reported species in detail:

```{r focus-species}
# Get most reported species
focus_species_name <- top_species$Scientific_Name[1]
focus_common_name <- top_species$SUB_name[1]

cat("Analyzing:", focus_species_name, "(", focus_common_name, ")\n")

species_data <- eddmaps_clean %>%
  filter(Scientific_Name == focus_species_name)

cat("Total reports:", nrow(species_data), "\n")
```

### Geographic Distribution of `r focus_common_name`

```{r species-map, fig.width=14, fig.height=10}
ggplot() +
  geom_polygon(data = us_states, 
               aes(x = long, y = lat, group = group),
               fill = "white", color = "gray60", size = 0.3) +
  geom_point(data = species_data, 
             aes(x = x, y = y, color = year), 
             alpha = 0.6, size = 2) +
  scale_color_viridis_c(option = "plasma") +
  coord_map() +
  labs(
    title = paste("Distribution of", focus_common_name),
    subtitle = paste(focus_species_name, "- All EDDMapS reports"),
    x = "Longitude",
    y = "Latitude",
    color = "Year"
  ) +
  theme_minimal()
```

------------------------------------------------------------------------

# 8. Multi-Species Comparison

## Compare Top 5 Species

```{r top5-comparison}
top5_species <- top_species$Scientific_Name[1:5]

comparison_data <- eddmaps_clean %>%
  filter(Scientific_Name %in% top5_species) %>%
  count(year, Scientific_Name, SUB_name)
```

### Temporal Trends

```{r top5-temporal, fig.width=12, fig.height=7}
comparison_data %>%
  ggplot(aes(x = year, y = n, color = SUB_name, group = SUB_name)) +
  geom_line(size = 1.2) +
  geom_point(size = 2.5) +
  scale_color_viridis_d(option = "turbo") +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Temporal Trends: Top 5 Most Reported Species",
    subtitle = "How have report frequencies changed over time?",
    x = "Year",
    y = "Number of Reports",
    color = "Species"
  )
```

### Geographic Spread

```{r top5-states, fig.width=12, fig.height=8}
eddmaps_clean %>%
  filter(Scientific_Name %in% top5_species) %>%
  count(State_Name, SUB_name) %>%
  group_by(SUB_name) %>%
  top_n(10, n) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(State_Name, n, SUB_name), y = n, fill = SUB_name)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +
  scale_fill_viridis_d(option = "turbo") +
  facet_wrap(~SUB_name, scales = "free_y", ncol = 2) +
  labs(
    title = "Top States for Each of the 5 Most Reported Species",
    x = NULL,
    y = "Number of Reports"
  ) +
  theme(strip.text = element_text(face = "bold"))
```

------------------------------------------------------------------------

# 9. Data Quality Assessment

## Completeness Analysis

```{r data-quality}
quality_metrics <- data.frame(
  Field = c("Scientific Name", "Common Name", "Date", "Location (x,y)", 
            "State", "County", "Infested Area", "Genus", "Species"),
  Records_With_Data = c(
    sum(!is.na(eddmaps$Scientific_Name) & eddmaps$Scientific_Name != ""),
    sum(!is.na(eddmaps$SUB_name) & eddmaps$SUB_name != ""),
    sum(!is.na(eddmaps$ObservationDate)),
    sum(!is.na(eddmaps$x) & !is.na(eddmaps$y)),
    sum(!is.na(eddmaps$State_Name)),
    sum(!is.na(eddmaps$County_Name)),
    sum(!is.na(eddmaps$InfestedAreaAcres) & eddmaps$InfestedAreaAcres > 0),
    sum(!is.na(eddmaps$SUB_genus)),
    sum(!is.na(eddmaps$SUB_species))
  )
) %>%
  mutate(
    Percent_Complete = round(100 * Records_With_Data / nrow(eddmaps), 1),
    Quality_Rating = case_when(
      Percent_Complete >= 95 ~ "Excellent",
      Percent_Complete >= 80 ~ "Good",
      Percent_Complete >= 50 ~ "Fair",
      TRUE ~ "Poor"
    )
  )

quality_metrics %>%
  arrange(desc(Percent_Complete)) 

```

## Temporal Data Quality

```{r temporal-quality, fig.width=12, fig.height=6}
eddmaps_clean %>%
  mutate(
    has_area = !is.na(InfestedAreaAcres) & InfestedAreaAcres > 0,
    has_county = !is.na(County_Name)
  ) %>%
  group_by(year) %>%
  summarize(
    total = n(),
    with_area = sum(has_area),
    with_county = sum(has_county),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c(with_area, with_county),
               names_to = "metric", values_to = "count") %>%
  mutate(
    percent = 100 * count / total,
    metric = recode(metric,
                   "with_area" = "Has Area Data",
                   "with_county" = "Has County Info")
  ) %>%
  ggplot(aes(x = year, y = percent, color = metric, group = metric)) +
  geom_line(size = 1.2) +
  geom_point(size = 2.5) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Data Quality Trends Over Time",
    subtitle = "Percentage of records with additional information",
    x = "Year",
    y = "Percent of Records (%)",
    color = "Data Type"
  )
```

------------------------------------------------------------------------

# 10. Knowledge Graph Insights

## Patterns for Risk Assessment

Based on this exploration, here are key patterns that could feed into your knowledge graph:

### Species Traits to Extract:

-   **Spread rate**: Year-over-year expansion in number of states
-   **Geographic breadth**: Number of unique states/counties
-   **Infestation intensity**: Acres infested per report
-   **Detection timing**: When first detected vs. when spreading
-   **Seasonal activity**: Peak reporting months (biological timing)

### Spatial-Temporal Metrics:

```{r kg-metrics}
kg_metrics <- eddmaps_clean %>%
  group_by(Scientific_Name, SUB_name) %>%
  summarize(
    n_reports = n(),
    n_states = n_distinct(State_Name),
    n_counties = n_distinct(County_Name),
    years_active = max(year, na.rm = TRUE) - min(year, na.rm = TRUE) + 1,
    first_year = min(year, na.rm = TRUE),
    latest_year = max(year, na.rm = TRUE),
    total_acres = sum(InfestedAreaAcres, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    spread_rate = n_states / pmax(years_active, 1),
    geographic_breadth = n_states,
    reporting_intensity = n_reports / pmax(years_active, 1)
  ) %>%
  arrange(desc(spread_rate))

kg_metrics %>%
  head(20) %>%
  select(SUB_name, Scientific_Name, n_states, years_active, spread_rate, 
         reporting_intensity, total_acres)

```

### High-Risk Species Identification

Species with rapid spread and broad geographic range:

```{r high-risk-species, fig.width=12, fig.height=8}
kg_metrics %>%
  filter(n_reports >= 10, years_active >= 3) %>%
  ggplot(aes(x = spread_rate, y = n_reports, 
             size = total_acres, color = years_active)) +
  geom_point(alpha = 0.6) +
  scale_color_viridis_c(option = "plasma") +
  scale_size_continuous(labels = comma) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "High-Risk Species Profile",
    subtitle = "Spread rate vs. detection frequency (bubble size = infested area)",
    x = "Spread Rate (states/year)",
    y = "Number of Reports",
    size = "Total Acres",
    color = "Years Active"
  ) +
  theme(legend.position = "right")
```

------------------------------------------------------------------------

# 11. Connection to Research Framework

## How This Data Informs Your Knowledge Graph

### For Arrival Assessment:

-   **First detection dates** → When species entered regions
-   **Geographic coordinates** → Climate matching with native range
-   **State/county patterns** → Human-mediated pathways

### For Establishment Assessment:

-   **Years active** → Persistence in new environments
-   **Report frequency** → Population stability indicator
-   **Seasonal patterns** → Life cycle completion

### For Spread Assessment:

-   **States invaded over time** → Dispersal rate
-   **Infested area growth** → Local population expansion
-   **County-level patterns** → Within-state movement

### For Impact Assessment:

-   **Total infested acres** → Ecological footprint
-   **Report intensity** → Management concern level
-   **Multi-year presence** → Eradication difficulty

------------------------------------------------------------------------

# 12. Next Steps for Knowledge Graph

## Data Integration Priorities:

1.  **Match to RIIS database** → Get establishment means, pathways
2.  **Link to USGS NAS** → Get narrative vectors, transport info
3.  **Connect to trait databases** → Add fecundity, dispersal ability
4.  **Overlay climate data** → Climate matching scores
5.  **Link to literature** → Mechanistic evidence

## Key Variables to Extract:

```{r variables-needed}
variables_table <- data.frame(
  Category = c("Species Identity", "Spatial", "Temporal", "Extent", 
               "Traits (to link)", "Risk Factors"),
  Variables = c(
    "Scientific name, genus, family, common name",
    "x, y coordinates, state, county, region",
    "First record, latest record, year, season",
    "Infested acres, number of sites, geographic spread",
    "Fecundity, dispersal, tolerance, pathways (from other sources)",
    "Spread rate, persistence, area invaded, detection lag"
  )
)

variables_table

```

------------------------------------------------------------------------

# Summary Statistics

```{r final-summary}
cat("\n=== EDDMAPS DATA SUMMARY ===\n\n")
cat("Total Records:", nrow(eddmaps_clean), "\n")
cat("Unique Species:", n_species, "\n")
cat("States Covered:", n_distinct(eddmaps_clean$State_Name), "\n")
cat("Counties Covered:", n_distinct(eddmaps_clean$County_Name), "\n")
cat("Date Range:", format(min(eddmaps_clean$ObservationDate), "%Y-%m-%d"), "to", 
    format(max(eddmaps_clean$ObservationDate), "%Y-%m-%d"), "\n")
cat("Years of Data:", max(eddmaps_clean$year) - min(eddmaps_clean$year) + 1, "\n\n")

cat("Data Completeness:\n")
cat("  - With area data:", 
    round(100*sum(!is.na(eddmaps_clean$InfestedAreaAcres))/nrow(eddmaps_clean), 1), "%\n")
cat("  - With county info:", 
    round(100*sum(!is.na(eddmaps_clean$County_Name))/nrow(eddmaps_clean), 1), "%\n")
cat("  - With coordinates:", 
    round(100*sum(!is.na(eddmaps_clean$x) & !is.na(eddmaps_clean$y))/nrow(eddmaps_clean), 1), "%\n")
```
