Matching RISE and Geoconnex Dam Locations

Overview

This analysis matches dam locations between two authoritative sources:

  1. RISE (Reclamation Information Sharing Environment) - Bureau of Reclamation’s dam database
  2. Geoconnex - A reference feature service for water data features of interest

The matching process considers both spatial proximity and name similarity to establish relationships between these datasets.

Data Loading and Preparation

First, we load the required R packages for spatial analysis, data transformation, and string matching:

Code
library(sf) # manipulate spatial data
library(dplyr) # perform data transformations
library(stringdist) # create name similarity
library(DT) # interactive tables

Helper Function: Converting Coordinates to SF Objects

We define a helper function to convert coordinate data into spatial features:

Code
convert_to_sf <- function(df) {
  # Check if coordinates column exists
  if (!"coordinates" %in% names(df)) {
    stop("Input data frame must have a 'coordinates' column")
  }
  
  # Extract coordinates into a matrix more safely
  # First check if coordinates are already a matrix
  if (is.matrix(df$coordinates)) {
    coords_matrix <- df$coordinates
  } else {
    # Try to convert from list format
    coords_matrix <- t(sapply(df$coordinates, function(x) {
      if (is.numeric(x)) return(x)
      # If it's character, try to parse it
      as.numeric(strsplit(gsub("[][]", "", x), "\\s+")[[1]])
    }))
  }
  
  # Create sf object
  sf_points <- st_as_sf(
    df,
    coords = coords_matrix,
    crs = 4326,  # Assuming WGS84
    agr = "constant"
  ) %>%
    # Remove the original coordinates column as it's now in geometry
    select(-coordinates)
  
  return(sf_points)
}

Loading Source Datasets

We’ll load data from both Geoconnex and RISE APIs:

Code
# Load geoconnex reference dams from the reference feature service
gcnx <- sf::read_sf("https://reference.geoconnex.us/collections/dams/items?limit=1000000")

# Load all locations from RISE (paginated API)
rise1 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=1&itemsPerPage=100&locationTypeId=3")$data
rise2 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=2&itemsPerPage=100&locationTypeId=3")$data
rise3 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=3&itemsPerPage=100&locationTypeId=3")$data
rise <- bind_rows(rise1,rise2,rise3)
rm(rise1,rise2,rise3)

Processing RISE Dam Data

Filter and prepare RISE data to include only dam and reservoir locations:

Code
rise_dams <- rise |>
  filter(grepl("Dam", attributes$locationName, ignore.case = TRUE) |
         grepl("Reservoir", attributes$locationName, ignore.case = TRUE) |
         grepl("Dam", attributes$locationDescription, ignore.case = TRUE))

rise_dams <- cbind(rise_dams$id,
                   rise_dams$attributes$locationName,
                   rise_dams$attributes$locationDescription,
                   rise_dams$attributes$locationGeometry,
                   rise_dams$attributes$locationCoordinates)

rise_dams <- rise_dams |> 
  rename(id = `rise_dams$id`,
         name = `rise_dams$attributes$locationName`,
         description = `rise_dams$attributes$locationDescription`)

rise_dams <- rise_dams |> filter(locationGeometry == 'point')

# Extract coordinates
for (i in 1:length(rise_dams$id)) {
  rise_dams$lon[i] <- rise_dams$coordinates[[i]][1]
  rise_dams$lat[i] <- rise_dams$coordinates[[i]][2]
}

rise_dams_sf <- st_as_sf(rise_dams, coords = c("lon","lat"), crs=4326)
rise_dams_sf <- rise_dams_sf |> select(-coordinates)

Spatial Matching Function

Define a function to find matches between spatial datasets based on location and name similarity:

Code
library(sf)
library(dplyr)
library(stringdist)

find_spatial_matches <- function(points_a, points_b, initial_max_distance = 1000,
                               min_name_similarity = 0.3,
                               points_a_id_col = NULL, 
                               points_b_id_col = NULL) {
  
  # Validate ID columns if specified
  if (!is.null(points_a_id_col) && !(points_a_id_col %in% names(points_a))) {
    stop(sprintf("ID column '%s' not found in points_a dataset", points_a_id_col))
  }
  if (!is.null(points_b_id_col) && !(points_b_id_col %in% names(points_b))) {
    stop(sprintf("ID column '%s' not found in points_b dataset", points_b_id_col))
  }
  
  # Function to get matches for a single point with adaptive distance
  get_point_matches <- function(point_idx, current_max_distance) {
    # Create self-match row first
    self_match <- points_a[point_idx,] %>%
      mutate(
        point_a_name = name,
        point_b_name = name,
        distance = 0,
        name_similarity = 1,
        adjusted_distance = FALSE,
        point_a_wkt = st_as_text(st_geometry(.)),
        point_b_wkt = st_as_text(st_geometry(.)),
        match_type = "self"
      )
    
    # Get distances to all points in B
    distances <- st_distance(points_a[point_idx,], points_b)
    dist_vector <- units::drop_units(distances[1,])
    
    while(TRUE) {
      nearby_indices <- which(dist_vector <= current_max_distance)
      
      if (length(nearby_indices) > 0) {
        # Calculate name similarities for nearby points
        similarities <- sapply(points_b$name[nearby_indices], function(name_b) {
          calculate_name_similarity(points_a$name[point_idx], name_b)
        })
        
        # Check if any meet similarity threshold
        keep_indices <- which(similarities >= min_name_similarity)
        
        if (length(keep_indices) > 0) {
          # Get matching points from points_b with geometry
          matched_points <- points_b[nearby_indices[keep_indices],]
          
          # Create candidate matches
          candidate_matches <- matched_points %>%
            mutate(
              point_a_name = points_a$name[point_idx],
              point_b_name = name,
              distance = dist_vector[nearby_indices[keep_indices]],
              name_similarity = similarities[keep_indices],
              adjusted_distance = current_max_distance > initial_max_distance,
              point_a_wkt = st_as_text(st_geometry(points_a[point_idx,])),
              point_b_wkt = st_as_text(st_geometry(matched_points)),
              match_type = "candidate"
            )
          
          # Add ID columns if specified
          if (!is.null(points_a_id_col)) {
            self_match$point_a_id <- points_a[[points_a_id_col]][point_idx]
            self_match$point_b_id <- points_a[[points_a_id_col]][point_idx]
            candidate_matches$point_a_id <- points_a[[points_a_id_col]][point_idx]
            candidate_matches$point_b_id <- candidate_matches[[points_b_id_col]]
          }
          
          # Combine self-match with candidate matches
          result <- bind_rows(self_match, candidate_matches)
          return(result)
        }
      }
      
      current_max_distance <- current_max_distance * 2
      
      if (current_max_distance > 50000) {  # 50 km
        warning(sprintf("Very large search distance needed for point: %s", 
                       points_a$name[point_idx]))
      }
    }
  }
  
  # Calculate name similarity score (helper function)
  calculate_name_similarity <- function(name1, name2) {
    name1 <- tolower(name1)
    name2 <- tolower(name2)
    1 - stringdist(name1, name2, method = "jw")
  }
  
  # Get matches for each point in points_a
  all_matches <- lapply(1:nrow(points_a), function(i) {
    get_point_matches(i, initial_max_distance)
  })
  
  # Combine all matches and sort
  results <- bind_rows(all_matches) %>%
    group_by(point_a_name) %>%
    arrange(match_type, distance, desc(name_similarity), .by_group = TRUE) %>%
    ungroup()
  
  return(results)
}

Finding Matches

Apply the matching function to our datasets:

Code
matches <- find_spatial_matches(
  rise_dams_sf,
  gcnx,
  points_a_id_col = "id",
  points_b_id_col = "uri",
  initial_max_distance = 1000,
  min_name_similarity = 0.3
)

# Prepare results for visualization
matches$rise_name <- matches$point_a_name
matches$geoconnex_name <- matches$point_b_name
matches$rise_id <- matches$point_a_id
matches$geoconnex_id <- matches$uri
matches$src <- NA
matches$src[which(matches$match_type=="self")] <- "rise"
matches$src[which(matches$match_type=="candidate")] <- "geoconnex"

# Create final match set
match_set <- matches |> 
  select(rise_name, rise_id, geoconnex_name, geoconnex_id, 
         distance, name_similarity, point_a_wkt, point_b_wkt,
         mainstem_uri, nhdpv2_reachcode_uri, nhdpv2_reach_measure, src)

Interactive Visualization

The map below shows the spatial distribution of matched dams. Points are colored by source: - Yellow points represent RISE dams/reservoirs - Purple points represent matched Geoconnex dams

The table before the map can be sorted, searched, and exported to csv or excel.

Code
mapview::mapview(match_set, zcol="src")
Code
# Prepare display table (excluding WKT fields for readability)
display_table <- match_set %>% filter(src == "geoconnex") %>%
  select(rise_name, rise_id, geoconnex_name, geoconnex_id, mainstem_uri,
         distance, name_similarity) %>%
  mutate(
    distance = round(distance, 1),
    name_similarity = round(name_similarity, 3)
  ) 

# Create interactive table with download buttons
DT::datatable(
  display_table,
  extensions = c('Buttons', 'SearchBuilder'),
  options = list(
    dom = 'Bfrtip',
    buttons = list(
      'copy', 
      list(
        extend = 'collection',
        buttons = c('csv', 'excel'),
        text = 'Download'
      )
    ),
    pageLength = 10,
    order = list(list(4, 'asc')),  # Sort by distance by default
    scrollX = TRUE
  ),
  filter = 'top',
  caption = htmltools::tags$caption(
    style = 'caption-side: bottom; text-align: center;',
    'Table: Matched Dams from RISE and Geoconnex Databases'
  )
)