Matching RISE and Geoconnex Dam Locations

Overview

This analysis matches dam locations between two authoritative sources:

RISE (Reclamation Information Sharing Environment) - Bureau of Reclamation’s dam database
Geoconnex - A reference feature service for water data features of interest

The matching process considers both spatial proximity and name similarity to establish relationships between these datasets.

Data Loading and Preparation

First, we load the required R packages for spatial analysis, data transformation, and string matching:

Code

library(sf) # manipulate spatial data
library(dplyr) # perform data transformations
library(stringdist) # create name similarity
library(DT) # interactive tables

Helper Function: Converting Coordinates to SF Objects

We define a helper function to convert coordinate data into spatial features:

Code

convert_to_sf <- function(df) {
  # Check if coordinates column exists
  if (!"coordinates" %in% names(df)) {
    stop("Input data frame must have a 'coordinates' column")
  }
  
  # Extract coordinates into a matrix more safely
  # First check if coordinates are already a matrix
  if (is.matrix(df$coordinates)) {
    coords_matrix <- df$coordinates
  } else {
    # Try to convert from list format
    coords_matrix <- t(sapply(df$coordinates, function(x) {
      if (is.numeric(x)) return(x)
      # If it's character, try to parse it
      as.numeric(strsplit(gsub("[][]", "", x), "\\s+")[[1]])
    }))
  }
  
  # Create sf object
  sf_points <- st_as_sf(
    df,
    coords = coords_matrix,
    crs = 4326,  # Assuming WGS84
    agr = "constant"
  ) %>%
    # Remove the original coordinates column as it's now in geometry
    select(-coordinates)
  
  return(sf_points)
}

Loading Source Datasets

We’ll load data from both Geoconnex and RISE APIs:

Code

# Load geoconnex reference dams from the reference feature service
gcnx <- sf::read_sf("https://reference.geoconnex.us/collections/dams/items?limit=1000000")

# Load all locations from RISE (paginated API)
rise1 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=1&itemsPerPage=100&locationTypeId=3")$data
rise2 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=2&itemsPerPage=100&locationTypeId=3")$data
rise3 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=3&itemsPerPage=100&locationTypeId=3")$data
rise <- bind_rows(rise1,rise2,rise3)
rm(rise1,rise2,rise3)

Processing RISE Dam Data

Filter and prepare RISE data to include only dam and reservoir locations:

Code

rise_dams <- rise |>
  filter(grepl("Dam", attributes$locationName, ignore.case = TRUE) |
         grepl("Reservoir", attributes$locationName, ignore.case = TRUE) |
         grepl("Dam", attributes$locationDescription, ignore.case = TRUE))

rise_dams <- cbind(rise_dams$id,
                   rise_dams$attributes$locationName,
                   rise_dams$attributes$locationDescription,
                   rise_dams$attributes$locationGeometry,
                   rise_dams$attributes$locationCoordinates)

rise_dams <- rise_dams |> 
  rename(id = `rise_dams$id`,
         name = `rise_dams$attributes$locationName`,
         description = `rise_dams$attributes$locationDescription`)

rise_dams <- rise_dams |> filter(locationGeometry == 'point')

# Extract coordinates
for (i in 1:length(rise_dams$id)) {
  rise_dams$lon[i] <- rise_dams$coordinates[[i]][1]
  rise_dams$lat[i] <- rise_dams$coordinates[[i]][2]
}

rise_dams_sf <- st_as_sf(rise_dams, coords = c("lon","lat"), crs=4326)
rise_dams_sf <- rise_dams_sf |> select(-coordinates)

Spatial Matching Function

Define a function to find matches between spatial datasets based on location and name similarity:

Code

library(sf)
library(dplyr)
library(stringdist)

find_spatial_matches <- function(points_a, points_b, initial_max_distance = 1000,
                               min_name_similarity = 0.3,
                               points_a_id_col = NULL, 
                               points_b_id_col = NULL) {
  
  # Validate ID columns if specified
  if (!is.null(points_a_id_col) && !(points_a_id_col %in% names(points_a))) {
    stop(sprintf("ID column '%s' not found in points_a dataset", points_a_id_col))
  }
  if (!is.null(points_b_id_col) && !(points_b_id_col %in% names(points_b))) {
    stop(sprintf("ID column '%s' not found in points_b dataset", points_b_id_col))
  }
  
  # Function to get matches for a single point with adaptive distance
  get_point_matches <- function(point_idx, current_max_distance) {
    # Create self-match row first
    self_match <- points_a[point_idx,] %>%
      mutate(
        point_a_name = name,
        point_b_name = name,
        distance = 0,
        name_similarity = 1,
        adjusted_distance = FALSE,
        point_a_wkt = st_as_text(st_geometry(.)),
        point_b_wkt = st_as_text(st_geometry(.)),
        match_type = "self"
      )
    
    # Get distances to all points in B
    distances <- st_distance(points_a[point_idx,], points_b)
    dist_vector <- units::drop_units(distances[1,])
    
    while(TRUE) {
      nearby_indices <- which(dist_vector <= current_max_distance)
      
      if (length(nearby_indices) > 0) {
        # Calculate name similarities for nearby points
        similarities <- sapply(points_b$name[nearby_indices], function(name_b) {
          calculate_name_similarity(points_a$name[point_idx], name_b)
        })
        
        # Check if any meet similarity threshold
        keep_indices <- which(similarities >= min_name_similarity)
        
        if (length(keep_indices) > 0) {
          # Get matching points from points_b with geometry
          matched_points <- points_b[nearby_indices[keep_indices],]
          
          # Create candidate matches
          candidate_matches <- matched_points %>%
            mutate(
              point_a_name = points_a$name[point_idx],
              point_b_name = name,
              distance = dist_vector[nearby_indices[keep_indices]],
              name_similarity = similarities[keep_indices],
              adjusted_distance = current_max_distance > initial_max_distance,
              point_a_wkt = st_as_text(st_geometry(points_a[point_idx,])),
              point_b_wkt = st_as_text(st_geometry(matched_points)),
              match_type = "candidate"
            )
          
          # Add ID columns if specified
          if (!is.null(points_a_id_col)) {
            self_match$point_a_id <- points_a[[points_a_id_col]][point_idx]
            self_match$point_b_id <- points_a[[points_a_id_col]][point_idx]
            candidate_matches$point_a_id <- points_a[[points_a_id_col]][point_idx]
            candidate_matches$point_b_id <- candidate_matches[[points_b_id_col]]
          }
          
          # Combine self-match with candidate matches
          result <- bind_rows(self_match, candidate_matches)
          return(result)
        }
      }
      
      current_max_distance <- current_max_distance * 2
      
      if (current_max_distance > 50000) {  # 50 km
        warning(sprintf("Very large search distance needed for point: %s", 
                       points_a$name[point_idx]))
      }
    }
  }
  
  # Calculate name similarity score (helper function)
  calculate_name_similarity <- function(name1, name2) {
    name1 <- tolower(name1)
    name2 <- tolower(name2)
    1 - stringdist(name1, name2, method = "jw")
  }
  
  # Get matches for each point in points_a
  all_matches <- lapply(1:nrow(points_a), function(i) {
    get_point_matches(i, initial_max_distance)
  })
  
  # Combine all matches and sort
  results <- bind_rows(all_matches) %>%
    group_by(point_a_name) %>%
    arrange(match_type, distance, desc(name_similarity), .by_group = TRUE) %>%
    ungroup()
  
  return(results)
}

Finding Matches

Apply the matching function to our datasets:

Code

matches <- find_spatial_matches(
  rise_dams_sf,
  gcnx,
  points_a_id_col = "id",
  points_b_id_col = "uri",
  initial_max_distance = 1000,
  min_name_similarity = 0.3
)

# Prepare results for visualization
matches$rise_name <- matches$point_a_name
matches$geoconnex_name <- matches$point_b_name
matches$rise_id <- matches$point_a_id
matches$geoconnex_id <- matches$uri
matches$src <- NA
matches$src[which(matches$match_type=="self")] <- "rise"
matches$src[which(matches$match_type=="candidate")] <- "geoconnex"

# Create final match set
match_set <- matches |> 
  select(rise_name, rise_id, geoconnex_name, geoconnex_id, 
         distance, name_similarity, point_a_wkt, point_b_wkt,
         mainstem_uri, nhdpv2_reachcode_uri, nhdpv2_reach_measure, src)

Interactive Visualization

The map below shows the spatial distribution of matched dams. Points are colored by source: - Yellow points represent RISE dams/reservoirs - Purple points represent matched Geoconnex dams

The table before the map can be sorted, searched, and exported to csv or excel.

Code

mapview::mapview(match_set, zcol="src")

Code

# Prepare display table (excluding WKT fields for readability)
display_table <- match_set %>% filter(src == "geoconnex") %>%
  select(rise_name, rise_id, geoconnex_name, geoconnex_id, mainstem_uri,
         distance, name_similarity) %>%
  mutate(
    distance = round(distance, 1),
    name_similarity = round(name_similarity, 3)
  ) 

# Create interactive table with download buttons
DT::datatable(
  display_table,
  extensions = c('Buttons', 'SearchBuilder'),
  options = list(
    dom = 'Bfrtip',
    buttons = list(
      'copy', 
      list(
        extend = 'collection',
        buttons = c('csv', 'excel'),
        text = 'Download'
      )
    ),
    pageLength = 10,
    order = list(list(4, 'asc')),  # Sort by distance by default
    scrollX = TRUE
  ),
  filter = 'top',
  caption = htmltools::tags$caption(
    style = 'caption-side: bottom; text-align: center;',
    'Table: Matched Dams from RISE and Geoconnex Databases'
  )
)

--- title: "Matching RISE and Geoconnex Dam Locations" format: html: code-fold: true code-tools: true toc: true toc-float: true --- ## Overview This analysis matches dam locations between two authoritative sources: 1. RISE (Reclamation Information Sharing Environment) - Bureau of Reclamation's dam database 2. Geoconnex - A reference feature service for water data features of interest The matching process considers both spatial proximity and name similarity to establish relationships between these datasets. ## Data Loading and Preparation First, we load the required R packages for spatial analysis, data transformation, and string matching: ```{r} #| label: setup #| message: false #| warning: false library(sf) # manipulate spatial data library(dplyr) # perform data transformations library(stringdist) # create name similarity library(DT) # interactive tables ``` ### Helper Function: Converting Coordinates to SF Objects We define a helper function to convert coordinate data into spatial features: ```{r} #| label: convert-function #| code-fold: show #| output: false convert_to_sf <- function(df) { # Check if coordinates column exists if (!"coordinates" %in% names(df)) { stop("Input data frame must have a 'coordinates' column") } # Extract coordinates into a matrix more safely # First check if coordinates are already a matrix if (is.matrix(df$coordinates)) { coords_matrix <- df$coordinates } else { # Try to convert from list format coords_matrix <- t(sapply(df$coordinates, function(x) { if (is.numeric(x)) return(x) # If it's character, try to parse it as.numeric(strsplit(gsub("[][]", "", x), "\\s+")[[1]]) })) } # Create sf object sf_points <- st_as_sf( df, coords = coords_matrix, crs = 4326, # Assuming WGS84 agr = "constant" ) %>% # Remove the original coordinates column as it's now in geometry select(-coordinates) return(sf_points) } ``` ### Loading Source Datasets We'll load data from both Geoconnex and RISE APIs: ```{r} #| label: load-data #| message: false #| warning: false #| output: false # Load geoconnex reference dams from the reference feature service gcnx <- sf::read_sf("https://reference.geoconnex.us/collections/dams/items?limit=1000000") # Load all locations from RISE (paginated API) rise1 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=1&itemsPerPage=100&locationTypeId=3")$data rise2 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=2&itemsPerPage=100&locationTypeId=3")$data rise3 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=3&itemsPerPage=100&locationTypeId=3")$data rise <- bind_rows(rise1,rise2,rise3) rm(rise1,rise2,rise3) ``` ### Processing RISE Dam Data Filter and prepare RISE data to include only dam and reservoir locations: ```{r} #| label: process-rise #| output: false rise_dams <- rise |> filter(grepl("Dam", attributes$locationName, ignore.case = TRUE) | grepl("Reservoir", attributes$locationName, ignore.case = TRUE) | grepl("Dam", attributes$locationDescription, ignore.case = TRUE)) rise_dams <- cbind(rise_dams$id, rise_dams$attributes$locationName, rise_dams$attributes$locationDescription, rise_dams$attributes$locationGeometry, rise_dams$attributes$locationCoordinates) rise_dams <- rise_dams |> rename(id = `rise_dams$id`, name = `rise_dams$attributes$locationName`, description = `rise_dams$attributes$locationDescription`) rise_dams <- rise_dams |> filter(locationGeometry == 'point') # Extract coordinates for (i in 1:length(rise_dams$id)) { rise_dams$lon[i] <- rise_dams$coordinates[[i]][1] rise_dams$lat[i] <- rise_dams$coordinates[[i]][2] } rise_dams_sf <- st_as_sf(rise_dams, coords = c("lon","lat"), crs=4326) rise_dams_sf <- rise_dams_sf |> select(-coordinates) ``` ## Spatial Matching Function Define a function to find matches between spatial datasets based on location and name similarity: ```{r} #| label: matching-function #| code-fold: show #| output: false library(sf) library(dplyr) library(stringdist) find_spatial_matches <- function(points_a, points_b, initial_max_distance = 1000, min_name_similarity = 0.3, points_a_id_col = NULL, points_b_id_col = NULL) { # Validate ID columns if specified if (!is.null(points_a_id_col) && !(points_a_id_col %in% names(points_a))) { stop(sprintf("ID column '%s' not found in points_a dataset", points_a_id_col)) } if (!is.null(points_b_id_col) && !(points_b_id_col %in% names(points_b))) { stop(sprintf("ID column '%s' not found in points_b dataset", points_b_id_col)) } # Function to get matches for a single point with adaptive distance get_point_matches <- function(point_idx, current_max_distance) { # Create self-match row first self_match <- points_a[point_idx,] %>% mutate( point_a_name = name, point_b_name = name, distance = 0, name_similarity = 1, adjusted_distance = FALSE, point_a_wkt = st_as_text(st_geometry(.)), point_b_wkt = st_as_text(st_geometry(.)), match_type = "self" ) # Get distances to all points in B distances <- st_distance(points_a[point_idx,], points_b) dist_vector <- units::drop_units(distances[1,]) while(TRUE) { nearby_indices <- which(dist_vector <= current_max_distance) if (length(nearby_indices) > 0) { # Calculate name similarities for nearby points similarities <- sapply(points_b$name[nearby_indices], function(name_b) { calculate_name_similarity(points_a$name[point_idx], name_b) }) # Check if any meet similarity threshold keep_indices <- which(similarities >= min_name_similarity) if (length(keep_indices) > 0) { # Get matching points from points_b with geometry matched_points <- points_b[nearby_indices[keep_indices],] # Create candidate matches candidate_matches <- matched_points %>% mutate( point_a_name = points_a$name[point_idx], point_b_name = name, distance = dist_vector[nearby_indices[keep_indices]], name_similarity = similarities[keep_indices], adjusted_distance = current_max_distance > initial_max_distance, point_a_wkt = st_as_text(st_geometry(points_a[point_idx,])), point_b_wkt = st_as_text(st_geometry(matched_points)), match_type = "candidate" ) # Add ID columns if specified if (!is.null(points_a_id_col)) { self_match$point_a_id <- points_a[[points_a_id_col]][point_idx] self_match$point_b_id <- points_a[[points_a_id_col]][point_idx] candidate_matches$point_a_id <- points_a[[points_a_id_col]][point_idx] candidate_matches$point_b_id <- candidate_matches[[points_b_id_col]] } # Combine self-match with candidate matches result <- bind_rows(self_match, candidate_matches) return(result) } } current_max_distance <- current_max_distance * 2 if (current_max_distance > 50000) { # 50 km warning(sprintf("Very large search distance needed for point: %s", points_a$name[point_idx])) } } } # Calculate name similarity score (helper function) calculate_name_similarity <- function(name1, name2) { name1 <- tolower(name1) name2 <- tolower(name2) 1 - stringdist(name1, name2, method = "jw") } # Get matches for each point in points_a all_matches <- lapply(1:nrow(points_a), function(i) { get_point_matches(i, initial_max_distance) }) # Combine all matches and sort results <- bind_rows(all_matches) %>% group_by(point_a_name) %>% arrange(match_type, distance, desc(name_similarity), .by_group = TRUE) %>% ungroup() return(results) } ``` ## Finding Matches Apply the matching function to our datasets: ```{r} #| label: find-matches #| output: false matches <- find_spatial_matches( rise_dams_sf, gcnx, points_a_id_col = "id", points_b_id_col = "uri", initial_max_distance = 1000, min_name_similarity = 0.3 ) # Prepare results for visualization matches$rise_name <- matches$point_a_name matches$geoconnex_name <- matches$point_b_name matches$rise_id <- matches$point_a_id matches$geoconnex_id <- matches$uri matches$src <- NA matches$src[which(matches$match_type=="self")] <- "rise" matches$src[which(matches$match_type=="candidate")] <- "geoconnex" # Create final match set match_set <- matches |> select(rise_name, rise_id, geoconnex_name, geoconnex_id, distance, name_similarity, point_a_wkt, point_b_wkt, mainstem_uri, nhdpv2_reachcode_uri, nhdpv2_reach_measure, src) ``` ## Interactive Visualization The map below shows the spatial distribution of matched dams. Points are colored by source: - Yellow points represent RISE dams/reservoirs - Purple points represent matched Geoconnex dams The table before the map can be sorted, searched, and exported to csv or excel. ```{r} #| label: map-view #| message: false #| warning: false #| output: true #| column: page mapview::mapview(match_set, zcol="src") ``` ```{r} #| label: results-table #| message: false #| warning: false #| output: true #| column: page # Prepare display table (excluding WKT fields for readability) display_table <- match_set %>% filter(src == "geoconnex") %>% select(rise_name, rise_id, geoconnex_name, geoconnex_id, mainstem_uri, distance, name_similarity) %>% mutate( distance = round(distance, 1), name_similarity = round(name_similarity, 3) ) # Create interactive table with download buttons DT::datatable( display_table, extensions = c('Buttons', 'SearchBuilder'), options = list( dom = 'Bfrtip', buttons = list( 'copy', list( extend = 'collection', buttons = c('csv', 'excel'), text = 'Download' ) ), pageLength = 10, order = list(list(4, 'asc')), # Sort by distance by default scrollX = TRUE ), filter = 'top', caption = htmltools::tags$caption( style = 'caption-side: bottom; text-align: center;', 'Table: Matched Dams from RISE and Geoconnex Databases' ) ) ```