Code
library(sf) # manipulate spatial data
library(dplyr) # perform data transformations
library(stringdist) # create name similarity
library(DT) # interactive tablesThis analysis matches dam locations between two authoritative sources:
The matching process considers both spatial proximity and name similarity to establish relationships between these datasets.
First, we load the required R packages for spatial analysis, data transformation, and string matching:
library(sf) # manipulate spatial data
library(dplyr) # perform data transformations
library(stringdist) # create name similarity
library(DT) # interactive tablesWe define a helper function to convert coordinate data into spatial features:
convert_to_sf <- function(df) {
# Check if coordinates column exists
if (!"coordinates" %in% names(df)) {
stop("Input data frame must have a 'coordinates' column")
}
# Extract coordinates into a matrix more safely
# First check if coordinates are already a matrix
if (is.matrix(df$coordinates)) {
coords_matrix <- df$coordinates
} else {
# Try to convert from list format
coords_matrix <- t(sapply(df$coordinates, function(x) {
if (is.numeric(x)) return(x)
# If it's character, try to parse it
as.numeric(strsplit(gsub("[][]", "", x), "\\s+")[[1]])
}))
}
# Create sf object
sf_points <- st_as_sf(
df,
coords = coords_matrix,
crs = 4326, # Assuming WGS84
agr = "constant"
) %>%
# Remove the original coordinates column as it's now in geometry
select(-coordinates)
return(sf_points)
}We’ll load data from both Geoconnex and RISE APIs:
# Load geoconnex reference dams from the reference feature service
gcnx <- sf::read_sf("https://reference.geoconnex.us/collections/dams/items?limit=1000000")
# Load all locations from RISE (paginated API)
rise1 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=1&itemsPerPage=100&locationTypeId=3")$data
rise2 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=2&itemsPerPage=100&locationTypeId=3")$data
rise3 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=3&itemsPerPage=100&locationTypeId=3")$data
rise <- bind_rows(rise1,rise2,rise3)
rm(rise1,rise2,rise3)Filter and prepare RISE data to include only dam and reservoir locations:
rise_dams <- rise |>
filter(grepl("Dam", attributes$locationName, ignore.case = TRUE) |
grepl("Reservoir", attributes$locationName, ignore.case = TRUE) |
grepl("Dam", attributes$locationDescription, ignore.case = TRUE))
rise_dams <- cbind(rise_dams$id,
rise_dams$attributes$locationName,
rise_dams$attributes$locationDescription,
rise_dams$attributes$locationGeometry,
rise_dams$attributes$locationCoordinates)
rise_dams <- rise_dams |>
rename(id = `rise_dams$id`,
name = `rise_dams$attributes$locationName`,
description = `rise_dams$attributes$locationDescription`)
rise_dams <- rise_dams |> filter(locationGeometry == 'point')
# Extract coordinates
for (i in 1:length(rise_dams$id)) {
rise_dams$lon[i] <- rise_dams$coordinates[[i]][1]
rise_dams$lat[i] <- rise_dams$coordinates[[i]][2]
}
rise_dams_sf <- st_as_sf(rise_dams, coords = c("lon","lat"), crs=4326)
rise_dams_sf <- rise_dams_sf |> select(-coordinates)Define a function to find matches between spatial datasets based on location and name similarity:
library(sf)
library(dplyr)
library(stringdist)
find_spatial_matches <- function(points_a, points_b, initial_max_distance = 1000,
min_name_similarity = 0.3,
points_a_id_col = NULL,
points_b_id_col = NULL) {
# Validate ID columns if specified
if (!is.null(points_a_id_col) && !(points_a_id_col %in% names(points_a))) {
stop(sprintf("ID column '%s' not found in points_a dataset", points_a_id_col))
}
if (!is.null(points_b_id_col) && !(points_b_id_col %in% names(points_b))) {
stop(sprintf("ID column '%s' not found in points_b dataset", points_b_id_col))
}
# Function to get matches for a single point with adaptive distance
get_point_matches <- function(point_idx, current_max_distance) {
# Create self-match row first
self_match <- points_a[point_idx,] %>%
mutate(
point_a_name = name,
point_b_name = name,
distance = 0,
name_similarity = 1,
adjusted_distance = FALSE,
point_a_wkt = st_as_text(st_geometry(.)),
point_b_wkt = st_as_text(st_geometry(.)),
match_type = "self"
)
# Get distances to all points in B
distances <- st_distance(points_a[point_idx,], points_b)
dist_vector <- units::drop_units(distances[1,])
while(TRUE) {
nearby_indices <- which(dist_vector <= current_max_distance)
if (length(nearby_indices) > 0) {
# Calculate name similarities for nearby points
similarities <- sapply(points_b$name[nearby_indices], function(name_b) {
calculate_name_similarity(points_a$name[point_idx], name_b)
})
# Check if any meet similarity threshold
keep_indices <- which(similarities >= min_name_similarity)
if (length(keep_indices) > 0) {
# Get matching points from points_b with geometry
matched_points <- points_b[nearby_indices[keep_indices],]
# Create candidate matches
candidate_matches <- matched_points %>%
mutate(
point_a_name = points_a$name[point_idx],
point_b_name = name,
distance = dist_vector[nearby_indices[keep_indices]],
name_similarity = similarities[keep_indices],
adjusted_distance = current_max_distance > initial_max_distance,
point_a_wkt = st_as_text(st_geometry(points_a[point_idx,])),
point_b_wkt = st_as_text(st_geometry(matched_points)),
match_type = "candidate"
)
# Add ID columns if specified
if (!is.null(points_a_id_col)) {
self_match$point_a_id <- points_a[[points_a_id_col]][point_idx]
self_match$point_b_id <- points_a[[points_a_id_col]][point_idx]
candidate_matches$point_a_id <- points_a[[points_a_id_col]][point_idx]
candidate_matches$point_b_id <- candidate_matches[[points_b_id_col]]
}
# Combine self-match with candidate matches
result <- bind_rows(self_match, candidate_matches)
return(result)
}
}
current_max_distance <- current_max_distance * 2
if (current_max_distance > 50000) { # 50 km
warning(sprintf("Very large search distance needed for point: %s",
points_a$name[point_idx]))
}
}
}
# Calculate name similarity score (helper function)
calculate_name_similarity <- function(name1, name2) {
name1 <- tolower(name1)
name2 <- tolower(name2)
1 - stringdist(name1, name2, method = "jw")
}
# Get matches for each point in points_a
all_matches <- lapply(1:nrow(points_a), function(i) {
get_point_matches(i, initial_max_distance)
})
# Combine all matches and sort
results <- bind_rows(all_matches) %>%
group_by(point_a_name) %>%
arrange(match_type, distance, desc(name_similarity), .by_group = TRUE) %>%
ungroup()
return(results)
}Apply the matching function to our datasets:
matches <- find_spatial_matches(
rise_dams_sf,
gcnx,
points_a_id_col = "id",
points_b_id_col = "uri",
initial_max_distance = 1000,
min_name_similarity = 0.3
)
# Prepare results for visualization
matches$rise_name <- matches$point_a_name
matches$geoconnex_name <- matches$point_b_name
matches$rise_id <- matches$point_a_id
matches$geoconnex_id <- matches$uri
matches$src <- NA
matches$src[which(matches$match_type=="self")] <- "rise"
matches$src[which(matches$match_type=="candidate")] <- "geoconnex"
# Create final match set
match_set <- matches |>
select(rise_name, rise_id, geoconnex_name, geoconnex_id,
distance, name_similarity, point_a_wkt, point_b_wkt,
mainstem_uri, nhdpv2_reachcode_uri, nhdpv2_reach_measure, src)The map below shows the spatial distribution of matched dams. Points are colored by source: - Yellow points represent RISE dams/reservoirs - Purple points represent matched Geoconnex dams
The table before the map can be sorted, searched, and exported to csv or excel.
mapview::mapview(match_set, zcol="src")# Prepare display table (excluding WKT fields for readability)
display_table <- match_set %>% filter(src == "geoconnex") %>%
select(rise_name, rise_id, geoconnex_name, geoconnex_id, mainstem_uri,
distance, name_similarity) %>%
mutate(
distance = round(distance, 1),
name_similarity = round(name_similarity, 3)
)
# Create interactive table with download buttons
DT::datatable(
display_table,
extensions = c('Buttons', 'SearchBuilder'),
options = list(
dom = 'Bfrtip',
buttons = list(
'copy',
list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download'
)
),
pageLength = 10,
order = list(list(4, 'asc')), # Sort by distance by default
scrollX = TRUE
),
filter = 'top',
caption = htmltools::tags$caption(
style = 'caption-side: bottom; text-align: center;',
'Table: Matched Dams from RISE and Geoconnex Databases'
)
)