Code
library(sf) # manipulate spatial data
library(dplyr) # perform data transformations
library(stringdist) # create name similarity
library(DT) # interactive tables
This analysis matches dam locations between two authoritative sources:
The matching process considers both spatial proximity and name similarity to establish relationships between these datasets.
First, we load the required R packages for spatial analysis, data transformation, and string matching:
library(sf) # manipulate spatial data
library(dplyr) # perform data transformations
library(stringdist) # create name similarity
library(DT) # interactive tables
We define a helper function to convert coordinate data into spatial features:
<- function(df) {
convert_to_sf # Check if coordinates column exists
if (!"coordinates" %in% names(df)) {
stop("Input data frame must have a 'coordinates' column")
}
# Extract coordinates into a matrix more safely
# First check if coordinates are already a matrix
if (is.matrix(df$coordinates)) {
<- df$coordinates
coords_matrix else {
} # Try to convert from list format
<- t(sapply(df$coordinates, function(x) {
coords_matrix if (is.numeric(x)) return(x)
# If it's character, try to parse it
as.numeric(strsplit(gsub("[][]", "", x), "\\s+")[[1]])
}))
}
# Create sf object
<- st_as_sf(
sf_points
df,coords = coords_matrix,
crs = 4326, # Assuming WGS84
agr = "constant"
%>%
) # Remove the original coordinates column as it's now in geometry
select(-coordinates)
return(sf_points)
}
We’ll load data from both Geoconnex and RISE APIs:
# Load geoconnex reference dams from the reference feature service
<- sf::read_sf("https://reference.geoconnex.us/collections/dams/items?limit=1000000")
gcnx
# Load all locations from RISE (paginated API)
<- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=1&itemsPerPage=100&locationTypeId=3")$data
rise1 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=2&itemsPerPage=100&locationTypeId=3")$data
rise2 <- jsonlite::fromJSON("https://data.usbr.gov/rise/api/location?page=3&itemsPerPage=100&locationTypeId=3")$data
rise3 <- bind_rows(rise1,rise2,rise3)
rise rm(rise1,rise2,rise3)
Filter and prepare RISE data to include only dam and reservoir locations:
<- rise |>
rise_dams filter(grepl("Dam", attributes$locationName, ignore.case = TRUE) |
grepl("Reservoir", attributes$locationName, ignore.case = TRUE) |
grepl("Dam", attributes$locationDescription, ignore.case = TRUE))
<- cbind(rise_dams$id,
rise_dams $attributes$locationName,
rise_dams$attributes$locationDescription,
rise_dams$attributes$locationGeometry,
rise_dams$attributes$locationCoordinates)
rise_dams
<- rise_dams |>
rise_dams rename(id = `rise_dams$id`,
name = `rise_dams$attributes$locationName`,
description = `rise_dams$attributes$locationDescription`)
<- rise_dams |> filter(locationGeometry == 'point')
rise_dams
# Extract coordinates
for (i in 1:length(rise_dams$id)) {
$lon[i] <- rise_dams$coordinates[[i]][1]
rise_dams$lat[i] <- rise_dams$coordinates[[i]][2]
rise_dams
}
<- st_as_sf(rise_dams, coords = c("lon","lat"), crs=4326)
rise_dams_sf <- rise_dams_sf |> select(-coordinates) rise_dams_sf
Define a function to find matches between spatial datasets based on location and name similarity:
library(sf)
library(dplyr)
library(stringdist)
<- function(points_a, points_b, initial_max_distance = 1000,
find_spatial_matches min_name_similarity = 0.3,
points_a_id_col = NULL,
points_b_id_col = NULL) {
# Validate ID columns if specified
if (!is.null(points_a_id_col) && !(points_a_id_col %in% names(points_a))) {
stop(sprintf("ID column '%s' not found in points_a dataset", points_a_id_col))
}if (!is.null(points_b_id_col) && !(points_b_id_col %in% names(points_b))) {
stop(sprintf("ID column '%s' not found in points_b dataset", points_b_id_col))
}
# Function to get matches for a single point with adaptive distance
<- function(point_idx, current_max_distance) {
get_point_matches # Create self-match row first
<- points_a[point_idx,] %>%
self_match mutate(
point_a_name = name,
point_b_name = name,
distance = 0,
name_similarity = 1,
adjusted_distance = FALSE,
point_a_wkt = st_as_text(st_geometry(.)),
point_b_wkt = st_as_text(st_geometry(.)),
match_type = "self"
)
# Get distances to all points in B
<- st_distance(points_a[point_idx,], points_b)
distances <- units::drop_units(distances[1,])
dist_vector
while(TRUE) {
<- which(dist_vector <= current_max_distance)
nearby_indices
if (length(nearby_indices) > 0) {
# Calculate name similarities for nearby points
<- sapply(points_b$name[nearby_indices], function(name_b) {
similarities calculate_name_similarity(points_a$name[point_idx], name_b)
})
# Check if any meet similarity threshold
<- which(similarities >= min_name_similarity)
keep_indices
if (length(keep_indices) > 0) {
# Get matching points from points_b with geometry
<- points_b[nearby_indices[keep_indices],]
matched_points
# Create candidate matches
<- matched_points %>%
candidate_matches mutate(
point_a_name = points_a$name[point_idx],
point_b_name = name,
distance = dist_vector[nearby_indices[keep_indices]],
name_similarity = similarities[keep_indices],
adjusted_distance = current_max_distance > initial_max_distance,
point_a_wkt = st_as_text(st_geometry(points_a[point_idx,])),
point_b_wkt = st_as_text(st_geometry(matched_points)),
match_type = "candidate"
)
# Add ID columns if specified
if (!is.null(points_a_id_col)) {
$point_a_id <- points_a[[points_a_id_col]][point_idx]
self_match$point_b_id <- points_a[[points_a_id_col]][point_idx]
self_match$point_a_id <- points_a[[points_a_id_col]][point_idx]
candidate_matches$point_b_id <- candidate_matches[[points_b_id_col]]
candidate_matches
}
# Combine self-match with candidate matches
<- bind_rows(self_match, candidate_matches)
result return(result)
}
}
<- current_max_distance * 2
current_max_distance
if (current_max_distance > 50000) { # 50 km
warning(sprintf("Very large search distance needed for point: %s",
$name[point_idx]))
points_a
}
}
}
# Calculate name similarity score (helper function)
<- function(name1, name2) {
calculate_name_similarity <- tolower(name1)
name1 <- tolower(name2)
name2 1 - stringdist(name1, name2, method = "jw")
}
# Get matches for each point in points_a
<- lapply(1:nrow(points_a), function(i) {
all_matches get_point_matches(i, initial_max_distance)
})
# Combine all matches and sort
<- bind_rows(all_matches) %>%
results group_by(point_a_name) %>%
arrange(match_type, distance, desc(name_similarity), .by_group = TRUE) %>%
ungroup()
return(results)
}
Apply the matching function to our datasets:
<- find_spatial_matches(
matches
rise_dams_sf,
gcnx,points_a_id_col = "id",
points_b_id_col = "uri",
initial_max_distance = 1000,
min_name_similarity = 0.3
)
# Prepare results for visualization
$rise_name <- matches$point_a_name
matches$geoconnex_name <- matches$point_b_name
matches$rise_id <- matches$point_a_id
matches$geoconnex_id <- matches$uri
matches$src <- NA
matches$src[which(matches$match_type=="self")] <- "rise"
matches$src[which(matches$match_type=="candidate")] <- "geoconnex"
matches
# Create final match set
<- matches |>
match_set select(rise_name, rise_id, geoconnex_name, geoconnex_id,
distance, name_similarity, point_a_wkt, point_b_wkt, mainstem_uri, nhdpv2_reachcode_uri, nhdpv2_reach_measure, src)
The map below shows the spatial distribution of matched dams. Points are colored by source: - Yellow points represent RISE dams/reservoirs - Purple points represent matched Geoconnex dams
The table before the map can be sorted, searched, and exported to csv or excel.
::mapview(match_set, zcol="src") mapview
# Prepare display table (excluding WKT fields for readability)
<- match_set %>% filter(src == "geoconnex") %>%
display_table select(rise_name, rise_id, geoconnex_name, geoconnex_id, mainstem_uri,
%>%
distance, name_similarity) mutate(
distance = round(distance, 1),
name_similarity = round(name_similarity, 3)
)
# Create interactive table with download buttons
::datatable(
DT
display_table,extensions = c('Buttons', 'SearchBuilder'),
options = list(
dom = 'Bfrtip',
buttons = list(
'copy',
list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download'
)
),pageLength = 10,
order = list(list(4, 'asc')), # Sort by distance by default
scrollX = TRUE
),filter = 'top',
caption = htmltools::tags$caption(
style = 'caption-side: bottom; text-align: center;',
'Table: Matched Dams from RISE and Geoconnex Databases'
) )