Fast spatial data matching in R

How to match locations based on their coordinates

Big Data
Data Manipulation
Spatial Data
R
SQL
Author

Marc-Aurèle Rivière

Published

June 18, 2022

Abstract

This document was prompted by this reddit question on how to fill missing location names by matching them to other known locations by their geographical proximity (using lat/long coordinates). The question is linked to a case study of Google’s Data Analytics certificate.

To answer the question in a nutshell: the proposed matching by position won’t yield anything with this dataset since the locations missing a name or id are missing one because their coordinates lack the precision necessary to be reliably matched to another station by proximity alone.

This document is no longer updated

Please visit this page for a more up-to-date version of this post.

Note

You can check the source code by clicking on the </> Code button at the top-right.

1 Setup


library(here)        # File path management
library(fs)          # File & folder manipulation
library(pipebind)    # Piping goodies

library(data.table)  # Fast data manipulation
library(dplyr)       # Slower (but more readable) data manipulation
library(dtplyr)      # data.table backend for dplyr
library(tidyr)       # Extra convenience functions for data manipulation

library(DBI)         # Database connection
library(dbplyr)      # SQL back-end for dplyr
library(duckdb)      # Quack Stack

library(stringr)     # Manipulating strings
library(purrr)       # Manipulating lists

library(fuzzyjoin)   # Non-equi joins & coordinates-based joins

options(
  dplyr.strict_sql = FALSE,
  scipen = 999L, 
  digits = 4L,
  knitr.max_rows_print = 10
)

data.table::setDTthreads(parallel::detectCores(logical = TRUE))
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.2.1 (2022-06-23)
 os       Ubuntu 20.04.4 LTS
 system   x86_64, linux-gnu
 ui       X11
 language (EN)
 collate  C.UTF-8
 ctype    C.UTF-8
 tz       Europe/Paris
 date     2022-09-24
 pandoc   2.19.2 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
 Quarto   1.1.251

─ Packages ───────────────────────────────────────────────────────────────────
 ! package    * version     date (UTC) lib source
 P data.table * 1.14.3      2022-07-27 [?] Github (Rdatatable/data.table@c4a2085)
 P DBI        * 1.1.3       2022-06-18 [?] CRAN (R 4.2.0)
 P dbplyr     * 2.2.1       2022-06-27 [?] CRAN (R 4.2.0)
   dplyr      * 1.0.99.9000 2022-08-15 [1] Github (tidyverse/dplyr@d8294b4)
 P dtplyr     * 1.2.1       2022-01-19 [?] CRAN (R 4.2.0)
   duckdb     * 0.5.0       2022-09-03 [1] https://duckdb.r-universe.dev (R 4.2.1)
   fs         * 1.5.2       2021-12-08 [1] CRAN (R 4.2.0)
   fuzzyjoin  * 0.1.6       2020-05-15 [1] CRAN (R 4.2.0)
 P here       * 1.0.1       2020-12-13 [?] CRAN (R 4.2.0)
   pipebind   * 0.1.1       2022-08-10 [1] CRAN (R 4.2.0)
 P purrr      * 0.3.4       2020-04-17 [?] CRAN (R 4.2.0)
 P stringr    * 1.4.0       2019-02-10 [2] CRAN (R 4.2.0)
 P tidyr      * 1.2.0       2022-02-01 [?] CRAN (R 4.2.0)

 [1] /home/mar/Dev/Projects/R/Misc/renv/library/R-4.2/x86_64-pc-linux-gnu
 [2] /home/mar/.cache/R/renv/library/Misc-f25fd835/R-4.2/x86_64-pc-linux-gnu
 [3] /usr/lib/R/library
 [4] /usr/local/lib/R/site-library
 [5] /usr/lib/R/site-library

 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────

2 Loading the data


2.1 Wide format (original)

rides <- (purrr::map_dfr(
    fs::dir_ls(data_path, glob = "*.csv"),
    \(file) fread(file, na.strings = "")
  )
)

setkey(rides, ride_id)
Time difference of 17.34 secs
rides_con <- DBI::dbConnect(duckdb::duckdb())
duckdb::duckdb_read_csv(
  rides_con, 
  "rides", 
  fs::dir_ls(here::here("data", "stations"), glob = "*.csv")
)
CREATE INDEX ride_idx ON rides (ride_id)
Time difference of 8.706 secs
data.frame [5,860,776 x 13]
ride_id rideable_type started_at ended_at start_station_name start_station_id end_station_name end_station_id start_lat start_lng end_lat end_lng member_casual
00000123F60251E6 classic_bike 2022-02-07 15:47:40 2022-02-07 15:49:28 Wells St & Hubbard St TA1307000151 Kingsbury St & Kinzie St KA1503000043 41.89 −87.634 41.889 −87.639 member
000002EBE159AE82 electric_bike 2021-06-22 17:25:15 2021-06-22 17:31:34 Clinton St & Jackson Blvd 638 Milwaukee Ave & Grand Ave 13033 41.878 −87.641 41.891 −87.648 member
0000080D43BAA9E4 classic_bike 2021-08-29 15:38:05 2021-08-29 16:24:03 Dearborn St & Van Buren St 624 Federal St & Polk St SL-008 41.876 −87.629 41.872 −87.63 casual
00000B4F1F71F9C2 electric_bike 2021-09-08 16:31:38 2021-09-08 16:37:54 NA NA NA NA 41.91 −87.69 41.91 −87.7 member
00000CAE95438C9D classic_bike 2021-07-20 15:40:46 2021-07-20 17:38:17 Streeter Dr & Grand Ave 13022 Fairbanks Ct & Grand Ave TA1305000003 41.892 −87.612 41.892 −87.621 casual
00000E22FBA89D81 electric_bike 2022-05-19 14:42:55 2022-05-19 14:54:03 NA NA Clark St & Armitage Ave 13146 41.9 −87.62 41.918 −87.636 member
00000EBBC119168C classic_bike 2021-10-31 11:30:37 2021-10-31 11:39:27 Dorchester Ave & 49th St KA1503000069 Kimbark Ave & 53rd St TA1309000037 41.806 −87.592 41.8 −87.595 member
000018B1D040DB44 electric_bike 2022-04-25 10:37:22 2022-04-25 10:44:19 NA NA NA NA 41.79 −87.6 41.79 −87.59 member
000019B7F053D461 classic_bike 2021-08-13 19:57:28 2021-08-13 20:02:56 Larrabee St & Webster Ave 13193 Sheffield Ave & Webster Ave TA1309000033 41.922 −87.644 41.922 −87.654 member
00001B4F79D102B5 classic_bike 2021-07-28 07:58:27 2021-07-28 08:05:00 DuSable Lake Shore Dr & Belmont Ave TA1309000049 Broadway & Waveland Ave 13325 41.941 −87.639 41.949 −87.649 casual
[ omitted 5,860,761 entries ]

2.2 Long format

rides_l <- melt(
  rides,
  measure = measure(way, value.name, pattern = "(end|start).*(name|id|lat|lng)")
)

setkey(rides_l, ride_id, id)
Time difference of 6.138 secs
rides_l.dtp <- (pivot_longer(
    rides,
    matches("^end_|^start_"),
    names_pattern = "(end|start).*(name|id|lat|lng)",
    names_to = c("way", ".value")
  ) 
  |> as.data.table()
)

setkey(rides_l.dtp, ride_id, id)
Time difference of 10.41 secs
CREATE TABLE rides_l AS 
(
  SELECT
    ride_id, rideable_type, started_at, ended_at,member_casual
    , 'start' AS way
    , start_station_name AS name
    , start_station_id AS id
    , start_lat AS lat
    , start_lng AS lng
  FROM rides
)
UNION ALL
(
  SELECT
    ride_id, rideable_type, started_at, ended_at, member_casual
    , 'end' AS way
    , end_station_name AS name
    , end_station_id AS id
    , end_lat AS lat
    , end_lng AS lng
  FROM rides
);

CREATE INDEX station_idx ON rides_l (id);

No need to re-define an index for ride_id here, it was passed down from the table rides

Time difference of 6.508 secs

We can reuse an existinf df and directly add it (or bind it as a view) to the database:

DBI::dbWriteTable(rides_con, "rides_l", rides_l) # As a TABLE

duckdb::duckdb_register(rides_con, "rides_l", rides_l) # As a VIEW
(dplyr::tbl(rides_con, "rides") 
  |> pivot_longer(
    matches("^end_|^start_"),
    names_pattern = "(end|start).*(name|id|lat|lng)", 
    names_to = c("way", ".value")
  )
  |> dplyr::copy_to(
    rides_con, 
    df = _, 
    "rides_l_dbp",
    temporary = FALSE,
    indexes = list("ride_id", "id")
  )
)
Time difference of 14.7 secs
data.frame [11,721,552 x 10]
ride_id rideable_type started_at ended_at member_casual way name id lat lng
99FEC93BA843FB20 electric_bike 2021-06-13 14:31:28 2021-06-13 14:34:11 member start NA NA 41.8 −87.59
06048DCFC8520CAF electric_bike 2021-06-04 11:18:02 2021-06-04 11:24:19 member start NA NA 41.79 −87.59
9598066F68045DF2 electric_bike 2021-06-04 09:49:35 2021-06-04 09:55:34 member start NA NA 41.8 −87.6
B03C0FE48C412214 electric_bike 2021-06-03 19:56:05 2021-06-03 20:21:55 member start NA NA 41.78 −87.58
B9EEA89F8FEE73B7 electric_bike 2021-06-04 14:05:51 2021-06-04 14:09:59 member start NA NA 41.8 −87.59
62B943CEAAA420BA electric_bike 2021-06-03 19:32:01 2021-06-03 19:38:46 member start NA NA 41.78 −87.58
7E2546FBA79C46EE electric_bike 2021-06-10 16:30:10 2021-06-10 16:36:21 member start NA NA 41.79 −87.6
3DDF3BBF6C4C3C89 electric_bike 2021-06-10 17:00:30 2021-06-10 17:06:48 member start NA NA 41.79 −87.59
2608805637155AB6 electric_bike 2021-06-10 12:46:16 2021-06-10 12:55:02 member start NA NA 41.93 −87.67
AF529C946F28ED42 electric_bike 2021-06-23 17:57:29 2021-06-23 18:06:40 member start NA NA 41.88 −87.61
[ omitted 11,721,537 entries ]

3 Exploring the data


3.1 Cleaning useless coordinates

Removing entries were lat/lng do not have sufficient precision to be reliably matched to a station (i.e. entries having less than 4 decimals, which corresponds to a 11 meters “radius” at the equator).

Decimal Distance at the equator (m)
0 111,120
1 11,112
2 1,111.2
3 111.12
4 11.112
5 1.1112
decp <- \(x) str_length(str_remove(as.character(abs(x)), ".*\\.")) >= 4
CREATE FUNCTION decp(x) AS length(str_split(CAST(abs(x) AS VARCHAR(10)), '.')[2]) >= 4
rides_l_clean <- rides_l[decp(lat) & decp(lng), ]

setkey(rides_l_clean, ride_id)
Time difference of 25.32 secs
rides_l_clean.dtp <- (rides_l
  |> filter(decp(lat) & decp(lng))
  |> as.data.table()
)

setkey(rides_l_clean.dtp, ride_id)
Time difference of 26.81 secs
CREATE TABLE rides_l_clean AS 
SELECT * FROM rides_l 
WHERE decp(lat) AND decp(lng)
Time difference of 5.369 secs
Note

Here, dbplyr will leave the decp call as-is in the SQL translation, but since we have previously defined a decp SQL function, this function will get called when the SQL query is executed.

(dplyr::tbl(rides_con, "rides_l") 
  |> filter(if_all(c(lat, lng), \(x) decp(x)))
  |> dplyr::copy_to(
    rides_con, 
    df = _, 
    "rides_l_clean_dbp",
    temporary = FALSE,
    indexes = list("ride_id", "id")
  )
)
Time difference of 16.91 secs
data.table [9,937,077 x 10]
ride_id rideable_type started_at ended_at member_casual way name id lat lng
00000123F60251E6 classic_bike 2022-02-07 15:47:40 2022-02-07 15:49:28 member end Kingsbury St & Kinzie St KA1503000043 41.889 −87.639
00000123F60251E6 classic_bike 2022-02-07 15:47:40 2022-02-07 15:49:28 member start Wells St & Hubbard St TA1307000151 41.89 −87.634
000002EBE159AE82 electric_bike 2021-06-22 17:25:15 2021-06-22 17:31:34 member end Milwaukee Ave & Grand Ave 13033 41.891 −87.648
000002EBE159AE82 electric_bike 2021-06-22 17:25:15 2021-06-22 17:31:34 member start Clinton St & Jackson Blvd 638 41.878 −87.641
0000080D43BAA9E4 classic_bike 2021-08-29 15:38:05 2021-08-29 16:24:03 casual start Dearborn St & Van Buren St 624 41.876 −87.629
0000080D43BAA9E4 classic_bike 2021-08-29 15:38:05 2021-08-29 16:24:03 casual end Federal St & Polk St SL-008 41.872 −87.63
00000CAE95438C9D classic_bike 2021-07-20 15:40:46 2021-07-20 17:38:17 casual start Streeter Dr & Grand Ave 13022 41.892 −87.612
00000CAE95438C9D classic_bike 2021-07-20 15:40:46 2021-07-20 17:38:17 casual end Fairbanks Ct & Grand Ave TA1305000003 41.892 −87.621
00000E22FBA89D81 electric_bike 2022-05-19 14:42:55 2022-05-19 14:54:03 member end Clark St & Armitage Ave 13146 41.918 −87.636
00000EBBC119168C classic_bike 2021-10-31 11:30:37 2021-10-31 11:39:27 member start Dorchester Ave & 49th St KA1503000069 41.806 −87.592
00000EBBC119168C classic_bike 2021-10-31 11:30:37 2021-10-31 11:39:27 member end Kimbark Ave & 53rd St TA1309000037 41.8 −87.595
000019B7F053D461 classic_bike 2021-08-13 19:57:28 2021-08-13 20:02:56 member start Larrabee St & Webster Ave 13193 41.922 −87.644
000019B7F053D461 classic_bike 2021-08-13 19:57:28 2021-08-13 20:02:56 member end Sheffield Ave & Webster Ave TA1309000033 41.922 −87.654
00001B4F79D102B5 classic_bike 2021-07-28 07:58:27 2021-07-28 08:05:00 casual end Broadway & Waveland Ave 13325 41.949 −87.649
00001B4F79D102B5 classic_bike 2021-07-28 07:58:27 2021-07-28 08:05:00 casual start DuSable Lake Shore Dr & Belmont Ave TA1309000049 41.941 −87.639
[ omitted 9,937,062 entries ]

3.2 What’s missing ?

Entries missing one or both coordinates but having an id or name:

rides_l_clean[(is.na(lat) | is.na(lng)) & (!is.na(id) | !is.na(name)), ]
SELECT * FROM rides_l_clean 
WHERE ((lat IS NULL) OR (lng IS NULL) 
  AND (NOT((id IS NULL)) OR NOT((name IS NULL))))
(dplyr::tbl(rides_con, "rides_l_clean")
  |> filter(
    if_any(matches("lat$|lng$"), \(v) is.na(v)) & 
    (!is.na(id) | !is.na(name))
  )
  |> collect()
)
data.frame [0 x 10]
ride_id rideable_type started_at ended_at member_casual way name id lat lng

Entries missing either name or id, but having coordinates:

rides_l_clean[(!is.na(lat) & !is.na(lng)) & (is.na(id) | is.na(name)), ]
rides_l_clean_unk <- (rides_l_clean 
  |> filter(
    if_all(matches("lat$|lng$"), \(v) !is.na(v)) & 
    (is.na(id) | is.na(name))
  )
  |> as.data.table()
)
CREATE TABLE rides_l_clean_unk AS
SELECT * FROM rides_l_clean
WHERE ((id IS NULL) OR (name IS NULL)) 
  AND (NOT((lat IS NULL)) AND NOT((lng IS NULL)))
(dplyr::tbl(rides_con, "rides_l_clean") 
  |> filter(
    if_all(matches("lat$|lng$"), \(v) !is.na(v)) & 
    (is.na(id) | is.na(name))
  )
  |> collect()
)
data.frame [3 x 10]
ride_id rideable_type started_at ended_at member_casual way name id lat lng
176105D1F8A1216B electric_bike 2021-07-18 03:44:22 2021-07-18 04:12:23 casual start NA 13221 41.908 −87.673
DE82A15026BA3056 electric_bike 2021-09-21 18:18:59 2021-09-21 18:21:48 casual start NA 20215 41.648 −87.546
EE197EDA4CF8CFE5 electric_bike 2021-09-22 07:14:42 2021-09-22 07:22:38 casual start NA WL-008 41.867 −87.641

It seems there are only 3 entries missing identification that could be matched based on their coordinates at the level of precision we use (11m / 4 decimals).

Warning

Although, if we look at the original dataset (before filtering the inaccurate coordinates):

rides_l[(!is.na(lat) & !is.na(lng)) & (is.na(id) | is.na(name)), ]
rides_l_unk <- (rides_l
  |> filter(
    if_all(matches("lat$|lng$"), \(v) !is.na(v)) & 
    (is.na(id) | is.na(name))
  )
  |> as.data.table()
)
CREATE TABLE rides_l_unk AS
SELECT * FROM rides_l
WHERE ((id IS NULL) OR (name IS NULL)) 
  AND (NOT((lat IS NULL)) AND NOT((lng IS NULL)))
(dplyr::tbl(rides_con, "rides_l") 
  |> filter(
    if_all(matches("lat$|lng$"), \(v) !is.na(v)) & 
    (is.na(id) | is.na(name))
  )
  |> collect()
)
data.frame [1,696,469 x 10]
ride_id rideable_type started_at ended_at member_casual way name id lat lng
00000B4F1F71F9C2 electric_bike 2021-09-08 16:31:38 2021-09-08 16:37:54 member end NA NA 41.91 −87.7
00000B4F1F71F9C2 electric_bike 2021-09-08 16:31:38 2021-09-08 16:37:54 member start NA NA 41.91 −87.69
00000E22FBA89D81 electric_bike 2022-05-19 14:42:55 2022-05-19 14:54:03 member start NA NA 41.9 −87.62
000018B1D040DB44 electric_bike 2022-04-25 10:37:22 2022-04-25 10:44:19 member end NA NA 41.79 −87.59
000018B1D040DB44 electric_bike 2022-04-25 10:37:22 2022-04-25 10:44:19 member start NA NA 41.79 −87.6
000025C113FEB7B6 electric_bike 2021-10-25 11:15:21 2021-10-25 11:20:21 member end NA NA 41.81 −87.62
000025C113FEB7B6 electric_bike 2021-10-25 11:15:21 2021-10-25 11:20:21 member start NA NA 41.81 −87.62
00002E385DB2888C electric_bike 2022-05-07 16:28:53 2022-05-07 16:40:28 casual end NA NA 41.87 −87.75
00002E385DB2888C electric_bike 2022-05-07 16:28:53 2022-05-07 16:40:28 casual start NA NA 41.89 −87.75
000043B681BFB305 electric_bike 2021-10-14 16:24:41 2021-10-14 16:29:02 casual end NA NA 41.9 −87.7
[ omitted 1,696,454 entries ]

There were 1,696,469 missing stations’ id or name that could have been filled in the original data, but it seems that all of them disappeared when we filtered the coordinates with less than 4 decimals of precision. It would seem that those entries were missing their id/name in the source data because their coordinates were too imprecise to be matched to any station in the first place.

If one were to do the macthing anyway, here’s how:

4 Stations data


4.1 Creating stations data

First, we need to assemble a dataset linking each unique station id (and name) with a set of coordinates (here, we use the average lat & lng)

stations_clean <- ((rides_l_clean
  |> na.omit(cols = c("id", "name"))
  |> dcast(id + name ~ ., fun.aggregate = list(min, max, mean), value.var = c("lat", "lng"))
  |> pipebind::bind(x, setcolorder(x, c("id", "name", str_subset(names(x), "lat_|_lng"))))
  |> unique(by = "id")
  )
)

setkey(stations_clean, id)
CREATE TABLE stations_clean AS 
SELECT DISTINCT on(id)
  id, name
  , MIN(lat) AS lat_min
  , MAX(lat) AS lat_max
  , AVG(lat) AS lat_mean
  , MIN(lng) AS lng_min
  , MAX(lng) AS lng_max
  , AVG(lng) AS lng_mean
FROM rides_l_clean
WHERE (NOT((id IS NULL))) AND (NOT((name IS NULL)))
GROUP BY id, name;
(dplyr::tbl(rides_con, "rides_l_clean")
  |> filter(!is.na(id), !is.na(name))
  |> group_by(id, name)
  |> summarize(across(c(lat, lng), list(min, max, mean), .names = "{.col}_{.fn}"))
  |> ungroup()
  |> distinct(id, .keep_all = TRUE)
  |> arrange(id)
  |> collect()
)
data.frame [693 x 8]
id name lat_min lat_max lat_mean lng_min lng_max lng_mean
13001 Michigan Ave & Washington St 41.813 41.958 41.884 −87.657 −87.59 −87.625
13006 LaSalle St & Washington St 41.872 41.952 41.883 −87.833 −87.619 −87.633
13008 Millennium Park 41.809 41.96 41.881 −87.785 −87.587 −87.624
13011 Canal St & Adams St 41.871 41.902 41.879 −87.657 −87.626 −87.64
13016 St. Clair St & Erie St 41.841 41.931 41.894 −87.744 −87.616 −87.623
13017 Franklin St & Chicago Ave 41.873 41.917 41.897 −87.663 −87.625 −87.636
13021 Clinton St & Lake St 41.876 41.909 41.886 −87.661 −87.617 −87.642
13022 Streeter Dr & Grand Ave 41.772 41.969 41.892 −87.785 −87.586 −87.612
13028 900 W Harrison St 41.836 41.881 41.875 −87.679 −87.607 −87.65
13029 Field Museum 41.81 41.896 41.865 −87.625 −87.588 −87.618
[ omitted 678 entries ]

5 Matching missing id by position


Let’s match the entries of rides_l_clean with stations_clean by proximity:

5.1 Matching on the cleaned data

To save time, let’s only apply the procedure to the entries that actually need to be matched (i.e. the ones having coordinates but missing either name or id).

There are 3 entries from rides_l_clean that could be position-matched to a known station.

matched_clean <- (fuzzyjoin::geo_inner_join(
    as.data.frame(rides_l_clean_unk),
    as.data.frame(stations_clean),
    by = c("lng" = "lng_mean", "lat" = "lat_mean"),
    method = "haversine",
    unit = "km",
    max_dist = 0.011, # 11 meters
    distance_col = "dist"
  ) 
  |> mutate(name = coalesce(name.x, name.y), id = coalesce(id.x, id.y)) 
  |> select(names(rides_l_clean), dist)
  |> arrange(ride_id)
  |> drop_na(ride_id, id, name)
  |> setDT()
)

setkey(matched_clean, ride_id, id)
Time difference of 0.4161 secs

There are 3 entries from rides_l_clean that could be position-matched to a known station.

Creating the haversine distance function:

CREATE FUNCTION haversine(lat1, lng1, lat2, lng2) 
    AS ( 6371 * acos( cos( radians(lat1) ) *
       cos( radians(lat2) ) * cos( radians(lng2) - radians(lng1) ) +
       sin( radians(lat1) ) * sin( radians(lat2) ) ) 
    );

Doing the matching:

CREATE TABLE matched_clean AS
SELECT
  ride_id, rideable_type, started_at, ended_at, member_casual, way  
  , COALESCE(r.name, s.name) AS name
  , COALESCE(r.id, s.id) AS id
  , r.lat, r.lng
  , haversine(s.lat_mean, s.lng_mean, r.lat, r.lng) AS dist
FROM rides_l_clean_unk r, stations_clean s
WHERE dist <= 0.011
Time difference of 0.004525 secs
data.table [3 x 11]
ride_id rideable_type started_at ended_at member_casual way name id lat lng dist
176105D1F8A1216B electric_bike 2021-07-18 03:44:22 2021-07-18 04:12:23 casual start Wood St & Milwaukee Ave 13221 41.908 −87.673 0.002
DE82A15026BA3056 electric_bike 2021-09-21 18:18:59 2021-09-21 18:21:48 casual start Hegewisch Metra Station 20215 41.648 −87.546 0.009
EE197EDA4CF8CFE5 electric_bike 2021-09-22 07:14:42 2021-09-22 07:22:38 casual start Clinton St & Roosevelt Rd WL-008 41.867 −87.641 0.001

And we indeed get three matches !

Note

But those three already had an id, so we could probably have filled their missing name using stations_clean directly, instead of a convoluted proximity-based matching (which is more ressource intensive and less precise).

stations_clean[matched_clean, on = .(id)
             ][, .(id, name.stations = name, name.proximity = i.name, lat, lng)]
data.table [3 x 5]
id name.stations name.proximity lat lng
13221 Wood St & Milwaukee Ave Wood St & Milwaukee Ave 41.908 −87.673
20215 Hegewisch Metra Station Hegewisch Metra Station 41.648 −87.546
WL-008 Clinton St & Roosevelt Rd Clinton St & Roosevelt Rd 41.867 −87.641

At least, we can see that the proximity-based matched name and the one associated to that station in stations_clean are the same, so the proximity-matching method works reasonably well.

5.2 Matching on the original data

What if we did the same procedure on the non-cleaned data (the one with coordinates less precise than our criteria for matching) ?

5.2.1 Unfiltered stations data

First, we need to recompute the stations data from rides_l (i.e. rides data before cleaning):

stations <- ((rides_l
  |> na.omit(cols = c("id", "name"))
  |> dcast(id + name ~ ., fun.aggregate = list(min, max, mean), value.var = c("lat", "lng"))
  |> pipebind::bind(x, setcolorder(x, c("id", "name", str_subset(names(x), "lat_|_lng"))))
  |> unique(by = "id")
  )
)

setkey(stations, id, name)
CREATE TABLE stations AS 
SELECT DISTINCT on(id)
  id, name
  , MIN(lat) AS lat_min
  , MAX(lat) AS lat_max
  , AVG(lat) AS lat_mean
  , MIN(lng) AS lng_min
  , MAX(lng) AS lng_max
  , AVG(lng) AS lng_mean
FROM rides_l
WHERE (NOT((id IS NULL))) AND (NOT((name IS NULL)))
GROUP BY id, name;
data.frame [1,078 x 8]
id name lat_min lat_max lat_mean lng_min lng_max lng_mean
021320 MTV Hubbard St 41.89 41.89 41.89 −87.68 −87.68 −87.68
13001 Michigan Ave & Washington St 41.813 41.958 41.884 −87.657 −87.59 −87.625
13006 LaSalle St & Washington St 41.872 41.952 41.883 −87.833 −87.619 −87.633
13008 Millennium Park 41.809 41.96 41.881 −87.785 −87.587 −87.624
13011 Canal St & Adams St 41.871 41.902 41.879 −87.657 −87.626 −87.64
13016 St. Clair St & Erie St 41.841 41.931 41.894 −87.744 −87.616 −87.623
13017 Franklin St & Chicago Ave 41.873 41.917 41.897 −87.663 −87.625 −87.636
13021 Clinton St & Lake St 41.876 41.909 41.886 −87.661 −87.617 −87.642
13022 Streeter Dr & Grand Ave 41.772 41.969 41.892 −87.785 −87.586 −87.612
13028 900 W Harrison St 41.836 41.881 41.875 −87.679 −87.607 −87.65
[ omitted 1,063 entries ]

Cleaning the results:

Notice we get a lot more entries in our stations: 1078 entries vs 693 entries in the filtered version.

Which entries are in stations but not in stations_clean ?

(stations_diff <- stations[!stations_clean, on = .(id, name)])
data.table [417 x 8]
id name lat_min lat_max lat_mean lng_min lng_max lng_mean
021320 MTV Hubbard St 41.89 41.89 41.89 −87.68 −87.68 −87.68
20.0 Damen Ave & Wabansia Ave 41.91 41.91 41.91 −87.68 −87.68 −87.68
20126 S Aberdeen St & W 106th St 41.7 41.7 41.7 −87.65 −87.65 −87.65
20128 S Wentworth Ave & W 111th St 41.69 41.69 41.69 −87.63 −87.63 −87.63
20133 Woodlawn & 103rd - Olive Harvey Vaccination Site 41.71 41.71 41.71 −87.59 −87.59 −87.59
20134 Maryland Ave & 104th St 41.71 41.71 41.71 −87.6 −87.6 −87.6
20201 Kedzie Ave & 104th St 41.7 41.7 41.7 −87.7 −87.7 −87.7
20202 W 103rd St & S Avers Ave 41.71 41.71 41.71 −87.72 −87.72 −87.72
20209 S Michigan Ave & E 118th St 41.68 41.68 41.68 −87.62 −87.62 −87.62
20220 Avenue L & 114th St 41.69 41.69 41.69 −87.54 −87.54 −87.54
20240 Indiana Ave & 133rd St 41.65 41.65 41.65 −87.62 −87.62 −87.62
20241 Steelworkers Park 41.74 41.74 41.74 −87.53 −87.53 −87.53
20246.0 N Green St & W Lake St 41.89 41.89 41.89 −87.65 −87.65 −87.65
20247.0 W Washington Blvd & N Peoria St 41.88 41.88 41.88 −87.65 −87.65 −87.65
202480.0 Hampden Ct & Diversey Ave 41.93 41.93 41.93 −87.64 −87.64 −87.64
[ omitted 402 entries ]

And which of those 417 entries have the necessary coordinate precision to be used later on to match against the unknown stations ?

stations_diff[stations_diff[, Reduce(`&`, lapply(.SD, decp)), .SDcols = patterns("^lat|^lng")]]
data.table [0 x 8]
id name lat_min lat_max lat_mean lng_min lng_max lng_mean

As expected, none. But we’re still going to do the matching, for posterity !

5.2.2 Position-matching on stations

To save time, let’s only apply the procedure to the entries that actually need to be matched (i.e. the ones having coordinates but missing either name or id):

There are 1,696,469 entries from rides_l that could be position-matched to a known station.

matched <- (fuzzyjoin::geo_inner_join(
    as.data.frame(rides_l_unk),
    as.data.frame(stations),
    by = c("lng" = "lng_mean", "lat" = "lat_mean"),
    method = "haversine",
    unit = "km",
    max_dist = 0.011, # 11 meters
    distance_col = "dist"
  ) 
  |> mutate(name = coalesce(name.x, name.y), id = coalesce(id.x, id.y)) 
  |> select(names(rides_l), dist)
  |> arrange(ride_id)
  |> drop_na(ride_id, id, name)
  |> setDT()
)

setkey(matched, ride_id, id)
Time difference of 2.143 secs

There are 1,696,469 entries from rides_l that could be position-matched to a known station.

SELECT
  ride_id, rideable_type, started_at, ended_at, member_casual, way  
  , COALESCE(r.name, s.name) AS name
  , COALESCE(r.id, s.id) AS id
  , r.lat, r.lng
  , haversine(s.lat_mean, s.lng_mean, r.lat, r.lng) AS dist
FROM rides_l_unk r, stations s
WHERE dist <= 0.011
Time difference of 9.838 secs
data.table [939,829 x 11]
ride_id rideable_type started_at ended_at member_casual way name id lat lng dist
00000B4F1F71F9C2 electric_bike 2021-09-08 16:31:38 2021-09-08 16:37:54 member end Francisco Ave & Bloomingdale Ave 429 41.91 −87.7 0
000025C113FEB7B6 electric_bike 2021-10-25 11:15:21 2021-10-25 11:20:21 member start Prairie Ave & 47th St - midblock 814 41.81 −87.62 0
000025C113FEB7B6 electric_bike 2021-10-25 11:15:21 2021-10-25 11:20:21 member end Prairie Ave & 47th St - midblock 814 41.81 −87.62 0
000025C113FEB7B6 electric_bike 2021-10-25 11:15:21 2021-10-25 11:20:21 member start Martin Luther King Dr & 44th St 914 41.81 −87.62 0
000025C113FEB7B6 electric_bike 2021-10-25 11:15:21 2021-10-25 11:20:21 member end Martin Luther King Dr & 44th St 914 41.81 −87.62 0
000043B681BFB305 electric_bike 2021-10-14 16:24:41 2021-10-14 16:29:02 casual end California Ave & Cortez St 512 41.9 −87.7 0
00009A0299026096 electric_bike 2021-07-04 17:10:35 2021-07-04 17:21:29 casual start Lake Park Ave & 44th St 787 41.81 −87.6 0
00009F76371B5B23 electric_bike 2021-12-12 14:10:54 2021-12-12 14:42:44 member end Damen Ave & Wabansia Ave 20.0 41.91 −87.68 0
00009FC70913FC9F electric_bike 2021-11-09 15:02:41 2021-11-09 15:07:55 casual end Lincoln Ave & Balmoral Ave 442 41.98 −87.69 0
0000D2CE5AF46802 electric_bike 2021-10-21 17:41:43 2021-10-21 17:58:18 member end Elston Ave & George St 472 41.93 −87.69 0
0000D8A70B8D59B7 electric_bike 2021-06-23 18:28:49 2021-06-23 19:08:40 casual start Parnell Ave & 103rd St 605 41.71 −87.64 0
0000D8A70B8D59B7 electric_bike 2021-06-23 18:28:49 2021-06-23 19:08:40 casual end Parnell Ave & 103rd St 605 41.71 −87.64 0
0000D8A70B8D59B7 electric_bike 2021-06-23 18:28:49 2021-06-23 19:08:40 casual start Halsted St & 102nd St 623 41.71 −87.64 0
0000D8A70B8D59B7 electric_bike 2021-06-23 18:28:49 2021-06-23 19:08:40 casual end Halsted St & 102nd St 623 41.71 −87.64 0
0000D8A70B8D59B7 electric_bike 2021-06-23 18:28:49 2021-06-23 19:08:40 casual start Green St & 103rd St 878 41.71 −87.64 0
[ omitted 939,814 entries ]
Note

Notice how fast the procedure is, with close to 1 million matches (even if the results are mostly garbage).

What’s inside those matches ?

matched[, .(`Number of matches for an entry` = .N), by = .(ride_id, way)
      ][, .(`Number of times it happens` = .N), by = `Number of matches for an entry`]
data.table [5 x 2]
Number of matches for an entry Number of times it happens
1 437 864
2 130 830
3 51 620
4 19 615
5 1 397

We can see that more than half of the matches are coordinates that matched 2 or more stations, which we should definitely not keep.

But, among the ones with only one match, how many have coordinates precise enough to make that match in the first place (i.e. have 4 or more decimals or precision) ?

matched[, if(.N == 1) .SD, by = .(ride_id, way)][decp(lat) & decp(lng)]
data.table [3 x 11]
ride_id way rideable_type started_at ended_at member_casual name id lat lng dist
176105D1F8A1216B start electric_bike 2021-07-18 03:44:22 2021-07-18 04:12:23 casual Wood St & Milwaukee Ave 13221 41.908 −87.673 0.002
DE82A15026BA3056 start electric_bike 2021-09-21 18:18:59 2021-09-21 18:21:48 casual Hegewisch Metra Station 20215 41.648 −87.546 0.009
EE197EDA4CF8CFE5 start electric_bike 2021-09-22 07:14:42 2021-09-22 07:22:38 casual Clinton St & Roosevelt Rd WL-008 41.867 −87.641 0.001

As it turns out ? Only 3. And those are the same three matches we got from the filtered data.

In the end, those three are the only three position-based matches we should reasonably keep !

6 Updating the original dataset


Finally, we need to update the original dataset (rides_l_clean) with the entries that were position-matched (matched_clean):

6.1 Merging the two datasets

matched_clean[rides_l_clean, on = setdiff(names(rides_l_clean), c("id", "name"))
            ][, `:=`(name = fcoalesce(name, i.name), id = fcoalesce(id, i.id))
            ][, nms, env = list(nms = as.list(names(rides_l_clean)))] -> rides_l_merged

# setkey(rides_l_merged, ride_id, id)
Time difference of 7.555 secs
rides_l_merged.dtp <- (dplyr::right_join(
    matched_clean,
    rides_l_clean,
    by = setdiff(names(rides_l_clean), c("id", "name"))
  ) 
  |> mutate(name = coalesce(name.x, name.y), id = coalesce(id.x, id.y))
  |> select(-matches("\\.x|\\.y"), -dist)
  |> collect()
)

# setkey(rides_l_merged.dtp, ride_id, id)
Time difference of 9.505 secs
Tip

dplyr has the neat rows_* series of functions that can easily replace or patch (i.e. only replace missing values) the content of one dataset by another, when the rows match, which is quite fast !

rides_l_merged.dp <- (dplyr::rows_patch(
    rides_l_clean,
    matched_clean[, -"dist"],
    by = setdiff(names(rides_l_clean), c("name", "id")),
    unmatched = "ignore"
  )
)
Time difference of 3.587 secs
CREATE TABLE rides_l_merged AS
SELECT 
  ride_id, rideable_type, started_at, ended_at, member_casual, way,
  COALESCE(id_x, id_y) AS id,
  COALESCE(name_x, name_y) AS name,
  lat, lng
FROM (
  SELECT
    r.ride_id AS ride_id,
    r.rideable_type AS rideable_type,
    r.started_at AS started_at,
    r.ended_at AS ended_at,
    r.member_casual AS member_casual,
    r.way AS way,
    m.name AS name_x,
    m.id AS id_x,
    r.lat AS lat,
    r.lng AS lng,
    r.name AS name_y,
    r.id AS id_y
  FROM matched_clean AS m
  RIGHT JOIN rides_l_clean AS r
  ON m.ride_id = r.ride_id 
     AND m.rideable_type = r.rideable_type 
     AND m.started_at = r.started_at
     AND m.ended_at = r.ended_at
     AND m.member_casual = r.member_casual
     AND m.way = r.way
);
Time difference of 4.306 secs
data.table [9,937,077 x 10]
ride_id rideable_type started_at ended_at member_casual way name id lat lng
00000123F60251E6 classic_bike 2022-02-07 15:47:40 2022-02-07 15:49:28 member end Kingsbury St & Kinzie St KA1503000043 41.889 −87.639
00000123F60251E6 classic_bike 2022-02-07 15:47:40 2022-02-07 15:49:28 member start Wells St & Hubbard St TA1307000151 41.89 −87.634
000002EBE159AE82 electric_bike 2021-06-22 17:25:15 2021-06-22 17:31:34 member end Milwaukee Ave & Grand Ave 13033 41.891 −87.648
000002EBE159AE82 electric_bike 2021-06-22 17:25:15 2021-06-22 17:31:34 member start Clinton St & Jackson Blvd 638 41.878 −87.641
0000080D43BAA9E4 classic_bike 2021-08-29 15:38:05 2021-08-29 16:24:03 casual start Dearborn St & Van Buren St 624 41.876 −87.629
0000080D43BAA9E4 classic_bike 2021-08-29 15:38:05 2021-08-29 16:24:03 casual end Federal St & Polk St SL-008 41.872 −87.63
00000CAE95438C9D classic_bike 2021-07-20 15:40:46 2021-07-20 17:38:17 casual start Streeter Dr & Grand Ave 13022 41.892 −87.612
00000CAE95438C9D classic_bike 2021-07-20 15:40:46 2021-07-20 17:38:17 casual end Fairbanks Ct & Grand Ave TA1305000003 41.892 −87.621
00000E22FBA89D81 electric_bike 2022-05-19 14:42:55 2022-05-19 14:54:03 member end Clark St & Armitage Ave 13146 41.918 −87.636
00000EBBC119168C classic_bike 2021-10-31 11:30:37 2021-10-31 11:39:27 member start Dorchester Ave & 49th St KA1503000069 41.806 −87.592
00000EBBC119168C classic_bike 2021-10-31 11:30:37 2021-10-31 11:39:27 member end Kimbark Ave & 53rd St TA1309000037 41.8 −87.595
000019B7F053D461 classic_bike 2021-08-13 19:57:28 2021-08-13 20:02:56 member start Larrabee St & Webster Ave 13193 41.922 −87.644
000019B7F053D461 classic_bike 2021-08-13 19:57:28 2021-08-13 20:02:56 member end Sheffield Ave & Webster Ave TA1309000033 41.922 −87.654
00001B4F79D102B5 classic_bike 2021-07-28 07:58:27 2021-07-28 08:05:00 casual end Broadway & Waveland Ave 13325 41.949 −87.649
00001B4F79D102B5 classic_bike 2021-07-28 07:58:27 2021-07-28 08:05:00 casual start DuSable Lake Shore Dr & Belmont Ave TA1309000049 41.941 −87.639
[ omitted 9,937,062 entries ]

6.2 Validating the merge:

rides_l_merged[(is.na(id) | is.na(name)) & (!is.na(lat) & !is.na(lng))]
data.table [0 x 10]
ride_id rideable_type started_at ended_at member_casual way name id lat lng

We can see that the resulting dataset no longer has any entries that have coordinates but miss a name or an id, whereas there were three before. We have successfully updated them !

6.3 Pivoting back to the original (wide) format

To finish, let’s pivot the resulting data back into the wider format it was originally in:

rides_merged <- dcast(
  rides_l_merged, 
  ... ~ way, 
  value.var = c("name", "id", "lat", "lng"), sep = "_station_"
)
Time difference of 9.425 secs
rides_merged.dtp <- (rides_l_merged 
  |> pivot_wider(
    names_from = "way", 
    values_from = c("name", "id", "lat", "lng"), 
    names_glue = "{way}_station_{.value}"
  )
  |> collect()
)
Time difference of 10.02 secs
CREATE TABLE rides_merged AS
SELECT
  ride_id, rideable_type, started_at, ended_at, member_casual,
  MAX(CASE WHEN (way = 'start') THEN name END) AS start_station_name,
  MAX(CASE WHEN (way = 'end') THEN name END) AS end_station_name,
  MAX(CASE WHEN (way = 'start') THEN id END) AS start_station_id,
  MAX(CASE WHEN (way = 'end') THEN id END) AS end_station_id,
  MAX(CASE WHEN (way = 'start') THEN lat END) AS start_lat,
  MAX(CASE WHEN (way = 'end') THEN lat END) AS end_lat,
  MAX(CASE WHEN (way = 'start') THEN lng END) AS start_lng,
  MAX(CASE WHEN (way = 'end') THEN lng END) AS end_lng
FROM rides_l_merged
GROUP BY ride_id, rideable_type, started_at, ended_at, member_casual
Time difference of 6.463 secs
data.table [5,316,218 x 13]
ride_id rideable_type started_at ended_at member_casual name_station_end name_station_start id_station_end id_station_start lat_station_end lat_station_start lng_station_end lng_station_start
00000123F60251E6 classic_bike 2022-02-07 15:47:40 2022-02-07 15:49:28 member Kingsbury St & Kinzie St Wells St & Hubbard St KA1503000043 TA1307000151 41.889 41.89 −87.639 −87.634
000002EBE159AE82 electric_bike 2021-06-22 17:25:15 2021-06-22 17:31:34 member Milwaukee Ave & Grand Ave Clinton St & Jackson Blvd 13033 638 41.891 41.878 −87.648 −87.641
0000080D43BAA9E4 classic_bike 2021-08-29 15:38:05 2021-08-29 16:24:03 casual Federal St & Polk St Dearborn St & Van Buren St SL-008 624 41.872 41.876 −87.63 −87.629
00000CAE95438C9D classic_bike 2021-07-20 15:40:46 2021-07-20 17:38:17 casual Fairbanks Ct & Grand Ave Streeter Dr & Grand Ave TA1305000003 13022 41.892 41.892 −87.621 −87.612
00000E22FBA89D81 electric_bike 2022-05-19 14:42:55 2022-05-19 14:54:03 member Clark St & Armitage Ave NA 13146 NA 41.918 NA −87.636 NA
00000EBBC119168C classic_bike 2021-10-31 11:30:37 2021-10-31 11:39:27 member Kimbark Ave & 53rd St Dorchester Ave & 49th St TA1309000037 KA1503000069 41.8 41.806 −87.595 −87.592
000019B7F053D461 classic_bike 2021-08-13 19:57:28 2021-08-13 20:02:56 member Sheffield Ave & Webster Ave Larrabee St & Webster Ave TA1309000033 13193 41.922 41.922 −87.654 −87.644
00001B4F79D102B5 classic_bike 2021-07-28 07:58:27 2021-07-28 08:05:00 casual Broadway & Waveland Ave DuSable Lake Shore Dr & Belmont Ave 13325 TA1309000049 41.949 41.941 −87.649 −87.639
00001BEE76AB24E0 electric_bike 2021-11-30 16:55:38 2021-11-30 17:08:53 member Ashland Ave & Division St Daley Center Plaza 13061 TA1306000010 41.903 41.884 −87.668 −87.629
00001DCF2BC423F4 docked_bike 2021-06-13 12:00:49 2021-06-13 12:29:51 casual Fort Dearborn Dr & 31st St Millennium Park TA1307000048 13008 41.839 41.881 −87.608 −87.624
000020C92AA9D6F7 classic_bike 2021-09-12 09:53:00 2021-09-12 10:12:52 casual Dusable Harbor Clark St & North Ave KA1503000064 13128 41.887 41.912 −87.613 −87.632
0000228A4B430869 electric_bike 2021-10-18 10:42:20 2021-10-18 10:47:58 member Calumet Ave & 18th St MLK Jr Dr & 29th St 13102 TA1307000139 41.857 41.842 −87.619 −87.617
000022C3D3CE7DD5 classic_bike 2022-04-30 09:57:39 2022-04-30 10:03:12 casual Sheffield Ave & Willow St Halsted St & Clybourn Ave TA1306000032 331 41.914 41.91 −87.653 −87.648
0000278F02EFFEF9 classic_bike 2021-09-18 16:09:39 2021-09-18 16:32:06 member Michigan Ave & Washington St Burnham Harbor 13001 15545 41.884 41.856 −87.625 −87.613
000027C557F9372D docked_bike 2022-05-13 11:01:17 2022-05-13 11:09:05 casual Lincoln Ave & Roscoe St* Ashland Ave & Belle Plaine Ave chargingstx5 13249 41.943 41.956 −87.671 −87.669
[ omitted 5,316,203 entries ]
Note

We get less than the original (wide format) ~6 millions entries due to having removed (filtered) the entries with bad coordinates.