Fast spatial data matching in R
How to match locations based on their coordinates
This document was prompted by this reddit question on how to fill in missing location names by matching them to other known locations based on geographical proximity (using lat/long coordinates). The question stems from a case study in Google’s Data Analytics certificate.
To answer the question in a nutshell: the proposed position-based matching won’t yield anything with this dataset, since the locations missing a name or id lack them precisely because their coordinates are not precise enough to be reliably matched to another station by proximity alone.
Please visit this page for a more up-to-date version of this post.
You can check the source code by clicking on the </> Code button at the top-right.
1 Setup
library(here) # File path management
library(fs) # File & folder manipulation
library(pipebind) # Piping goodies
library(data.table) # Fast data manipulation
library(dplyr) # Slower (but more readable) data manipulation
library(dtplyr) # data.table backend for dplyr
library(tidyr) # Extra convenience functions for data manipulation
library(DBI) # Database connection
library(dbplyr) # SQL back-end for dplyr
library(duckdb) # Quack Stack
library(stringr) # Manipulating strings
library(purrr) # Manipulating lists
library(fuzzyjoin) # Non-equi joins & coordinates-based joins
options(
dplyr.strict_sql = FALSE,
scipen = 999L,
digits = 4L,
knitr.max_rows_print = 10
)
data.table::setDTthreads(parallel::detectCores(logical = TRUE))

─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.2.1 (2022-06-23)
os Ubuntu 20.04.4 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate C.UTF-8
ctype C.UTF-8
tz Europe/Paris
date 2022-09-24
pandoc 2.19.2 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
Quarto 1.1.251
─ Packages ───────────────────────────────────────────────────────────────────
! package * version date (UTC) lib source
P data.table * 1.14.3 2022-07-27 [?] Github (Rdatatable/data.table@c4a2085)
P DBI * 1.1.3 2022-06-18 [?] CRAN (R 4.2.0)
P dbplyr * 2.2.1 2022-06-27 [?] CRAN (R 4.2.0)
dplyr * 1.0.99.9000 2022-08-15 [1] Github (tidyverse/dplyr@d8294b4)
P dtplyr * 1.2.1 2022-01-19 [?] CRAN (R 4.2.0)
duckdb * 0.5.0 2022-09-03 [1] https://duckdb.r-universe.dev (R 4.2.1)
fs * 1.5.2 2021-12-08 [1] CRAN (R 4.2.0)
fuzzyjoin * 0.1.6 2020-05-15 [1] CRAN (R 4.2.0)
P here * 1.0.1 2020-12-13 [?] CRAN (R 4.2.0)
pipebind * 0.1.1 2022-08-10 [1] CRAN (R 4.2.0)
P purrr * 0.3.4 2020-04-17 [?] CRAN (R 4.2.0)
P stringr * 1.4.0 2019-02-10 [2] CRAN (R 4.2.0)
P tidyr * 1.2.0 2022-02-01 [?] CRAN (R 4.2.0)
[1] /home/mar/Dev/Projects/R/Misc/renv/library/R-4.2/x86_64-pc-linux-gnu
[2] /home/mar/.cache/R/renv/library/Misc-f25fd835/R-4.2/x86_64-pc-linux-gnu
[3] /usr/lib/R/library
[4] /usr/local/lib/R/site-library
[5] /usr/lib/R/site-library
P ── Loaded and on-disk path mismatch.
──────────────────────────────────────────────────────────────────────────────
2 Loading the data
2.1 Wide format (original)
rides <- (purrr::map_dfr(
    fs::dir_ls(data_path, glob = "*.csv"), # `data_path` points to the folder holding the raw CSV files
    \(file) fread(file, na.strings = "")
  )
)
setkey(rides, ride_id)

Time difference of 17.34 secs
data.frame [5,860,776 x 13]
| [ omitted 5,860,761 entries ] |
2.2 Long format
rides_l <- melt(
rides,
measure = measure(way, value.name, pattern = "(end|start).*(name|id|lat|lng)")
)
setkey(rides_l, ride_id, id)

Time difference of 6.138 secs
rides_l.dtp <- (pivot_longer(
rides,
matches("^end_|^start_"),
names_pattern = "(end|start).*(name|id|lat|lng)",
names_to = c("way", ".value")
)
|> as.data.table()
)
setkey(rides_l.dtp, ride_id, id)

Time difference of 10.41 secs
CREATE TABLE rides_l AS
(
SELECT
ride_id, rideable_type, started_at, ended_at, member_casual
, 'start' AS way
, start_station_name AS name
, start_station_id AS id
, start_lat AS lat
, start_lng AS lng
FROM rides
)
UNION ALL
(
SELECT
ride_id, rideable_type, started_at, ended_at, member_casual
, 'end' AS way
, end_station_name AS name
, end_station_id AS id
, end_lat AS lat
, end_lng AS lng
FROM rides
);
CREATE INDEX station_idx ON rides_l (id);

No need to re-define an index for ride_id here: it was passed down from the rides table.
Time difference of 6.508 secs
We can reuse an existing data frame and directly add it (or bind it as a view) to the database:
DBI::dbWriteTable(rides_con, "rides_l", rides_l) # As a TABLE
duckdb::duckdb_register(rides_con, "rides_l", rides_l) # As a VIEW

data.frame [11,721,552 x 10]
| [ omitted 11,721,537 entries ] |
3 Exploring the data
3.1 Cleaning useless coordinates
Removing entries where lat/lng do not have sufficient precision to be reliably matched to a station (i.e. entries with fewer than 4 decimals, which corresponds to an 11 meter “radius” at the equator).
| Decimal | Distance at the equator (m) |
|---|---|
| 0 | 111,120 |
| 1 | 11,112 |
| 2 | 1,111.2 |
| 3 | 111.12 |
| 4 | 11.112 |
| 5 | 1.1112 |
decp <- \(x) str_length(str_remove(as.character(abs(x)), ".*\\.")) >= 4

CREATE FUNCTION decp(x) AS length(str_split(CAST(abs(x) AS VARCHAR(10)), '.')[2]) >= 4

rides_l_clean <- rides_l[decp(lat) & decp(lng), ]
setkey(rides_l_clean, ride_id)

Time difference of 25.32 secs
rides_l_clean.dtp <- (rides_l
|> filter(decp(lat) & decp(lng))
|> as.data.table()
)
setkey(rides_l_clean.dtp, ride_id)

Time difference of 26.81 secs
CREATE TABLE rides_l_clean AS
SELECT * FROM rides_l
WHERE decp(lat) AND decp(lng)

Time difference of 5.369 secs
Here, dbplyr leaves the decp call as-is in the SQL translation, but since we previously defined a decp SQL function, that function gets called when the SQL query is executed.
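For reference, a minimal sketch of what that dbplyr pipeline could look like (an assumption; rides_con is the DuckDB connection used above, and compute() materializes the result as a new table):

rides_l_clean.db <- (tbl(rides_con, "rides_l")
  |> filter(decp(lat) & decp(lng)) # decp() is kept verbatim in the SQL and resolved by the UDF at run time
  |> compute(name = "rides_l_clean")
)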
Time difference of 16.91 secs
data.table [9,937,077 x 10]
| [ omitted 9,937,062 entries ] |
3.2 What’s missing?
Entries missing one or both coordinates but having an id or name:
SELECT * FROM rides_l_clean
WHERE ((lat IS NULL) OR (lng IS NULL))
AND (NOT((id IS NULL)) OR NOT((name IS NULL)))

data.frame [0 x 10]
Entries missing either name or id, but having coordinates:
CREATE TABLE rides_l_clean_unk AS
SELECT * FROM rides_l_clean
WHERE ((id IS NULL) OR (name IS NULL))
AND (NOT((lat IS NULL)) AND NOT((lng IS NULL)))

data.frame [3 x 10]
It seems there are only 3 entries missing identification that could be matched based on their coordinates at the level of precision we use (11m / 4 decimals).
However, if we look at the original dataset (before filtering out the inaccurate coordinates):
CREATE TABLE rides_l_unk AS
SELECT * FROM rides_l
WHERE ((id IS NULL) OR (name IS NULL))
AND (NOT((lat IS NULL)) AND NOT((lng IS NULL)))

data.frame [1,696,469 x 10]
| [ omitted 1,696,454 entries ] |
There were 1,696,469 entries missing a station id or name that could have been filled in the original data, but it seems all of them disappeared when we filtered out the coordinates with less than 4 decimals of precision. It would seem those entries were missing their id/name in the source data precisely because their coordinates were too imprecise to be matched to any station in the first place.
4 Stations data
4.1 Creating stations data
First, we need to assemble a dataset linking each unique station id (and name) to a set of coordinates (here, we use the average lat & lng):
stations_clean <- (rides_l_clean
  |> na.omit(cols = c("id", "name"))
  |> dcast(id + name ~ ., fun.aggregate = list(min, max, mean), value.var = c("lat", "lng"))
  |> pipebind::bind(x, setcolorder(x, c("id", "name", str_subset(names(x), "lat_|lng_"))))
  |> unique(by = "id")
)
setkey(stations_clean, id)

CREATE TABLE stations_clean AS
SELECT DISTINCT on(id)
id, name
, MIN(lat) AS lat_min
, MAX(lat) AS lat_max
, AVG(lat) AS lat_mean
, MIN(lng) AS lng_min
, MAX(lng) AS lng_max
, AVG(lng) AS lng_mean
FROM rides_l_clean
WHERE (NOT((id IS NULL))) AND (NOT((name IS NULL)))
GROUP BY id, name;

data.frame [693 x 8]
| [ omitted 678 entries ] |
5 Matching missing id by position
Let’s match the entries of rides_l_clean with stations_clean by proximity:
5.1 Matching on the cleaned data
To save time, let’s only apply the procedure to the entries that actually need to be matched (i.e. the ones having coordinates but missing either name or id).
There are 3 entries from rides_l_clean that could be position-matched to a known station.
matched_clean <- (fuzzyjoin::geo_inner_join(
as.data.frame(rides_l_clean_unk),
as.data.frame(stations_clean),
by = c("lng" = "lng_mean", "lat" = "lat_mean"),
method = "haversine",
unit = "km",
max_dist = 0.011, # 11 meters
distance_col = "dist"
)
|> mutate(name = coalesce(name.x, name.y), id = coalesce(id.x, id.y))
|> select(all_of(names(rides_l_clean)), dist)
|> arrange(ride_id)
|> drop_na(ride_id, id, name)
|> setDT()
)
setkey(matched_clean, ride_id, id)

Time difference of 0.4161 secs
Creating the haversine distance function:
CREATE FUNCTION haversine(lat1, lng1, lat2, lng2)
AS ( 6371 * acos( cos( radians(lat1) ) *
cos( radians(lat2) ) * cos( radians(lng2) - radians(lng1) ) +
sin( radians(lat1) ) * sin( radians(lat2) ) )
);
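As an aside, here is a hedged R transcription of the same formula (strictly speaking, the spherical law of cosines, with a 6371 km Earth radius), handy to sanity-check the distances returned by the SQL function:

# R equivalent of the haversine() SQL UDF above (a sketch, not used in the pipeline)
haversine_km <- \(lat1, lng1, lat2, lng2) {
  rad <- pi / 180 # degrees to radians
  6371 * acos(
    pmin(1, cos(lat1 * rad) * cos(lat2 * rad) * cos((lng2 - lng1) * rad) +
      sin(lat1 * rad) * sin(lat2 * rad))
  )
}

haversine_km(41.88, -87.62, 41.89, -87.62) # ~1.11 km for 0.01° of latitude

Doing the matching: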
CREATE TABLE matched_clean AS
SELECT
ride_id, rideable_type, started_at, ended_at, member_casual, way
, COALESCE(r.name, s.name) AS name
, COALESCE(r.id, s.id) AS id
, r.lat, r.lng
, haversine(s.lat_mean, s.lng_mean, r.lat, r.lng) AS dist
FROM rides_l_clean_unk r, stations_clean s
WHERE dist <= 0.011

Time difference of 0.004525 secs
data.table [3 x 11]
And we indeed get three matches!
But those three already had an id, so we could probably have filled in their missing name using stations_clean directly, instead of resorting to a convoluted proximity-based matching (which is more resource-intensive and less precise).
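For illustration, a minimal sketch of that simpler alternative, written as a data.table update join (filling name from stations_clean wherever it is missing):

rides_l_clean_unk[stations_clean, on = .(id), name := fcoalesce(name, i.name)]

Comparing the proximity-matched names with the ones recorded in stations_clean: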
stations_clean[matched_clean, on = .(id)
][, .(id, name.stations = name, name.proximity = i.name, lat, lng)]

data.table [3 x 5]
At least, we can see that the proximity-matched name and the one associated with that station in stations_clean are the same, so the proximity-matching method works reasonably well.
5.2 Matching on the original data
What if we applied the same procedure to the non-cleaned data (the one including coordinates less precise than our matching criteria)?
5.2.1 Unfiltered stations data
First, we need to recompute the stations data from rides_l (i.e. rides data before cleaning):
CREATE TABLE stations AS
SELECT DISTINCT on(id)
id, name
, MIN(lat) AS lat_min
, MAX(lat) AS lat_max
, AVG(lat) AS lat_mean
, MIN(lng) AS lng_min
, MAX(lng) AS lng_max
, AVG(lng) AS lng_mean
FROM rides_l
WHERE (NOT((id IS NULL))) AND (NOT((name IS NULL)))
GROUP BY id, name;

data.frame [1,078 x 8]
| [ omitted 1,063 entries ] |
Cleaning the results:
Notice that we get a lot more entries in stations: 1,078 entries vs 693 in the filtered version.
Which entries are in stations but not in stations_clean?
(stations_diff <- stations[!stations_clean, on = .(id, name)])

data.table [417 x 8]
| [ omitted 402 entries ] |
And which of those 417 entries have the necessary coordinate precision to be used later on to match against the unknown stations?
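The check itself isn’t shown, but it presumably boils down to applying the decp() helper to the averaged coordinates (a sketch):

stations_diff[decp(lat_mean) & decp(lng_mean)]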
data.table [0 x 8]
As expected, none. But we’re still going to do the matching, for posterity!
5.2.2 Position-matching on stations
To save time, let’s only apply the procedure to the entries that actually need to be matched (i.e. the ones having coordinates but missing either name or id):
There are 1,696,469 entries from rides_l that could be position-matched to a known station.
matched <- (fuzzyjoin::geo_inner_join(
as.data.frame(rides_l_unk),
as.data.frame(stations),
by = c("lng" = "lng_mean", "lat" = "lat_mean"),
method = "haversine",
unit = "km",
max_dist = 0.011, # 11 meters
distance_col = "dist"
)
|> mutate(name = coalesce(name.x, name.y), id = coalesce(id.x, id.y))
|> select(all_of(names(rides_l)), dist)
|> arrange(ride_id)
|> drop_na(ride_id, id, name)
|> setDT()
)
setkey(matched, ride_id, id)

Time difference of 2.143 secs
Doing the matching in SQL:
SELECT
ride_id, rideable_type, started_at, ended_at, member_casual, way
, COALESCE(r.name, s.name) AS name
, COALESCE(r.id, s.id) AS id
, r.lat, r.lng
, haversine(s.lat_mean, s.lng_mean, r.lat, r.lng) AS dist
FROM rides_l_unk r, stations s
WHERE dist <= 0.011

Time difference of 9.838 secs
data.table [939,829 x 11]
| [ omitted 939,814 entries ] |
Notice how fast the procedure is, with close to 1 million matches (even if the results are mostly garbage).
What’s inside those matches?
matched[, .(`Number of matches for an entry` = .N), by = .(ride_id, way)
][, .(`Number of times it happens` = .N), by = `Number of matches for an entry`]

data.table [5 x 2]
We can see that more than half of the matches are coordinates that matched 2 or more stations, which we should definitely not keep.
But, among the ones with only one match, how many have coordinates precise enough to make that match in the first place (i.e. have 4 or more decimals of precision)?
matched[, if (.N == 1) .SD, by = .(ride_id, way)][decp(lat) & decp(lng)]

data.table [3 x 11]
As it turns out, only 3. And those are the same three matches we got from the filtered data.
In the end, those are the only three position-based matches we should reasonably keep!
6 Updating the original dataset
Finally, we need to update the original dataset (rides_l_clean) with the entries that were position-matched (matched_clean):
6.1 Merging the two datasets
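The data.table version (first timing below) presumably amounts to an update join along these lines; the exact code is an assumption, but the tables and columns are the ones defined above:

# Hedged sketch: patch the missing id/name of rides_l_clean from matched_clean
rides_l_merged <- copy(rides_l_clean)
rides_l_merged[matched_clean, on = .(ride_id, way),
  `:=`(id = fcoalesce(id, i.id), name = fcoalesce(name, i.name))]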
Time difference of 7.555 secs
Time difference of 9.505 secs
dplyr has the neat rows_* family of functions, which can easily replace or patch (i.e. only replace missing values in) the content of one dataset with matching rows from another, and it is quite fast!
Time difference of 3.587 secs
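A minimal sketch of what that could look like (an assumption; rows_patch() fills the NAs of the left-hand table with values from matching rows of the right-hand one, and dist is dropped since it has no counterpart in rides_l_clean):

rides_l_merged.dtp <- rows_patch(
  rides_l_clean,
  select(matched_clean, !dist), # drop the extra distance column before patching
  by = c("ride_id", "way")
)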
CREATE TABLE rides_l_merged AS
SELECT
ride_id, rideable_type, started_at, ended_at, member_casual, way,
COALESCE(id_x, id_y) AS id,
COALESCE(name_x, name_y) AS name,
lat, lng
FROM (
SELECT
r.ride_id AS ride_id,
r.rideable_type AS rideable_type,
r.started_at AS started_at,
r.ended_at AS ended_at,
r.member_casual AS member_casual,
r.way AS way,
m.name AS name_x,
m.id AS id_x,
r.lat AS lat,
r.lng AS lng,
r.name AS name_y,
r.id AS id_y
FROM matched_clean AS m
RIGHT JOIN rides_l_clean AS r
ON m.ride_id = r.ride_id
AND m.rideable_type = r.rideable_type
AND m.started_at = r.started_at
AND m.ended_at = r.ended_at
AND m.member_casual = r.member_casual
AND m.way = r.way
);

Time difference of 4.306 secs
data.table [9,937,077 x 10]
| [ omitted 9,937,062 entries ] |
6.2 Validating the merge
We can see that the resulting dataset no longer has any entries with coordinates but a missing name or id, whereas there were three such entries before. We have successfully updated them!
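The validation can be as simple as counting the offending rows (a sketch of the check; it should return 0 on the merged data, versus 3 before the merge):

rides_l_merged[(is.na(id) | is.na(name)) & !is.na(lat) & !is.na(lng), .N]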
6.3 Pivoting back to the original (wide) format
To finish, let’s pivot the resulting data back into the wider format it was originally in:
rides_merged <- dcast(
rides_l_merged,
... ~ way,
value.var = c("name", "id", "lat", "lng"), sep = "_station_"
)

Time difference of 9.425 secs
rides_merged.dtp <- (rides_l_merged
|> pivot_wider(
names_from = "way",
values_from = c("name", "id", "lat", "lng"),
names_glue = "{way}_station_{.value}"
)
|> collect()
)

Time difference of 10.02 secs
CREATE TABLE rides_merged AS
SELECT
ride_id, rideable_type, started_at, ended_at, member_casual,
MAX(CASE WHEN (way = 'start') THEN name END) AS start_station_name,
MAX(CASE WHEN (way = 'end') THEN name END) AS end_station_name,
MAX(CASE WHEN (way = 'start') THEN id END) AS start_station_id,
MAX(CASE WHEN (way = 'end') THEN id END) AS end_station_id,
MAX(CASE WHEN (way = 'start') THEN lat END) AS start_lat,
MAX(CASE WHEN (way = 'end') THEN lat END) AS end_lat,
MAX(CASE WHEN (way = 'start') THEN lng END) AS start_lng,
MAX(CASE WHEN (way = 'end') THEN lng END) AS end_lng
FROM rides_l_merged
GROUP BY ride_id, rideable_type, started_at, ended_at, member_casual

Time difference of 6.463 secs
data.table [5,316,218 x 13]
| [ omitted 5,316,203 entries ] |
We get fewer than the original ~6 million entries of the wide format, due to having removed (filtered) the entries with bad coordinates.
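As a quick sanity check (a sketch), the difference corresponds to the rides for which both the start and end coordinates were filtered out:

nrow(rides) - nrow(rides_merged) # 5,860,776 - 5,316,218 = 544,558 rides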