Construction of Automated Banking Machine Location Data

This appendix outlines the steps taken to construct the automated banking machine (ABM) data used in this paper. The construction process proceeded in five steps, outlined below.

Step 1: Identify and Drop Duplicates in Mastercard Data

Identify Duplicates (2023)

Code

# load packages 
library(tidyverse)
library(janitor)
library(knitr)
library(kableExtra)
library(gt)

# import raw MasterCard data

mc_raw_2023 <- read_csv("data/unprocessed/mastercard/BOC_All_ATMImage(December 2023).txt", guess_max = 50000) %>% 
  clean_names()

Within Sponsors

In this step, we are identifying duplicates within each sponsor. We treat both the terminal ID and sponsor name as a combined key. We then add a column called duplicate_tag_1 that indicates whether each terminal ID appears once, twice, or more than twice within the same sponsor:

Tag 0: The terminal ID appears only once within a sponsor.
Tag 1: The terminal ID appears exactly twice within a sponsor.
Tag 2: The terminal ID appears more than twice within a sponsor.

This helps us manage duplicates at the sponsor level and ensures we accurately identify repeated terminal IDs within each sponsor’s dataset.

Code

tagged_2_2023 <- mc_raw_2023 %>% 
  group_by(inst_assigned_term_id, sponsor_name) %>%
  mutate(duplicate_tag_1 = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  ungroup()

tagged_2_2023 %>% 
  count(sponsor_name, duplicate_tag_1) %>% 
  pivot_wider(names_from = duplicate_tag_1, values_from = n, names_prefix = "tag_") %>% 
  # Replace missing values (NA) with 0 in the reshaped data
  mutate(across(where(is.integer), ~ replace_na(., 0))) %>% 
  # Create a formatted table with 'kable' for better presentation
  kbl(caption = "Table 2: Duplicates Within Sponsors (2023)", 
      col.names = c("Sponsor Name", "Tag 0", "Tag 1", "Tag 2")) %>% 
  kable_classic()

Table 2: Duplicates Within Sponsors (2023)
Sponsor Name	Tag 0	Tag 1	Tag 2
BANK OF MONTREAL	37875	52	0
BANK OF NOVA SCOTIA	2434	48	18
CANADIAN IMPERIAL BANK OF COMMERCE	2968	0	0
DIRECTCASH BANK	7255	128	0
EVERLINK PAYMENT SERVICES INC.	1851	0	0
FEDERATION DES CAISSES DESJARDINS DU QUEBEC	1875	0	0
HSBC BANK CANADA	168	0	0
MONERIS SOLUTIONS CORPORATION	26397	0	0
NATIONAL BANK OF CANADA	927	0	0
ROYAL BANK OF CANADA	3955	0	0
TORONTO-DOMINION BANK, THE	2691	0	0

For Directcash Bank, some of the duplicates are due to the same record assigned once to its actual address, and once to a PO Box. Otherwise, the records look exactly the same.

Code

tagged_2_2023 %>% 
  filter(duplicate_tag_1 == 1) %>%
  select(sponsor_name, owner_name, 
         inst_assigned_term_id, address1, 
         city_name, location_name) %>% 
  filter(inst_assigned_term_id == "D1018430" | 
           inst_assigned_term_id == "D1027205" | 
           inst_assigned_term_id == "D1032094") %>% 
  arrange(inst_assigned_term_id) %>% 
  kbl(col.names = 
        c("Sponsor Name", "Owner Name", "Terminal ID", 
          "Address", "City", "Location Name")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "blue") %>% 
  row_spec(6, color = "blue")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
DIRECTCASH BANK	ALVENA CO-OPERATIVE ASSOCIATION LTD	D1018430	103 RAILWAY AVE	ALVENA	ALVENA COOP
DIRECTCASH BANK	ALVENA CO-OPERATIVE ASSOCIATION LTD	D1018430	PO BOX 150	ALVENA	ALVENA COOP
DIRECTCASH BANK	DC PAYMENTS CA	D1027205	215 VICTORIA AVE	MONARCH	MONARCH POST OFFICE
DIRECTCASH BANK	DC PAYMENTS CA	D1027205	PO BOX 168	MONARCH	MONARCH POST OFFICE
DIRECTCASH BANK	TIN ROOSTER HOLDINGS LTD	D1032094	BOULDER AVE	DEASE LAKE	DEASE LAKE SUPER A
DIRECTCASH BANK	TIN ROOSTER HOLDINGS LTD	D1032094	PO BOX 160	DEASE LAKE	DEASE LAKE SUPER A

Deduplication rule: When DirectCash Bank records share the same terminal ID but have different addresses, we drop the record where the address contains a PO Box and retain the record with a physical address.

Scotiabank duplicates have the same record with its address written differently, such as different city names with the same address, or one record ignores the street number while the other does have it. These are real duplicates as well.

Code

tagged_2_2023 %>% 
  filter(duplicate_tag_1 != 0) %>%
  select(sponsor_name, owner_name, last_reported, 
         inst_assigned_term_id, address1, city_name, latitude, longitude) %>%
  filter(inst_assigned_term_id == "AE25" | inst_assigned_term_id == "AH08") %>% 
  arrange(inst_assigned_term_id) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Last Reported",
                    "Terminal ID", "Address", "City", "Latitude", "Longitude")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "red") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "green")

Sponsor Name	Owner Name	Last Reported	Terminal ID	Address	City	Latitude	Longitude
BANK OF NOVA SCOTIA	SCOTIABANK	24-11-2023 14:20:10	AE25	18TH ST RICHMOND AVE	BRANDON	49.82750	-99.96201
BANK OF NOVA SCOTIA	SCOTIABANK	24-11-2023 14:20:10	AE25	1570 18TH ST	BRANDON	49.82204	-99.96358
BANK OF NOVA SCOTIA	SCOTIABANK	24-11-2023 14:20:10	AE25	18TH ST RICHMOND AVE	BRANDON	49.82750	-99.96201
BANK OF NOVA SCOTIA	SCOTIABANK	24-11-2023 14:20:10	AH08	3401 DUFFERIN ST	TORONTO	43.72598	-79.45241
BANK OF NOVA SCOTIA	SCOTIABANK	24-11-2023 14:20:10	AH08	3401 DUFFERIN ST	NORTH YORK	43.72611	-79.45227

BMO ATM duplicates seem to be the same ATM reported under ownership of both Cardtronics and Direct Cash. Since Cardtronics had already acquired Direct Cash in 2017, this seems to be real duplicates as well.

Code

tagged_2_2023 %>% 
  filter(duplicate_tag_1 != 0) %>%
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name) %>% 
  filter(inst_assigned_term_id == "D1018373" | 
           inst_assigned_term_id == "D1057885" | 
           inst_assigned_term_id == "D1066100") %>% 
  arrange(inst_assigned_term_id) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "orange") %>% 
  row_spec(6, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City
BANK OF MONTREAL	DIRECT CASH	D1018373	4650 SHEPPARD AVE E	SCARBOROUGH
BANK OF MONTREAL	CARDTRONICS	D1018373	1691 CT GAS SCARBOROUGH	SCARBOROUGH
BANK OF MONTREAL	DIRECT CASH	D1057885	165 MAIN ST	ANTIGONISH
BANK OF MONTREAL	CARDTRONICS	D1057885	476 CENTRE ST N	NAPANEE
BANK OF MONTREAL	DIRECT CASH	D1066100	6250 NO 3 RD	RICHMOND
BANK OF MONTREAL	CARDTRONICS	D1066100	RICHMOND BRIDGEHOUSE STATION 6250 NO 3 ROAD	RICHMOND

Deduplication rule: When records within the same sponsor (BMO) share the same terminal ID but have different owner names, specifically Cardtronics and Direct Cash, we drop the record associated with Cardtronics and retain the record associated with DirectCash..

The other BMO ATM duplicates have two owners as well: MBE and RapidCash.

Code

tagged_2_2023 %>%
  filter(duplicate_tag_1 != 0) %>%
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name) %>%
  filter(inst_assigned_term_id == "RCA00539" | inst_assigned_term_id == "RCA00540" |
    inst_assigned_term_id == "RCA00630") %>% 
  arrange(desc(inst_assigned_term_id)) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "orange") %>% 
  row_spec(6, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City
BANK OF MONTREAL	MBE	RCA00630	1ST ST	VERNON
BANK OF MONTREAL	RAPID CASH ATM LTD	RCA00630	12900 29 AVE	VERNON
BANK OF MONTREAL	MBE	RCA00540	TC HWY 1	CALGARY
BANK OF MONTREAL	RAPID CASH ATM LTD	RCA00540	1414 3 ST SW	CALGARY
BANK OF MONTREAL	MBE	RCA00539	TC HWY 1	CALGARY
BANK OF MONTREAL	RAPID CASH ATM LTD	RCA00539	1414 3 ST SW	CALGARY

Deduplication rule: When records within the same sponsor (BMO) share the same terminal ID but have different owner names, specifically MBE and Rapid Cash, we drop the record associated with MBE and retain the record associated with Rapid Cash.

Across Sponsors

We begin by treating the terminal ID as a unique identifier. To manage duplicates, we add a column called duplicate tag that indicates whether each terminal ID appears once, twice, or more than twice. This tagging helps highlight duplicates in the data:

Tag 0: The terminal ID appears only once (i.e., it’s unique).
Tag 1: The terminal ID appears exactly twice.
Tag 2: The terminal ID appears more than twice.

Finally, we count the number of occurrences for each combination of sponsor name and duplicate tag to better understand the distribution of duplicates.

Code

# Group the raw Mastercard 2023 data by 'inst_assigned_term_id'
# inst_assigned_term_id is terminal ID in the 2023 mastercard data

tagged_2023 <- mc_raw_2023 %>% 
  group_by(inst_assigned_term_id) %>%
  
  # Create a new column 'duplicate_tag' to label the number of duplicates:
  # - Assign 0 if there's only one record for this 'inst_assigned_term_id'
  # - Assign 1 if there are exactly two records
  # - Assign 2 if there are more than two records
  mutate(duplicate_tag = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  
  # Remove the grouping structure
  ungroup()

# Count the occurrences of each 'sponsor_name' by 'duplicate_tag', 
# then reshape the data to have columns for each 'duplicate_tag' (0, 1, 2)
tagged_2023 %>% 
  count(sponsor_name, duplicate_tag) %>% 
  pivot_wider(names_from = duplicate_tag, values_from = n, names_prefix = "tag_") %>% 
  
  # Replace missing values (NA) with 0 in the reshaped data
  mutate(across(where(is.integer), ~ replace_na(., 0))) %>% 
  
  # Create a formatted table with 'kable' for better presentation
  kbl(caption = "Table 1: Duplicates Across Sponsors (2023)", col.names = c("Sponsor Name", "Tag 0", "Tag 1", "Tag 2")) %>% 
  kable_classic() %>% 
  kable_styling(full_width = FALSE)

Table 1: Duplicates Across Sponsors (2023)
Sponsor Name	Tag 0	Tag 1	Tag 2
BANK OF MONTREAL	9915	27845	167
BANK OF NOVA SCOTIA	2341	136	23
CANADIAN IMPERIAL BANK OF COMMERCE	2714	239	15
DIRECTCASH BANK	4493	2806	84
EVERLINK PAYMENT SERVICES INC.	1747	1	103
FEDERATION DES CAISSES DESJARDINS DU QUEBEC	1875	0	0
HSBC BANK CANADA	168	0	0
MONERIS SOLUTIONS CORPORATION	1460	24833	104
NATIONAL BANK OF CANADA	927	0	0
ROYAL BANK OF CANADA	3861	92	2
TORONTO-DOMINION BANK, THE	2691	0	0

It seems that the duplicates are concentrated in BMO (i.e. Moneris) and Directcash (which has been acquired by BMO according to MC).

BMO- Moneris Duplicates

BMO has 9915 unique terminal IDs, but 27845 Terminals appear more than once. And 167 Terminals appear at least twice. Moneris has 1460 unique terminal IDs, and 24833 are duplicates. And 104 terminals appear more than once. These are the primary drivers of the duplicates.

Here are examples of BMO- Moneris duplicates.

Code

filter_bmo_examples <- c("001BCLVS", "001NYKBA", "TOT00005")

tagged_2023 %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, 
         address1, city_name, location_name, duplicate_tag) %>%
  filter(duplicate_tag == 1) %>% 
  arrange(inst_assigned_term_id, sponsor_name) %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name, location_name) %>% 
  filter(inst_assigned_term_id %in% filter_bmo_examples) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name")) %>%  
  kable_classic() %>% 
  #kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name")) %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "blue") %>% 
  row_spec(6, color = "blue")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
BANK OF MONTREAL	FLEX TOUCH CANADA 4	001BCLVS	19925 WILLOWBROOK DR UNIT 101	LANGLEY	SAMMY J PEPPERS
MONERIS SOLUTIONS CORPORATION	FLEX TOUCH CANADA 4	001BCLVS	19925 WILLOWBROOK DR UNIT 101	LANGLEY	SAMMY J PEPPERS
BANK OF MONTREAL	FLEX TOUCH CANADA 4	001NYKBA	1731 TAPSCOTT RD	SCARBOROUGH	E GAMES PLUS
MONERIS SOLUTIONS CORPORATION	FLEX TOUCH CANADA 4	001NYKBA	1731 TAPSCOTT RD	SCARBOROUGH	E GAMES PLUS
BANK OF MONTREAL	CASH IN TIME	TOT00005	2694 BROOKDALE AVE	CORNWALL	SUPER 8
MONERIS SOLUTIONS CORPORATION	CASH IN TIME	TOT00005	2694 BROOKDALE AVE	CORNWALL	SUPER 8

Deduplication rule: When BMO and Moneris share the same terminal ID, we use the Sponsor Name to determine which record to keep. We drop the record associated with Moneris and retain the record associated with BMO.

BMO- Direct Cash Duplicates

We also know that direct cash has been acquired by BMO. In these cases, we see the sponsor is Direct Cash in one entry, and the owner is direct cash in another entry.

Code

filter_dc_examples <- c("D4024406", "D4026911", "D4018524", "D4018462")

tagged_2023 %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, 
         address1, city_name, location_name, duplicate_tag) %>%
  filter(duplicate_tag == 1) %>% 
  filter(inst_assigned_term_id %in% filter_dc_examples) %>% 
  select(-duplicate_tag) %>% 
  arrange(desc(inst_assigned_term_id)) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "blue") %>% 
  row_spec(6, color = "blue") %>% 
  row_spec(7, color = "orange") %>% 
  row_spec(8, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
BANK OF MONTREAL	DIRECT CASH	D4026911	204 DAKOTA DUNES WAY	WHITECAP	SIGA DAKOTA DUNES 1
DIRECTCASH BANK	SASKATCHEWAN INDIAN GAMING AUTHORITY INC	D4026911	204 DAKOTA DUNES WAY	WHITECAP	SIGA DAKOTA DUNES 1
BANK OF MONTREAL	DIRECT CASH	D4024406	9208 JASPER AVE NW	EDMONTON	RIVERVIEW HUSKY 2
DIRECTCASH BANK	ROCKY STAR ENTERPRISE LTD	D4024406	9208 JASPER AVE NW	EDMONTON	RIVERVIEW HUSKY 2
BANK OF MONTREAL	DIRECT CASH	D4018524	377 RIVERSIDE DR E	WINDSOR	CAESARS WINDSOR EMPLOYEES LOUNGE
DIRECTCASH BANK	DC PAYMENTS CA	D4018524	377 RIVERSIDE DR E	WINDSOR	CAESARS WINDSOR EMPLOYEES LOUNGE
BANK OF MONTREAL	DIRECT CASH	D4018462	44 MARQUIS RD W	PRINCE ALBERT	SIGA NORTHERN LIGHTS 2
DIRECTCASH BANK	DC PAYMENTS CA	D4018462	44 MARQUIS RD W	PRINCE ALBERT	SIGA NORTHERN LIGHTS 2

Deduplication rule: When BMO and DirectCash Bank share the same terminal ID, we use the Sponsor Name to determine which record to keep. We drop the record associated with DirectCash Bank and retain the record associated with BMO.

Other

Most of the duplicates in the 2023 data originate from Moneris and Direct Cash. By applying the rules above, we can clean up most of the data, but a few other duplicates may still need attention.

For the few duplicate CIBC ATMs, it seems that they are actually owned by Cardtronics, but branded by CIBC.

Code

filter_cibc_examples <- c("MS909C9E", "MS909G9J", "MS909G9C", "MS909F9W")

tagged_2023 %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, 
         address1, city_name, location_name, duplicate_tag) %>%
  filter(duplicate_tag == 1) %>% 
  filter(inst_assigned_term_id %in% filter_cibc_examples) %>% 
  select(-duplicate_tag) %>% 
  arrange(desc(inst_assigned_term_id)) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "blue") %>% 
  row_spec(6, color = "blue") %>% 
  row_spec(7, color = "orange") %>% 
  row_spec(8, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
CANADIAN IMPERIAL BANK OF COMMERCE	CIBC	MS909G9J	1131 DUNDAS ST W	MISSISSAUGA	GIANT TIGER
BANK OF MONTREAL	CARDTRONICS	MS909G9J	1131 DUNDAS ST W UNIT M1	MISSISSAUGA	GIANT TIGER 234
CANADIAN IMPERIAL BANK OF COMMERCE	CIBC	MS909G9C	9225 HWY 93	MIDLAND	GIANT TIGER
BANK OF MONTREAL	CARDTRONICS	MS909G9C	9225 COUNTY RD 93 HURONIA MALL	MIDLAND	GIANT TIGER 128
CANADIAN IMPERIAL BANK OF COMMERCE	CIBC	MS909F9W	800 BOUL THIBEAU	TROISRIVIRES	GIANT TIGER
BANK OF MONTREAL	CARDTRONICS	MS909F9W	800 BOUL THIBEAU	TROISRIVIRES	GIANT TIGER 116
CANADIAN IMPERIAL BANK OF COMMERCE	CIBC	MS909C9E	1250 MAIN ST	OTTAWA	IDA PHARMACY
BANK OF MONTREAL	CARDTRONICS	MS909C9E	1250 MAIN ST	OTTAWA	IDA PHARMACY

Deduplication rule: Retain the record associated with CIBC

It seems that the RBC and Scotiabank’s “duplicated ATMs” are not duplicates, but different ATMs present in their respective branches. This could be because the terminal variable is only 4 digits in each case. For example, there are NOT duplicates.

Code

filter_rbc_examples <- c("WD21", "WC23", "VM20")

tagged_2023 %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name, location_name, duplicate_tag) %>%
  filter(duplicate_tag == 1) %>% 
  filter(inst_assigned_term_id %in% filter_rbc_examples) %>% 
  select(-duplicate_tag) %>% 
  arrange(inst_assigned_term_id) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name")) %>% 
  kable_classic()

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
ROYAL BANK OF CANADA	ROYAL BANK OF CANADA	VM20	2288 WHATCOM RD	ABBOTSFORD	SUMAS MOUNTAIN VI DT
BANK OF NOVA SCOTIA	SCOTIABANK	VM20	7030 ALBERNI ST	POWELL RIVER	POWELL RIVER 1
ROYAL BANK OF CANADA	ROYAL BANK OF CANADA	WC23	220 PORTAGE AVE	WINNIPEG	WINNIPEG MNWE54AUD
BANK OF NOVA SCOTIA	SCOTIABANK	WC23	1091 ST MARYS RD	WINNIPEG	DAKOTA VILLAGE 3
ROYAL BANK OF CANADA	ROYAL BANK OF CANADA	WD21	85 MAIN ST	MELITA	MELITA BRWH28 AUD
BANK OF NOVA SCOTIA	SCOTIABANK	WD21	1195 ARTHUR ST W	THUNDER BAY	ARTHUR PARKDALE

Deduplication rule: These are not real duplicates. Nothing to do here.

Now, we explore the cases of duplicates with Tag 2 – i.e., when there are more than two instances of the same terminal ID.

There are two cases of double ATMs in Scotiabank branches that have the same 4-digit terminal ID as RBC ATMs. As mentioned in the previous paragraph, these are not real duplicates, so we do not drop them.

Code

filter_trip_bmo_rbc_examples <- c("CM25", "CM23")

tagged_2023 %>% 
  arrange(inst_assigned_term_id) %>% 
  filter(duplicate_tag == 2) %>%
  arrange(desc(inst_assigned_term_id)) %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, location_name, duplicate_tag) %>% 
  filter(inst_assigned_term_id %in% filter_trip_bmo_rbc_examples) %>% 
  select(-duplicate_tag) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "Location Name")) %>% 
  kable_classic()

Sponsor Name	Owner Name	Terminal ID	Address	Location Name
ROYAL BANK OF CANADA	ROYAL BANK OF CANADA	CM25	4804 50 ST	SANDSTONE HENDAY AUD
BANK OF NOVA SCOTIA	SCOTIABANK	CM25	1265 YORK MILLS RD	PARKWOOD VILLAGE 3
BANK OF NOVA SCOTIA	SCOTIABANK	CM25	1265 YORK MILLS RD	PARKWOOD VILLAGE 3
ROYAL BANK OF CANADA	ROYAL BANK OF CANADA	CM23	65 GASOLINE ALLEY E	PETROCAN AUD
BANK OF NOVA SCOTIA	SCOTIABANK	CM23	1265 YORK MILLS RD	PARKWOOD VILLAGE 1
BANK OF NOVA SCOTIA	SCOTIABANK	CM23	1265 YORK MILLS RD	PARKWOOD VILLAGE 1

Deduplication rule: These are not true duplicates across sponsors, so we do not perform deduplication between different sponsors.

For some Direct Cash ATMs, there were two duplicate records already in 2019. Coupled with the BMO record in 2023, that makes three of the same.

Code

tagged_2023 %>% 
  select(sponsor_name, owner_name, last_reported, inst_assigned_term_id, address1, location_name, duplicate_tag) %>% 
  filter(inst_assigned_term_id == "D2069425" | inst_assigned_term_id == "D2069137") %>% 
  select(-duplicate_tag) %>% 
  arrange(desc(inst_assigned_term_id)) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Last Reported", 
                    "Terminal ID", "Address", "Location Name")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "red") %>% 
  row_spec(4, color = "blue") %>% 
  row_spec(5, color = "blue") %>% 
  row_spec(6, color = "blue")

Sponsor Name	Owner Name	Last Reported	Terminal ID	Address	Location Name
DIRECTCASH BANK	DC PAYMENTS CA	24-05-2019 09:40:38	D2069425	3456 HIGHWAY 138	MACEWEN RANGER CONVENIENCE
DIRECTCASH BANK	DC PAYMENTS CA	23-05-2019 16:59:37	D2069425	PO BOX 45	MACEWEN RANGER CONVENIENCE
BANK OF MONTREAL	CARDTRONICS	13-12-2023 15:55:39	D2069425	3456 HWY 138	ESSO MONKLAND 25005
BANK OF MONTREAL	DIRECT CASH	13-12-2023 15:59:40	D2069137	455 BROADWAY ST W	GALLAGHER CENTRE
DIRECTCASH BANK	GALLAGHER CENTRE- CITY OF YORKTON	24-05-2019 09:40:30	D2069137	455 BROADWAY AVE W BOX 908	GALLAGHER CENTRE
DIRECTCASH BANK	GALLAGHER CENTRE- CITY OF YORKTON	23-05-2019 16:58:50	D2069137	PO BOX 908 STN DELCENTRE	GALLAGHER CENTRE

Deduplication rule: Keep BMO.

Some CIBC-Branded ATMs were originally Direct Cash ATMs, acquired by Cardtronics and reported via BMO. For these ATMs there are three sponsors reporting them as well.

Code

tagged_2023 %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1,
         city_name, location_name, duplicate_tag) %>%
  filter(duplicate_tag == 2) %>% 
  filter(inst_assigned_term_id == "F4257559") %>% 
  select(-duplicate_tag) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "red")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
BANK OF MONTREAL	DIRECT CASH	F4257559	20 YORK ST	TORONTO	CIBC GTAA UP UNION STATION
CANADIAN IMPERIAL BANK OF COMMERCE	CIBC	F4257559	20 YORK ST	TORONTO	UNION STATION UP EXPRESS TRAIN PLATFOR
DIRECTCASH BANK	DC PAYMENTS CA	F4257559	20 YORK ST	TORONTO	GTAA UP UNION STATION

Deduplication rule: .

Deduplicate (2023)

There are 88642 observations in the 2023 Mastercard data.

Eliminate duplicates within sponsor

Code

df_1_2023 <- mc_raw_2023 %>%
  group_by(sponsor_name, inst_assigned_term_id, state_prov) %>%   
  mutate(tag1 = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  ungroup() %>%
  arrange(state_prov, tag1, inst_assigned_term_id) %>%        
  mutate(todrop = 0,                                         
         todrop = if_else(owner_name == "MBE", todrop + 1, todrop),
         todrop = if_else(grepl("PO BOX", address1), todrop + 1, todrop),
         todrop = if_else(is.na(latitude), todrop + 1, todrop),
         todrop = if_else(tag1 == 1 & owner_name == "DIRECT CASH", todrop + 1, todrop)) %>% 
  arrange(desc(tag1), desc(inst_assigned_term_id), desc(todrop), desc(address1))

df_2_2023 <- df_1_2023 %>% 
  arrange(tag1, inst_assigned_term_id, todrop, address1) %>%
  group_by(tag1,inst_assigned_term_id) %>%
  mutate(dupeorder = row_number()) %>%
  ungroup() %>%
  filter(tag1 < 1 | dupeorder == 1) %>%
  select(-todrop, -tag1) %>% 
  select(-dupeorder)

# mc_raw_2023 %>%
#   group_by(inst_assigned_term_id) %>%
#   # Flag PO Box entries for removal if the same terminal ID has a real address
#   mutate(todrop = if_else(grepl("PO BOX", address1) & n() > 1, 1, 0)) %>%
#   # Remove the flagged PO Box entries
#   filter(todrop == 0) %>%
#   ungroup() %>%
#   select(-todrop)

Then eliminate duplicates with different sponsors

Code

df_3_2023 <- df_2_2023 %>%
  group_by(inst_assigned_term_id, state_prov) %>%   
  mutate(tag = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  ungroup() %>% 
  mutate(tokeep = 0) %>%
  # when BMO and Moneris share a terminal ID, we are keeping BMO
  mutate(tokeep = if_else(tag >= 1 & sponsor_name == "BANK OF MONTREAL", tokeep + 1, tokeep),
         # When directcash bank and some other sponsor (BMO) share an terminal ID, we remove
         # direct cash bank 
         tokeep = if_else(tag >= 1 & sponsor_name == "DIRECTCASH BANK", tokeep - 1, tokeep),
         # if cibc shares an ATM with another sponsor we retain CIBC
         tokeep = if_else(tag >= 1 & sponsor_name == "CANADIAN IMPERIAL BANK OF COMMERCE", tokeep + 1, tokeep),
         
         ### RBC AND SCOTIA BANK DUPLICATES ACROSS SPONSORS ARE NOT REAL DUPLICATES
         tokeep = if_else(tag >= 1 & sponsor_name == "ROYAL BANK OF CANADA", tokeep + 100, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "BANK OF NOVA SCOTIA", tokeep + 100, tokeep)) %>% 
  arrange(inst_assigned_term_id, desc(tokeep)) #%>% 
  #arrange(tokeep)


df_4_2023 <- df_3_2023 %>% 
  group_by(inst_assigned_term_id) %>%
  mutate(dupeorder = row_number()) %>%
  ungroup() %>% 
  filter(tokeep >= 10 | dupeorder == 1) %>% 
  select(-tokeep, -tag) %>% 
  # DROP FEDERATION DES CAISSES POPULAIRES ACADIENNES
  #filter(owner_name != "FEDERATION DES CAISSES POPULAIRES ACADIENNES") %>% 
  mutate(
    last_reported = dmy_hms(last_reported),  # Convert to datetime using lubridate
    year = year(last_reported)               # Extract the year
  ) 

df_4_2023 %>% count(sponsor_name)

# A tibble: 11 × 2
   sponsor_name                                    n
   <chr>                                       <int>
 1 BANK OF MONTREAL                            37674
 2 BANK OF NOVA SCOTIA                          2440
 3 CANADIAN IMPERIAL BANK OF COMMERCE           2934
 4 DIRECTCASH BANK                              4527
 5 EVERLINK PAYMENT SERVICES INC.               1747
 6 FEDERATION DES CAISSES DESJARDINS DU QUEBEC  1875
 7 HSBC BANK CANADA                              168
 8 MONERIS SOLUTIONS CORPORATION                1466
 9 NATIONAL BANK OF CANADA                       927
10 ROYAL BANK OF CANADA                         3936
11 TORONTO-DOMINION BANK, THE                   2691

Code

# df_3_2023 %>% 
#   group_by(inst_assigned_term_id) %>%
#   mutate(dupeorder = row_number()) %>%
#   ungroup() %>% 
#   filter(tokeep >= 10 | dupeorder == 1) %>% 
#   select(-tokeep, -tag) %>% 
#   # DROP FEDERATION DES CAISSES POPULAIRES ACADIENNES
#  # filter(owner_name != "FEDERATION DES CAISSES POPULAIRES ACADIENNES") %>% 
#   mutate(
#     last_reported = dmy_hms(last_reported),  # Convert to datetime using lubridate
#     year = year(last_reported)               # Extract the year
#   )

After deduplication, there are 60385 observations in the 2023 Mastercard data.

Identify Duplicates (2022)

Code

mc_raw_2022 <- read_csv("data/unprocessed/mastercard/BOC_All_ATMImage(November 2022).txt", guess_max = 50000) %>% 
  clean_names()

Within Sponsors

Code

tagged_2_2023 <- mc_raw_2022 %>% 
  group_by(inst_assigned_term_id, sponsor_name) %>%
  mutate(duplicate_tag_1 = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  ungroup()

tagged_2_2023 %>% 
  count(sponsor_name, duplicate_tag_1) %>% 
  pivot_wider(names_from = duplicate_tag_1, values_from = n, names_prefix = "tag_") %>% 
  # Replace missing values (NA) with 0 in the reshaped data
  mutate(across(where(is.integer), ~ replace_na(., 0))) %>% 
  # Create a formatted table with 'kable' for better presentation
  kbl(caption = "Table 4: Duplicates Within Sponsors (2022)", col.names = c("Sponsor Name", "Tag 0", "Tag 1")) %>% 
  kable_classic()

Table 4: Duplicates Within Sponsors (2022)
Sponsor Name	Tag 0	Tag 1
BANK OF MONTREAL	2950	0
BANK OF NOVA SCOTIA	2402	80
CANADIAN IMPERIAL BANK OF COMMERCE	2999	0
DIRECTCASH BANK	7271	96
EVERLINK PAYMENT SERVICES INC.	2279	0
FEDERATION DES CAISSES DESJARDINS DU QUEBEC	1875	0
MONERIS SOLUTIONS CORPORATION	34540	0
NATIONAL BANK OF CANADA	929	0
ROYAL BANK OF CANADA	3989	0
TORONTO-DOMINION BANK THE	2749	0

For Directcash Bank, the duplicates are due to the same record assigned once to its actual address, and once to a PO Box. Otherwise the records look exactly the same.

Code

tagged_2_2023 %>%
  filter(duplicate_tag_1 != 0) %>%
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name) %>% 
  filter(sponsor_name == "DIRECTCASH BANK") %>% 
  arrange(inst_assigned_term_id) %>% 
  filter(inst_assigned_term_id != "C4066615") %>% 
  filter(inst_assigned_term_id != "C4066617") %>% 
  head(6) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City"))%>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "orange") %>% 
  row_spec(6, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City
DIRECTCASH BANK	DC PAYMENTS CA	C4066599	PO BOX 3725 STN CENTRAL	VICTORIA
DIRECTCASH BANK	DC PAYMENTS CA	C4066599	NELLES BLOCK CFB ESQUIMALT	VICTORIA
DIRECTCASH BANK	DC PAYMENTS CA	C4066616	PO BOX 250 BUILDING 100	SHEARWATER
DIRECTCASH BANK	DC PAYMENTS CA	C4066616	12 WING SHEARWATER BUILDING 100	SHEARWATER
DIRECTCASH BANK	CHARLIES LUNCH	D1016013	PO BOX 172	EASTEND
DIRECTCASH BANK	CHARLIES LUNCH	D1016013	327 RED COAT DR	EASTEND

Code

tagged_2_2023 %>%
  filter(duplicate_tag_1 != 0) %>%
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name) %>% 
  filter(sponsor_name == "BANK OF NOVA SCOTIA") %>% 
  arrange(inst_assigned_term_id) %>% 
  head(6) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City"))%>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "orange") %>% 
  row_spec(6, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City
BANK OF NOVA SCOTIA	SCOTIABANK	AA13	305 PORT UNION RD	WEST HILL
BANK OF NOVA SCOTIA	SCOTIABANK	AA13	305 PORT UNION RD	SCARBOROUGH
BANK OF NOVA SCOTIA	SCOTIABANK	AA21	305 PORT UNION RD	WEST HILL
BANK OF NOVA SCOTIA	SCOTIABANK	AA21	305 PORT UNION RD	SCARBOROUGH
BANK OF NOVA SCOTIA	SCOTIABANK	AE25	1414 18TH ST 1901 RICHMOND AVE	BRANDON
BANK OF NOVA SCOTIA	SCOTIABANK	AE25	1570 18TH ST	BRANDON

Across Sponsors

Code

tagged_2022 <- mc_raw_2022 %>% 
  group_by(inst_assigned_term_id) %>%
  mutate(duplicate_tag = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  ungroup()

tagged_2022 %>% 
  count(sponsor_name, duplicate_tag) %>% 
  pivot_wider(names_from = duplicate_tag, values_from = n, names_prefix = "tag_") %>% 
  mutate(across(where(is.integer), ~ replace_na(., 0))) %>% 
  # Create a formatted table with 'kable' for better presentation
  kbl(caption = "Table 3: Duplicates Across Sponsors (2022)", 
      col.names = c("Sponsor Name", "Tag 0", "Tag 1", "Tag 2")) %>% 
  kable_classic() %>% 
  kable_styling(full_width = FALSE)

Table 3: Duplicates Across Sponsors (2022)
Sponsor Name	Tag 0	Tag 1	Tag 2
BANK OF MONTREAL	2950	0	0
BANK OF NOVA SCOTIA	2307	171	4
CANADIAN IMPERIAL BANK OF COMMERCE	2755	244	0
DIRECTCASH BANK	3312	3991	64
EVERLINK PAYMENT SERVICES INC.	2279	0	0
FEDERATION DES CAISSES DESJARDINS DU QUEBEC	1875	0	0
MONERIS SOLUTIONS CORPORATION	30334	4174	32
NATIONAL BANK OF CANADA	816	113	0
ROYAL BANK OF CANADA	3893	94	2
TORONTO-DOMINION BANK THE	2636	113	0

Most duplicates (around 4000) have the sponsor being Directcash in one entry and the owner being Direct Cash in the other entry

Direct Cash Bank - Moneris

Code

tagged_2022 %>% 
  filter(duplicate_tag == 1) %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name, 
         location_name) %>% 
  filter(inst_assigned_term_id == "D1014793" | 
           inst_assigned_term_id == "D1037192") %>% 
  arrange(inst_assigned_term_id) %>% 
  head(4) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name")) %>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
DIRECTCASH BANK	AVONDALE STORES LIMITED	D1014793	242 KILLALY ST W	PORT COLBORNE	AVONDALE 033
MONERIS SOLUTIONS CORPORATION	DIRECT CASH	D1014793	242 KILLALY ST W	PORT COLBORNE	AVONDALE 033
MONERIS SOLUTIONS CORPORATION	DIRECT CASH	D1037192	252 THE ESPLANADE	TORONTO	VILLAGES MINI MART
DIRECTCASH BANK	DC PAYMENTS CA	D1037192	252 THE ESPLANADE	TORONTO	VILLAGES MINI MART

Deduplication rule: When DIRECTCASH BANK and MONERIS share the same terminal ID, we use the Sponsor Name to determine which record to keep. We drop the record associated with DirectCash Bank and retain the record associated with MONERIS.

Other

For the few duplicate CIBC ATMs, it seems that they are actually owned by Cardtronics, but branded by CIBC.

Code

tagged_2022 %>% 
  filter(duplicate_tag == 1) %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name, 
         location_name) %>% 
  group_by(inst_assigned_term_id) %>%
  filter(all(c("CANADIAN IMPERIAL BANK OF COMMERCE", "MONERIS SOLUTIONS CORPORATION") %in% sponsor_name)) %>%
  ungroup() %>% 
  arrange(desc(inst_assigned_term_id)) %>%
  head(6) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name"))%>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "orange") %>% 
  row_spec(6, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
CANADIAN IMPERIAL BANK OF COMMERCE	CIBC	PI909G9X	1315 COUNTY RD 28	FRASERVILLE	PIONEER GAS STATION
MONERIS SOLUTIONS CORPORATION	CARDTRONICS	PI909G9X	1315 COUNTY ROAD 28	FRASERVILLE	FRASERVILLE OTR 43251
MONERIS SOLUTIONS CORPORATION	CARDTRONICS	PI909G9W	453 DUNLOP ST W	BARRIE	PIONEER OTR42339
CANADIAN IMPERIAL BANK OF COMMERCE	CIBC	PI909G9W	453 DUNLOP ST W	BARRIE	PIONEER GAS STATION
CANADIAN IMPERIAL BANK OF COMMERCE	CIBC	PI909G9V	1089 TALBOT ST	ST THOMAS	PIONEER GAS STATION
MONERIS SOLUTIONS CORPORATION	CARDTRONICS	PI909G9V	1089 TALBOT ST	ST THOMAS	PIONEER OTR 43233

Deduplication rule: Retain the record associated with CIBC

It seems that the RBC and Scotiabank’s “duplicated ATMs” are actually not duplicates, but different ATMs present in their respective branches. This could to be because the terminal ID variable is only 4 digit in each case.

Code

tagged_2022 %>% 
  filter(duplicate_tag == 1) %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name, 
         location_name) %>% 
  group_by(inst_assigned_term_id) %>%
  filter(all(c("ROYAL BANK OF CANADA", "BANK OF NOVA SCOTIA") %in% sponsor_name)) %>%
  ungroup() %>% 
  arrange(desc(inst_assigned_term_id)) %>%
  head(6) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name"))%>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "orange") %>% 
  row_spec(6, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
ROYAL BANK OF CANADA	ROYAL BANK OF CANADA	WD21	85 MAIN ST	MELITA	MELITA BRWH28 AUD
BANK OF NOVA SCOTIA	SCOTIABANK	WD21	1195 ARTHUR ST W	THUNDER BAY	ARTHUR PARKDALE
ROYAL BANK OF CANADA	ROYAL BANK OF CANADA	WC23	220 PORTAGE AVE	WINNIPEG	WINNIPEG MNWE54AUD
BANK OF NOVA SCOTIA	SCOTIABANK	WC23	1091 ST MARYS RD	WINNIPEG	DAKOTA VILLAGE 3
ROYAL BANK OF CANADA	ROYAL BANK OF CANADA	WC21	220 PORTAGE AVE	WINNIPEG	WINNIPEG MAIN BR AUD
BANK OF NOVA SCOTIA	SCOTIABANK	WC21	318 1 ST	WINNIPEGOSIS	WINNIPEGOSIS BRANCH

Deduplication rule: These are not real duplicates. Nothing to do here.

It is also true that NBC and TD’s “duplicated ATMs” are actually not duplicates, but different ATMs present in their respective branches.

Code

tagged_2022 %>% 
  filter(duplicate_tag == 1) %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name, 
         location_name) %>% 
  group_by(inst_assigned_term_id) %>%
  filter(all(c("TORONTO-DOMINION BANK THE", "NATIONAL BANK OF CANADA") %in% sponsor_name)) %>%
  ungroup() %>% 
  arrange(desc(inst_assigned_term_id)) %>%
  head(6) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name"))%>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green") %>% 
  row_spec(5, color = "orange") %>% 
  row_spec(6, color = "orange")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
TORONTO-DOMINION BANK THE	TORONTO-DOMINION BANK	7772	2025 PRINCE OF WALES DR	REGINA	VICTORIA SQUARE
NATIONAL BANK OF CANADA	NATIONAL BANK OF CANADA	7772	560 RUE SACR-COEUR O	ALMA	SUCC FERME
NATIONAL BANK OF CANADA	NATIONAL BANK OF CANADA	7771	3261 1IRE AV	RAWDON	SUCCURSALE
TORONTO-DOMINION BANK THE	TORONTO-DOMINION BANK	7771	2025 PRINCE OF WALES DR	REGINA	VICTORIA SQUARE
NATIONAL BANK OF CANADA	NATIONAL BANK OF CANADA	7770	585 AV SAINT-CHARLES	VAUDREUIL-DORION	SUCCURSALE
TORONTO-DOMINION BANK THE	TORONTO-DOMINION BANK	7770	2025 PRINCE OF WALES DR	REGINA	VICTORIA SQUARE

Deduplication rule: These are not real duplicates. Nothing to do here.

Some Scotiabank ATMs are double counted due to same address but different city names

Code

tagged_2022 %>% 
  filter(duplicate_tag == 1) %>% 
  select(sponsor_name, owner_name, inst_assigned_term_id, address1, city_name, 
         location_name) %>% 
  #group_by(inst_assigned_term_id) %>%
  filter(inst_assigned_term_id == "AA13" | inst_assigned_term_id == "AA21") %>% 
 # filter(all(c("TORONTO-DOMINION BANK THE", "NATIONAL BANK OF CANADA") %in% sponsor_name)) %>%
 # ungroup() %>% 
  arrange(desc(inst_assigned_term_id)) %>%
  head(4) %>% 
  kbl(col.names = c("Sponsor Name", "Owner Name", "Terminal ID", "Address", "City", "Location Name"))%>% 
  kable_classic() %>% 
  row_spec(1, color = "red") %>% 
  row_spec(2, color = "red") %>% 
  row_spec(3, color = "green") %>% 
  row_spec(4, color = "green")

Sponsor Name	Owner Name	Terminal ID	Address	City	Location Name
BANK OF NOVA SCOTIA	SCOTIABANK	AA21	305 PORT UNION RD	WEST HILL	PORT UNION FANFARE 2
BANK OF NOVA SCOTIA	SCOTIABANK	AA21	305 PORT UNION RD	SCARBOROUGH	PORT UNION FANFARE 2
BANK OF NOVA SCOTIA	SCOTIABANK	AA13	305 PORT UNION RD	WEST HILL	PORT UNION FANFARE 1
BANK OF NOVA SCOTIA	SCOTIABANK	AA13	305 PORT UNION RD	SCARBOROUGH	PORT UNION FANFARE 1

Deduplication rule: Keep one observation

Deduplicate (2022)

There are 62159 observations in the 2023 Mastercard data.

Eliminate duplicates within sponsor

Code

# Tag duplicates within the same sponsor and province
df_1_2022 <- mc_raw_2022 %>%
  group_by(sponsor_name, inst_assigned_term_id, state_prov) %>%   
  mutate(tag1 = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  ungroup() %>%
  arrange(state_prov, tag1, inst_assigned_term_id) %>%
  mutate(todrop = 0,
         todrop = if_else(grepl("PO BOX", address1), todrop + 1, todrop),
         todrop = if_else(is.na(latitude), todrop + 1, todrop)) %>% 
  arrange(desc(tag1), desc(inst_assigned_term_id), desc(todrop), desc(address1))

# Drop the duplicates within the same sponsor and province
df_2_2022 <- df_1_2022 %>%
  arrange(tag1, inst_assigned_term_id, todrop, address1) %>%
  group_by(tag1, inst_assigned_term_id) %>%
  mutate(dupeorder = row_number()) %>%
  ungroup() %>%
  filter(tag1 < 1 | dupeorder == 1) %>%
  select(-todrop, -tag1) %>% 
  select(-dupeorder)

Then eliminate duplicates with different sponsors

Code

# Tag duplicates across different sponsors
df_3_2022 <- df_2_2022 %>%
  group_by(inst_assigned_term_id, state_prov) %>%   
  mutate(tag = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  ungroup() %>% 
  mutate(tokeep = 0) %>%
  mutate(tokeep = if_else(tag >= 1 & sponsor_name == "MONERIS SOLUTIONS CORPORATION", tokeep + 1, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "DIRECTCASH BANK", tokeep - 1, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "CANADIAN IMPERIAL BANK OF COMMERCE", tokeep + 1, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "ROYAL BANK OF CANADA", tokeep + 100, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "BANK OF NOVA SCOTIA", tokeep + 100, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "NATIONAL BANK OF CANADA", tokeep + 100, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "TORONTO-DOMINION BANK THE", tokeep + 100, tokeep)) %>%
  arrange(inst_assigned_term_id, desc(tokeep)) #%>% 
  #arrange(tokeep)

# Final filtering and cleaning
df_4_2022 <- df_3_2022 %>% 
  group_by(inst_assigned_term_id) %>%
  mutate(dupeorder = row_number()) %>%
  ungroup() %>%
  filter(tokeep >= 10 | dupeorder == 1) %>%
  select(-tokeep, -tag) #%>% 
  # drop direct cash bank
  #filter(sponsor_name != "DIRECTCASH BANK")

After deduplication, there are 57697 observations in the 2022 Mastercard data.

Deduplicate (2019)

Code

mc_raw_2019 <- read_csv("data/unprocessed/mastercard/BOC_All_ATMImage_(Nov 2019).txt", guess_max = 50000) %>% 
  clean_names()

There are 60568 observations in the 2019 Mastercard data.

There are no duplicates within the same sponsor to eliminate

We eliminate duplicates with different sponsors

Code

 # mc_raw_2019 %>%
 #  rename(inst_assigned_term_id = terminal_id) %>% 
 #  group_by(inst_assigned_term_id, state_prov) %>%   
 #  mutate(tag = case_when(
 #    n() == 1 ~ 0,
 #    n() == 2 ~ 1,
 #    n() > 2 ~ 2
 #  )) %>% 
 #  ungroup() %>% 
 #  count(tag)

# table <- mc_raw_2019 %>%
#   rename(inst_assigned_term_id = terminal_id) %>% 
#   group_by(inst_assigned_term_id, state_prov) %>%   
#   mutate(tag = case_when(
#     n() == 1 ~ 0,
#     n() == 2 ~ 1,
#     n() > 2 ~ 2
#   )) %>% 
#   ungroup() 

# table %>% 
#   count(sponsor_name, tag) %>% 
#   pivot_wider(names_from = tag, values_from = n, names_prefix = "tag_") %>% 
#   # Replace missing values (NA) with 0 in the reshaped data
#   mutate(across(where(is.integer), ~ replace_na(., 0)))

# Tag duplicates across different sponsors
df_2_2019 <- mc_raw_2019 %>%
  rename(inst_assigned_term_id = terminal_id) %>% 
  group_by(inst_assigned_term_id, state_prov) %>%   
  mutate(tag = case_when(
    n() == 1 ~ 0,
    n() == 2 ~ 1,
    n() > 2 ~ 2
  )) %>% 
  ungroup() %>% 
  mutate(tokeep = 0) %>%
  mutate(tokeep = if_else(tag >= 1 & sponsor_name == "MONERIS SOLUTIONS CORPORATION", tokeep + 1, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "DIRECTCASH BANK", tokeep - 1, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "CANADIAN IMPERIAL BANK OF COMMERCE", tokeep + 1, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "ROYAL BANK OF CANADA", tokeep + 100, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "BANK OF NOVA SCOTIA", tokeep + 100, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "NATIONAL BANK OF CANADA", tokeep + 100, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "BANK OF MONTREAL", tokeep + 100, tokeep),
         tokeep = if_else(tag >= 1 & sponsor_name == "TORONTO DOMINION BANK THE", tokeep + 100, tokeep)) %>%
  arrange(inst_assigned_term_id, desc(tokeep))

# Final filtering 
df_3_2019 <- df_2_2019 %>% 
  group_by(inst_assigned_term_id, state_prov) %>%
  mutate(dupeorder = row_number()) %>%
  ungroup() %>%
  filter(tokeep >= 10 | dupeorder == 1) %>%
  select(-tokeep, -tag)

After deduplication, there are 60401 observations in the 2019 Mastercard data.

Step 2: Updating Desjardins ATM Data

The Mastercard data for Desjardins is unreliable, as the ATM counts for 2019, 2022, and 2023 remain unchanged, which is inconsistent given Desjardins’ ongoing ATM closures (see Table 5 below). Therefore, we rely on external data sources to update the 2022 and 2023 datasets.

Code

mc_raw_2019 <- read_csv("data/unprocessed/mastercard/BOC_All_ATMImage_(Nov 2019).txt", guess_max = 50000) %>% 
  clean_names()

desjardins_19 <- mc_raw_2019 %>% 
  count(owner_name) %>% 
  filter(owner_name == "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC" 
         | owner_name == "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO") %>% 
  mutate(date = "Nov 2019")

desjardins_22 <- df_4_2022 %>% 
  count(owner_name) %>% 
  filter(owner_name == "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC" 
         | owner_name == "LA FEDERATION DES CAISSES POPULAIRES DE LONTARIO") %>% 
  mutate(date = "Nov 2022")

desjardins_23 <- df_4_2023 %>% 
  count(owner_name) %>% 
   filter(owner_name == "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC" 
         | owner_name == "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO") %>% 
  mutate(date = "Dec 2023")

desjardins_19 %>% 
  bind_rows(desjardins_22) %>% 
  bind_rows(desjardins_23) %>% 
  mutate(
    owner_name = case_when(
    str_detect(owner_name, "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC") ~ "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC",
    str_detect(owner_name, "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO|LA FEDERATION DES CAISSES POPULAIRES DE LONTARIO") ~ "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO",
    TRUE ~ owner_name
  )) %>% 
   pivot_wider(names_from = date, values_from = n) %>% 
  kbl(caption = "Table 5: Desjardins ATM Counts Across Years (Mastercard Data)", col.names = c("Owner Name", "November 2019", "November 2022", "November 2023"))%>% 
  kable_classic()

In the fall of 2022 and 2023, we scraped Desjardins ATM data directly from their website. Unfortunately, we couldn’t do this for 2019, so our improvements focus on the 2022 and 2023 datasets. The updated matching process for Desjardins ATMs includes the following steps:

Fuzzy Matching: We perform fuzzy matching on the addresses from the Mastercard ATMs to the webscraped ATMs using the fuzzyjoin package. This uses Jaro-Winkler string distance with a higher threshold of 0.7 to ensure every Mastercard ATM is initially matched to something in the webscraped data. From there, we select the top 3 closest matches and prioritize those with the same street number. If no match with the same street number is found, we take the closest one by string distance and filter again based on street number.
Manual Review: We manually review the unmatched Mastercard ATMs to catch any false negatives missed during the fuzzy matching process. We also review matched ATMs for potential false positives. This results in two datasets: finalized matches (including both fuzzy and manually matched data) and unmatched Mastercard ATMs.
Consolidation: The string-matched and manually reviewed ATMs are consolidated into a single matched dataset.
Final Integration: Since the webscraped data is more current and reliable, we add any unmatched webscraped ATMs back into the consolidated dataset to ensure complete coverage.

In summary, the final matched dataset is built through initial fuzzy matching, a manual review process, and integrating unmatched webscraped ATMs to ensure the most accurate and complete data.

Code

## READ IN OUR WEBSCRAPED DATA


# read in webscraped Desjardins ATM data 2023
desjardins_atm_2023_webscraped <- read_csv("data/unprocessed/webscraped/desjardins_2023_atm_geocoded.csv") %>% 
  mutate(latitude = lat) %>% 
  mutate(longitude = lng) %>% 
  select(-lat, -lng) %>% 
  rename(webscraped_id = id) %>% 
  # clean addresses
  mutate(address_clean = str_replace_all(address_clean, ",", "")) %>% 
  mutate(address_clean = str_replace_all(address_clean, " Québec", "")) %>% 
  mutate(address_clean = paste0(address_clean, " Québec")) %>% 
  mutate(address_clean = str_replace_all(address_clean, "-", "")) %>%
  
  # renaming address for id 350 "The ATM will be closed from August 14 to September 1 2023. 501 rue Beaudoin SaintLéonardd’Aston J0C 1M0 Québec"
  mutate(address_clean = ifelse(webscraped_id == 350, "501 rue Beaudoin Saint-Léonard-d’Aston J0C 1M0 Québec", address_clean)) %>% 
  
  # renaming address for id 777 "The ATMs will be closed until April 17 2023. 170 boulevard Harwood VaudreuilDorion J7V 1Y2 Québec"
  mutate(address_clean = ifelse(webscraped_id == 777, "170 boulevard Harwood Vaudreuil-Dorion J7V 1Y2 Québec", address_clean)) %>% 
  
  # renaming address for id 758 "The ATMs are closed from midnight to 5:00 a.m. 745 rue Roy Est Montréal H2L 1E1 Québec "
  mutate(address_clean = ifelse(webscraped_id == 758, "745 rue Roy Est Montréal H2L 1E1 Québec", address_clean)) %>% 
  mutate(
    number_part = str_extract(address_clean, "^\\d+"),
    address_part = str_remove(address_clean, "^\\d+\\s*")
  ) 

# read in webscraped Desjardins ATM data 2022
desjardins_atm_2022_webscraped <- read_csv("data/unprocessed/webscraped/desjardins_2022_atm_geocoded.csv") %>% 
  mutate(latitude = lat) %>% 
  mutate(longitude = lng) %>% 
  select(-lat, -lng, -score, -caisse_atm_name) %>%
  filter(caisse_caisse_name != "Desjardins Ontario Credit Union") %>% 
  rename(webscraped_id = id, name = caisse_caisse_name, address_clean = caisse_atm_address) %>% 
  mutate(address_clean = str_replace_all(address_clean, ",", " ")) %>% 
  mutate(address_clean = str_replace_all(address_clean, "Québec", "")) %>% 
  
  mutate(address_clean = paste0(address_clean, "Québec")) %>% 
  mutate(address_clean = str_replace_all(address_clean, "  ", " ")) %>% 

  mutate(address_clean = str_replace_all(address_clean, "-", "")) %>%
  mutate(address_clean = str_squish(address_clean))  %>%  
  mutate(
    number_part = str_extract(address_clean, "^\\d+"),
    address_part = str_remove(address_clean, "^\\d+\\s*")
  ) %>% 
  mutate(address_clean = str_replace_all(address_clean, "-", "")) 

# read in webscraped Desjardins 2022 ATM
desjardins_ontario_atm_2022 <- read_csv("data/unprocessed/webscraped/desjardins_ontario_atm_2022.csv") %>% 
  mutate(address_clean = str_replace_all(string = atm_address, "\n", " ")) %>% 
  select(-atm_address, -atm_name)

# read in web scraped Ontario desjardins ATMs
desjardins_ontario_atm_2023 <- read_csv("data/unprocessed/webscraped/desjardins_ontario_atm_2023.csv") %>% 
  mutate(address_clean = str_replace_all(string = atm_address, "\n", " ")) %>% 
  select(-atm_address, -atm_name)


# read in ontario web scraped branches
desjardins_branch_ont_23 <- read_csv("data/unprocessed/webscraped/desjardins_branches_ontario_2023.csv") %>% 
  select(branch_address) %>% 
  mutate(branch_address = str_replace_all(branch_address, "\n", " ")) %>%
  mutate(branch_id = row_number()) %>% 
  mutate(address_clean = branch_address) %>% 
  mutate(address_clean = str_remove_all(address_clean, " Ontario ")) %>% 
  mutate(address_clean = str_replace_all(address_clean, ",", " ")) %>% 
  mutate(address_clean = paste0(address_clean, " ", "Ontario")) %>% 
  mutate(address_clean = str_squish(address_clean)) %>% 
  select(-branch_address) %>% 
  mutate(
    number_part = str_extract(address_clean, "^\\d+"),
    address_part = str_remove(address_clean, "^\\d+\\s*")
  )

desjardins_branch_ont_22 <- read_csv("data/unprocessed/webscraped/desjardins_branches_ontario_2022.csv") %>% 
  select(branch_address) %>% 
  mutate(branch_address = str_replace_all(branch_address, "\n", " ")) %>%
  mutate(branch_id = row_number()) %>% 
  mutate(address_clean = branch_address) %>% 
  mutate(address_clean = str_remove_all(address_clean, " Ontario ")) %>% 
  mutate(address_clean = str_replace_all(address_clean, ",", " ")) %>% 
  mutate(address_clean = paste0(address_clean, " ", "Ontario")) %>% 
  mutate(address_clean = str_squish(address_clean)) %>% 
  select(-branch_address) %>% 
  mutate(
    number_part = str_extract(address_clean, "^\\d+"),
    address_part = str_remove(address_clean, "^\\d+\\s*")
  )

## Subset deduplicated 2023 MasterCard data for quebec desjardin ATMs
desjardins_2023_mc <- df_4_2023 %>% 
  filter(owner_name == "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC") %>% 
  mutate(address_clean = paste0(address1, " ", city_name, " ", postal_code, " Quebec")) %>% 
  mutate(address_clean = str_to_lower(address_clean)) %>% 
  select(inst_assigned_term_id, owner_name, latitude, longitude, address_clean) %>% 
  mutate(mc_id = row_number()) %>% 
  mutate(address_clean = str_replace_all(address_clean, pattern = "quebec", "Québec")) %>% 
 # mutate(address_clean = str_replace_all(address_clean, pattern = " qubec", "")) %>%
  mutate(address_clean = str_replace_all(address_clean, pattern = "montral", "Montréal")) %>% 
  mutate(address_clean = address_clean %>%
           str_replace_all("\\bav\\b|\\bave\\b", "avenue") %>%
           str_replace_all("\\brte\\b", "route") %>%
           str_replace_all("\\bboul\\b", "boulevard") %>%
           str_replace_all("\\bch\\b", "chemin")) %>% 
  mutate(
    number_part = str_extract(address_clean, "^\\d+"),
    address_part = str_remove(address_clean, "^\\d+\\s*")
  )

## Subset deduplicated 2023 MasterCard data for Ontario desjardin ATMs

desjardins_2023_mc_ontario <- df_4_2023 %>% 
  filter(owner_name == "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO") %>% 
  mutate(address_clean = paste0(address1, " ", city_name, " ", postal_code, " Ontario")) %>% 
  mutate(address_clean = str_to_lower(address_clean)) %>% 
  select(inst_assigned_term_id, owner_name, latitude, longitude, address_clean) %>% 
  mutate(mc_id = row_number()) %>% 
  mutate(
    number_part = str_extract(address_clean, "^\\d+"),
    address_part = str_remove(address_clean, "^\\d+\\s*")
  )


## Subset deduplicated 2022 MasterCard data for quebec desjardin ATMs
desjardins_2022_mc <- df_4_2022 %>% 
  filter(owner_name == "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC") %>% 
  mutate(address_clean = paste0(address1, " ", city_name, " ", postal_code, " Quebec")) %>% 
  mutate(address_clean = str_to_lower(address_clean)) %>% 
  select(inst_assigned_term_id, owner_name, latitude, longitude, address_clean) %>% 
  mutate(mc_id = row_number()) %>% 
  mutate(address_clean = str_replace_all(address_clean, pattern = "quebec", "Québec")) %>% 
  mutate(address_clean = str_replace_all(address_clean, pattern = "montral", "Montréal")) %>% 
   mutate(address_clean = address_clean %>%
           str_replace_all("\\bav\\b|\\bave\\b", "avenue") %>%
           str_replace_all("\\brte\\b", "route") %>%
           str_replace_all("\\bboul\\b", "boulevard") %>%
           str_replace_all("\\bch\\b", "chemin")) %>% 
  mutate(
    number_part = str_extract(address_clean, "^\\d+"),
    address_part = str_remove(address_clean, "^\\d+\\s*")
  )

## Subset deduplicated 2023 MasterCard data for Ontario desjardin ATMs
desjardins_2022_mc_ontario <- df_4_2022 %>% 
  filter(owner_name == "LA FEDERATION DES CAISSES POPULAIRES DE LONTARIO") %>% 
  mutate(address_clean = paste0(address1, " ", city_name, " ", postal_code, " Ontario")) %>% 
  mutate(address_clean = str_to_lower(address_clean)) %>% 
  select(inst_assigned_term_id, owner_name, latitude, longitude, address_clean) %>% 
  mutate(mc_id = row_number()) %>% 
  mutate(
    number_part = str_extract(address_clean, "^\\d+"),
    address_part = str_remove(address_clean, "^\\d+\\s*")
  )

String Match (2023)

Code

library(downloadthis)
library(gt)
library(fuzzyjoin)

# Perform fuzzy matching
matched_data_23 <- stringdist_join(
  x = desjardins_2023_mc,
  y = desjardins_atm_2023_webscraped,
  by = "address_clean",
  method = "jw",
  max_dist = 0.7,
  ignore_case = TRUE,
  mode = "inner",
  distance_col = "dist"
)

best_matches_23 <- matched_data_23 %>%
  group_by(mc_id) %>%
  arrange(dist) %>%  # Arrange by string distance in ascending order
  slice_head(n = 3) %>%  # Take the top 3 closest matches by string distance
  # check if they have the same address number
  mutate(
    within_top_3_same_number = ifelse(number_part.x == number_part.y, 1, 0)
  ) %>% 
  # Prioritize same street number first, then lowest distance within top 3
  arrange(desc(within_top_3_same_number), dist) %>%  # Prioritize same street number first, then lowest distance within top 3
  # Select the best match according to the new priority
  slice(1) %>%  
  ungroup() %>%
  select(-latitude.y, -longitude.y, -name) %>%
  rename(
    longitude = longitude.x,
    latitude = latitude.x,
    address_mc = address_clean.x,
    address_webscraped = address_clean.y,
    street_number_mc = number_part.x,
    street_number_web = number_part.y,
    address_part_mc = address_part.x,
    address_part_web = address_part.y
  ) %>% 
  select(mc_id, webscraped_id, everything()) %>% 
  mutate(same_street_number = ifelse(street_number_mc == street_number_web, 1, 0)) 

best_matches_23_v1 <- best_matches_23 %>% 
  select(mc_id, webscraped_id, address_mc, address_webscraped, dist, same_street_number) %>% 
  rename(str_distance = dist) %>% 
  filter(same_street_number == 1) 

# Identify unmatched records in dataset 1 (to discard)

unmatched_mc <- desjardins_2023_mc %>%
  anti_join(best_matches_23_v1, by = "mc_id")

# Identify unmatched records in dataset 2 (to retain)
unmatched_webscraped <- desjardins_atm_2023_webscraped %>%
  anti_join(best_matches_23_v1, by = "webscraped_id")

We matched 1423 ATMs from the Mastercard data to the web scraped data after string matching. You can download the data below.

Code

library(downloadthis)

best_matches_23_v1 %>%
  download_this(
    output_name = "desjardin_matching_23",
    output_extension = ".xlsx",
    button_label = "Download data as xlsx",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

Next, we review the ATMs that were rejected from both the Mastercard and webscraped data. The table below gives us the ATMs from the webscraped data that did not meet the threshold for string matching

Code

# library(DT)
# unmatched_webscraped %>% 
#   select(-name, -number_part, -address_part) %>% 
#   datatable() 

unmatched_webscraped %>%
  select(webscraped_id, address_clean, latitude, longitude) %>% 
  download_this(
    output_name = "unmatched_webscraped_2023_v1",
    output_extension = ".xlsx",
    button_label = "Download data as xlsx",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

The table below gives us the ATMs from the Mastercard data that did not meet the threshold for string matching

Code

library(DT)
# unmatched_mc %>% 
#   select(mc_id, address_clean, latitude, longitude) %>% 
#   datatable() 

unmatched_mc %>%
  select(mc_id, address_clean, latitude, longitude) %>% 
  download_this(
    output_name = "unmatched_mc_2023_v1",
    output_extension = ".xlsx",
    button_label = "Download data as xlsx",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

Manual Review (2023)

Using the data above, we manually match the false negatives based on their IDs.

Code

# Define the manually matched IDs based on your input
matched_mc_webscraped <- tibble::tribble(
  ~mc_id, ~webscraped_id,
  216, 99,
  274, 109,
  348, 371,
  351, 371,
  490, 162,
  491, 162,
  639, 678,
  640, 678,
  721, 841,
  729, 687,
  746, 845,
  779, 751,
  780, 751,
  792, 882,
  807, 856,
  901, 703,
  902, 703,
  903, 703,
  911, 704,
  912, 704,
  913, 704,
  917, 860,
  918, 860,
  919, 860,
  1043, 656,
  1048, 659,
  1111, 792,
  1112, 791,
  1113, 790,
  1114, 791,
  1115, 791,
  1117, 790,
  1183, 666,
  1184, 666,
  1192, 827,
  1330, 460,
  1345, 465,
  1346, 465,
  1390, 69,
  1431, 293,
  1434, 293,
  1459, 314,
  1508, 896,
  1509, 896,
  1510, 896,
  1572, 510,
  1573, 510,
  1575, 510,
  1589, 523,
  1590, 523,
  1605, 479,
  1611, 479,
  1612, 479,
  1615, 479,
  1620, 479,
  1634, 540,
  1635, 540,
  1685, 919
)

Results of Manual Matching

Code

# join Id's that we matched manually
mc_clean <- unmatched_mc %>% 
  rename(address_clean_mc = address_clean) %>% 
  rename(latitude_mc = latitude,
         longitude_mc = longitude) %>% 
  select(-number_part, -address_part) %>% 
  select(mc_id, owner_name, address_clean_mc, latitude_mc, longitude_mc)

web_clean <- unmatched_webscraped %>% 
  rename(address_web = address_clean) %>% 
  rename(latitude_web = latitude,
         longitude_web = longitude) %>% 
  select(-name, -number_part, -address_part)

# join manually matched id
matched_mc_webscraped %>% 
  inner_join(mc_clean) %>% 
  inner_join(web_clean) %>% 
  select(mc_id, webscraped_id, address_clean_mc, address_web) %>% 
  datatable()

We consolidate the string-matched and manually matched data, then add the unmatched webscraped data to create the final dataset for Desjardins ATMs in Quebec.

Code

# unmatched mastercard after removing the false negatives
unmatched_mc_final <- unmatched_mc %>%
  filter(!mc_id %in% matched_mc_webscraped$mc_id)

# unmatched webscraped after removing the false negatives
# we will add these below
unmatched_webscraped_final <- unmatched_webscraped %>%
  filter(!webscraped_id %in% matched_mc_webscraped$webscraped_id) %>% 
  select(-name) %>% 
  rename(latitude_web = latitude,
         longitude_web = longitude,
         address_web = address_clean)

matched_final <- matched_mc_webscraped %>% 
   inner_join(mc_clean) %>% 
   inner_join(web_clean) %>% 
   bind_rows(unmatched_webscraped_final %>% select(-number_part, -address_part)) %>% 
   mutate(address_clean = address_web) %>% 
   mutate(address_clean = 
            ifelse(webscraped_id == 350, 
                   "501 rue Beaudoin Saint-Léonard-d’Aston Québec J0C 1M0", address_clean)) %>% 
   mutate(address_clean = 
            ifelse(webscraped_id == 777, 
                   "170 boulevard Harwood Vaudreuil-Dorion Québec J7V 1Y2", address_clean)) %>% 
   mutate(address_clean = 
            ifelse(webscraped_id == 758, 
                   "745 rue Roy Est Montréal Québec H2L 1E1", address_clean)) %>% 
   filter(webscraped_id != 488) %>% 
   filter(webscraped_id != 778) %>% 
   mutate(latitude = latitude_mc,
          longitude = longitude_mc) %>% 
   mutate(latitude = ifelse(is.na(latitude), latitude_web, latitude)) %>% 
   mutate(longitude = ifelse(is.na(longitude), longitude_web, longitude)) %>%  
   mutate(owner_name = ifelse(is.na(owner_name), "Desjardins (webscraped)", owner_name)) %>% 
   select(mc_id, webscraped_id, owner_name, 
          address_webscraped = address_web, address_mc = address_clean_mc, 
          address_clean, latitude, longitude) %>%
   mutate(matched = "manual") %>%
   mutate(matched = ifelse(is.na(mc_id), "unmatched webscraped", matched)) %>% 
   select(mc_id, webscraped_id, owner_name, address_clean, latitude, longitude, matched)

Lastly, we remove the false positives.

Code

remove <- c(630, 847, 848, 1199, 1216, 1224, 1225, 1339, 1677, 1693)

best_matches_23_v2 <- best_matches_23 %>% 
  filter(same_street_number == 1) %>% 
  select(mc_id, webscraped_id, inst_assigned_term_id, 
         owner_name, address_clean = address_webscraped, latitude, longitude) %>% 
  filter(!mc_id %in% remove) %>% 
  mutate(matched = "string matched")

Final data for Quebec Desjardin in 2023.

Code

best_matches_23_v3 <- best_matches_23_v2 %>% 
  bind_rows(matched_final) 

best_matches_23_v3 %>%
  download_this(
    output_name = "desjardins_matched_2023",
    output_extension = ".xlsx",
    button_label = "Download data as xlsx",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

Ontario (2023)

Next, we will match the Ontario Desjardins ATMs. This matching procedure is the same as the one used for Quebec.

Code

#  Perform fuzzy matching
matched_data_23_ont <- stringdist_join(
   x = desjardins_2023_mc_ontario,
   y = desjardins_branch_ont_23,
   by = "address_clean",
   method = "jw",
   max_dist = 0.7,
   ignore_case = TRUE,
   mode = "inner",
   distance_col = "dist"
 )

best_matches_ont_23 <- matched_data_23_ont %>%
  group_by(mc_id) %>%
  arrange(dist) %>%  # Arrange by string distance in ascending order
  slice_head(n = 3) %>%  # Take the top 3 closest matches by string distance
  # check if they have the same address number
  mutate(
    within_top_3_same_number = ifelse(number_part.x == number_part.y, 1, 0)
  ) %>% 
  # Prioritize same street number first, then lowest distance within top 3
  arrange(desc(within_top_3_same_number), dist) %>%  
  # Prioritize same street number first, then lowest distance within top 3
  # Select the best match according to the new priority
  slice(1) %>%  
  ungroup() %>%
  rename(
    address_mc = address_clean.x,
    address_branch = address_clean.y,
    street_number_mc = number_part.x,
    street_number_branch = number_part.y,
    address_part_mc = address_part.x,
    address_part_branch = address_part.y
  ) %>% 
  select(mc_id, branch_id, everything()) %>% 
  mutate(same_street_number = ifelse(street_number_mc == street_number_branch, 1, 0)) %>% 
  select(mc_id, branch_id, address_mc, address_branch, dist, latitude, 
         longitude, inst_assigned_term_id, owner_name, same_street_number)

best_matches_ont_23_v1 <- best_matches_ont_23 %>% 
  filter(same_street_number == 1)

 # Identify unmatched records in dataset 1 (to discard)
unmatched_mc_ontario_23 <- desjardins_2023_mc_ontario %>%
  anti_join(best_matches_ont_23_v1, by = "mc_id") %>% 
  select(mc_id, address_clean, latitude, longitude, inst_assigned_term_id, owner_name) %>% 
  arrange(address_clean)

unmatched_branches_ontario_23 <- desjardins_branch_ont_23 %>%
  anti_join(best_matches_ont_23_v1, by = "branch_id") %>% 
  select(-address_part, -number_part) %>% 
  arrange(address_clean)
# 
# 
# best_matches_ont_23_v1 %>% write_excel_csv("data/temp/match/ontario_23_desjardins.csv")
# 
# unmatched_branches_ontario_23 %>% write_excel_csv("data/temp/match/ont_23_desjardins_unmatched_branches.csv")
# 
# unmatched_mc_ontario_23 %>% write_excel_csv("data/temp/match/ont_23_desjardins_unmatched_mc.csv")

Using the data above, we manually match the false negatives based on their IDs.

Code

# Define the manually matched IDs based on your input
manual_matched_mc_ont_23 <- tibble::tribble(
  ~mc_id, ~branch_id,
  50, 4,
  48, 2,
  56, 15,
  51, 11
)


matched_ont_23 <- manual_matched_mc_ont_23 %>% 
   inner_join(unmatched_mc_ontario_23) %>% 
   mutate(matched = "manual")

We remove the false positives and add manual matched

Code

remove_ont_23 <- c(3)

best_matches_ont_23_v2 <- best_matches_ont_23_v1 %>% 
  select(mc_id, branch_id, inst_assigned_term_id, 
         owner_name, address_clean = address_branch, latitude, longitude) %>% 
  filter(!mc_id %in% remove_ont_23) %>% 
  mutate(matched = "string matched")

best_matches_ont_23_v3 <- best_matches_ont_23_v2 %>% 
  bind_rows(matched_ont_23)

String Match (2022)

Code

library(downloadthis)
library(gt)
library(fuzzyjoin)

# Perform fuzzy matching
matched_data_22 <- stringdist_join(
  x = desjardins_2022_mc,
  y = desjardins_atm_2022_webscraped,
  by = "address_clean",
  method = "jw",
  max_dist = 0.7,
  ignore_case = TRUE,
  mode = "inner",
  distance_col = "dist"
)

best_matches_22 <- matched_data_22 %>%
  group_by(mc_id) %>%
  arrange(dist) %>%  # Arrange by string distance in ascending order
  slice_head(n = 3) %>%  # Take the top 3 closest matches by string distance
  # check if they have the same address number
  mutate(
    within_top_3_same_number = ifelse(number_part.x == number_part.y, 1, 0)
  ) %>% 
  # Prioritize same street number first, then lowest distance within top 3
  arrange(desc(within_top_3_same_number), dist) %>%  # Prioritize same street number first, then lowest distance within top 3
  # Select the best match according to the new priority
  slice(1) %>%  
  ungroup() %>%
  select(-latitude.y, -longitude.y, -name) %>%
  rename(
    longitude = longitude.x,
    latitude = latitude.x,
    address_mc = address_clean.x,
    address_webscraped = address_clean.y,
    street_number_mc = number_part.x,
    street_number_web = number_part.y,
    address_part_mc = address_part.x,
    address_part_web = address_part.y
  ) %>% 
  select(mc_id, webscraped_id, everything()) %>% 
  mutate(same_street_number = ifelse(street_number_mc == street_number_web, 1, 0)) 

best_matches_22_v1 <- best_matches_22 %>% 
  select(mc_id, webscraped_id, address_mc, address_webscraped, dist, same_street_number) %>% 
  rename(str_distance = dist) %>% 
  filter(same_street_number == 1) 

# Identify unmatched records in dataset 1 (to discard)

unmatched_mc_2022 <- desjardins_2022_mc %>%
  anti_join(best_matches_22_v1, by = "mc_id")

# Identify unmatched records in dataset 2 (to retain)
unmatched_webscraped_2022 <- desjardins_atm_2022_webscraped %>%
  anti_join(best_matches_22_v1, by = "webscraped_id")

We matched 1474 ATMs from the Mastercard data to the web scraped data after string matching. You can download the data below.

Code

library(downloadthis)

best_matches_22_v1 %>%
  download_this(
    output_name = "desjardin_matching_22",
    output_extension = ".xlsx",
    button_label = "Download data as xlsx",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

Code

unmatched_webscraped_2022 %>%
  select(webscraped_id, address_clean, latitude, longitude) %>% 
  download_this(
    output_name = "unmatched_webscraped_2022",
    output_extension = ".xlsx",
    button_label = "Download data as xlsx",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

The table below gives us the ATMs from the Mastercard data that did not meet the threshold for string matching

Code

unmatched_mc_2022 %>%
  select(mc_id, address_clean, latitude, longitude) %>% 
  download_this(
    output_name = "unmatched_mc_2022",
    output_extension = ".xlsx",
    button_label = "Download data as xlsx",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

Manual Review (2022)

Using the data above, we manually match the false negatives based on their IDs.

Code

matched_mc_webscraped_22 <- tibble::tribble(
  ~mc_id, ~webscraped_id,
  792, 16,
  1605, 141,
  1611, 141,
  1612, 141,
  1615, 141,
  1620, 141,
  433, 309,
  434, 309,
  1685, 324,
  1181, 480,
  888, 554,
  889, 554,
  1192, 569,
  1113, 632,
  1112, 633,
  1114, 633,
  1115, 633,
  1111, 634,
  1117, 634,
  1048, 898,
  1183, 906,
  1184, 906,
  99, 494,
  216, 196,
  226, 681,
  227, 681,
  274, 206,
  313, 779,
  314, 779,
  348, 176,
  351, 176,
  456, 744,
  457, 744,
  458, 744,
  490, 256,
  491, 256,
  560, 763,
  639, 420,
  640, 420,
  729, 429,
  746, 650,
  807, 661,
  843, 638,
  844, 638,
  862, 549,
  863, 549,
  864, 549,
  968, 619,
  1043, 895,
  1330, 121,
  1358, 45,
  1360, 45,
  1511, 31,
  1512, 31,
  1572, 819,
  1573, 819,
  1575, 819,
  1589, 832,
  1590, 832,
  1606, 142,
  1634, 849,
  1635, 849
)

Results of Manual Matching

Code

library(DT)

# join Id's that we matched manually
mc_clean_22 <- unmatched_mc_2022 %>% 
  rename(address_clean_mc = address_clean) %>% 
  rename(latitude_mc = latitude,
         longitude_mc = longitude) %>% 
  select(-number_part, -address_part) %>% 
  select(mc_id, owner_name, address_clean_mc, latitude_mc, longitude_mc)

web_clean_22 <- unmatched_webscraped_2022 %>% 
  rename(address_web = address_clean) %>% 
  rename(latitude_web = latitude,
         longitude_web = longitude) %>% 
  select(-name, -number_part, -address_part)

# join manually matched id
matched_mc_webscraped_22 %>% 
  inner_join(mc_clean_22) %>% 
  inner_join(web_clean_22) %>% 
  select(mc_id, webscraped_id, address_clean_mc, address_web) %>% 
  datatable()

We consolidate the string-matched and manually matched data, then add the unmatched webscraped data to create the final dataset for Desjardins ATMs in Quebec.

Code

# unmatched mastercard after removing the false negatives
unmatched_mc_final_22 <- unmatched_mc_2022 %>%
  filter(!mc_id %in% matched_mc_webscraped_22$mc_id)

# unmatched webscraped after removing the false negatives
# we will add these below
unmatched_webscraped_final_22 <- unmatched_webscraped_2022 %>%
  filter(!webscraped_id %in% matched_mc_webscraped_22$webscraped_id) %>% 
  select(-name) %>% 
  rename(latitude_web = latitude,
         longitude_web = longitude,
         address_web = address_clean)

matched_final_22 <- matched_mc_webscraped_22 %>% 
   inner_join(mc_clean_22) %>% 
   inner_join(web_clean_22) %>% 
   bind_rows(unmatched_webscraped_final_22 %>% select(-number_part, -address_part)) %>% 
   mutate(address_clean = address_web) %>% 
 #  mutate(address_clean = ifelse(webscraped_id == 350, "501 rue Beaudoin Saint-Léonard-d’Aston Québec J0C 1M0", address_clean)) %>% 
 #  mutate(address_clean = ifelse(webscraped_id == 777, "170 boulevard Harwood Vaudreuil-Dorion Québec J7V 1Y2", address_clean)) %>% 
 #  mutate(address_clean = ifelse(webscraped_id == 758, "745 rue Roy Est Montréal Québec H2L 1E1", address_clean)) %>% 
  # filter(webscraped_id != 488) %>% 
  # filter(webscraped_id != 778) %>% 
   mutate(latitude = latitude_mc,
          longitude = longitude_mc) %>% 
   mutate(latitude = ifelse(is.na(latitude), latitude_web, latitude)) %>% 
   mutate(longitude = ifelse(is.na(longitude), longitude_web, longitude)) %>%  
   mutate(owner_name = ifelse(is.na(owner_name), "Desjardins (webscraped)", owner_name)) %>% 
   select(mc_id, webscraped_id, owner_name, 
          address_webscraped = address_web, address_mc = address_clean_mc, address_clean, latitude, longitude) %>%
   mutate(matched = "manual") %>%
   mutate(matched = ifelse(is.na(mc_id), "unmatched webscraped", matched)) %>% 
   select(mc_id, webscraped_id, owner_name, address_clean, latitude, longitude, matched)

Remove the false positives.

Code

remove <- c(1339, 288, 1225, 847, 848, 1216, 
            1199, 1224, 1233)

best_matches_22_v2 <- best_matches_22 %>% 
  filter(same_street_number == 1) %>% 
  select(mc_id, webscraped_id, inst_assigned_term_id, owner_name, address_clean = address_webscraped, latitude, longitude) %>% 
  filter(!mc_id %in% remove) %>% 
  mutate(matched = "string matched")

Final Desjardins data (Quebec 2022)

Code

best_matches_22_v3 <- best_matches_22_v2 %>% 
  bind_rows(matched_final_22) 

best_matches_22_v3 %>%
  download_this(
    output_name = "desjardins_matched_2022",
    output_extension = ".xlsx",
    button_label = "Download data as xlsx",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

Ontario (2022)

Code

#  Perform fuzzy matching
matched_data_22_ont <- stringdist_join(
  x = desjardins_2022_mc_ontario,
  y = desjardins_branch_ont_22,
  by = "address_clean",
  method = "jw",
  max_dist = 0.7,
  ignore_case = TRUE,
  mode = "inner",
  distance_col = "dist"
)

best_matches_ont_22 <- matched_data_22_ont %>%
  group_by(mc_id) %>%
  arrange(dist) %>%  # Arrange by string distance in ascending order
  slice_head(n = 3) %>%  # Take the top 3 closest matches by string distance
  # check if they have the same address number
  mutate(
    within_top_3_same_number = ifelse(number_part.x == number_part.y, 1, 0)
  ) %>% 
  # Prioritize same street number first, then lowest distance within top 3
  arrange(desc(within_top_3_same_number), dist) %>%  
  # Prioritize same street number first, then lowest distance within top 3
  # Select the best match according to the new priority
  slice(1) %>%  
  ungroup() %>%
  rename(
    address_mc = address_clean.x,
    address_branch = address_clean.y,
    street_number_mc = number_part.x,
    street_number_branch = number_part.y,
    address_part_mc = address_part.x,
    address_part_branch = address_part.y
  ) %>% 
  select(mc_id, branch_id, everything()) %>% 
  mutate(same_street_number = ifelse(street_number_mc == street_number_branch, 1, 0)) %>% 
  select(mc_id, branch_id, address_mc, address_branch, dist, latitude, 
         longitude, inst_assigned_term_id, owner_name, same_street_number)

best_matches_ont_22_v1 <- best_matches_ont_22 %>% 
  filter(same_street_number == 1)

# Identify unmatched records in dataset 1 (to discard)
unmatched_mc_ontario_22 <- desjardins_2022_mc_ontario %>%
  anti_join(best_matches_ont_22_v1, by = "mc_id") %>% 
  select(mc_id, address_clean, latitude, longitude, inst_assigned_term_id, owner_name) %>% 
  arrange(address_clean)

unmatched_branches_ontario_22 <- desjardins_branch_ont_22%>%
  anti_join(best_matches_ont_22_v1, by = "branch_id") %>% 
  select(-address_part, -number_part) %>% 
  arrange(address_clean)

# best_matches_ont_22_v1 %>% write_excel_csv("data/temp/match/ontario_22_desjardins.csv")
# 
# unmatched_branches_ontario_22 %>% write_excel_csv("data/temp/match/ont_22_desjardins_unmatched_branches.csv")
#  
# unmatched_mc_ontario_22 %>% write_excel_csv("data/temp/match/ont_22_desjardins_unmatched_mc.csv")

Code

# Define the manually matched IDs based on your input
manual_matched_mc_ont_22 <- tibble::tribble(
  ~mc_id, ~branch_id,
  50, 4,
  48, 2,
  56, 15
  )

matched_ont_22 <- manual_matched_mc_ont_22 %>% 
  inner_join(unmatched_mc_ontario_22) %>% 
  mutate(matched = "manual")

#We remove the false positives. 

remove_ont_22 <- c(3)

best_matches_ont_22_v2 <- best_matches_ont_22_v1 %>% 
  select(mc_id, branch_id, inst_assigned_term_id, 
         owner_name, address_clean = address_branch, latitude, longitude) %>% 
  filter(!mc_id %in% remove_ont_23) %>% 
  mutate(matched = "string matched")

best_matches_ont_22_v3 <- best_matches_ont_22_v2 %>% 
  bind_rows(matched_ont_22)

Consolidate Desjardins

Consolidate Desjardins.

Filter out the Ontario and Quebec Desjardins from the MC data and add back in

Code

library(readxl)
# ontario matched ATMS 2022
best_matches_ont_22_v4 <- best_matches_ont_22_v3 %>% 
  select(-mc_id, -branch_id)

# Quebec matched atms
best_matches_22_v4 <- best_matches_22_v3 %>% 
  select(-mc_id, -webscraped_id)

# consolidate 2022 data
df_5_2022 <- df_4_2022 %>% 
  filter(owner_name != "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC") %>% 
  filter(owner_name != "LA FEDERATION DES CAISSES POPULAIRES DE LONTARIO") %>% 
  bind_rows(best_matches_22_v4) %>% 
  bind_rows(best_matches_ont_22_v4)

# ontario matched ATMS 2023
best_matches_ont_23_v4 <- best_matches_ont_23_v3 %>% 
  select(-mc_id, -branch_id)

# Quebec matched atms 2023
best_matches_23_v4 <- best_matches_23_v3 %>% 
  select(-mc_id, -webscraped_id)

# consolidate 2023 data
df_5_2023 <- df_4_2023 %>% 
  filter(owner_name != "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC") %>% 
  filter(owner_name != "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO") %>% 
  bind_rows(best_matches_23_v4) %>% 
  bind_rows(best_matches_ont_23_v4)

# final desjardins counts

qc_23 <- best_matches_23_v4 %>% 
  mutate(date = "Nov 2023") %>% 
  mutate(owner_name = "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC") %>% 
  count(date, owner_name)

on_23 <- best_matches_ont_23_v4 %>% 
  mutate(date = "Nov 2023") %>% 
  mutate(owner_name = "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO") %>% 
  count(date, owner_name)

qc_22 <- best_matches_22_v4 %>% 
  mutate(date = "Nov 2022") %>% 
  mutate(owner_name = "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC") %>% 
  count(date, owner_name)

on_22 <- best_matches_ont_22_v4 %>% 
  mutate(date = "Nov 2022") %>% 
  mutate(owner_name = "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO") %>% 
  count(date, owner_name)

desjardins_19 %>% 
  bind_rows(qc_23) %>% 
  bind_rows(on_23) %>% 
  bind_rows(qc_22) %>% 
  bind_rows(on_22) %>% 
  mutate(
    owner_name = case_when(
    str_detect(owner_name, "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC") ~ "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC",
    str_detect(owner_name, "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO|LA FEDERATION DES CAISSES POPULAIRES DE LONTARIO") ~ "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO",
    TRUE ~ owner_name
  )) %>% 
   pivot_wider(names_from = date, values_from = n) %>% 
  select(owner_name, `Nov 2019`, `Nov 2022`, `Nov 2023`) %>% 
  kbl(caption = "Table 6: Desjardins ATM Counts After Matching", col.names = c("Owner Name", "November 2019", "November 2022", "November 2023")) %>% 
  kable_classic()

Table 6: Desjardins ATM Counts After Matching
Owner Name	November 2019	November 2022	November 2023
LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC	1706	1581	1533
LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO	63	54	54

Step 3: Geocode

Code

# Create a function to flag missing ATMs for each dataset
flag_missing_atms <- function(data, year) {
  data %>%
    mutate(missing_coords = ifelse(is.na(latitude) | is.na(longitude), 1, 0)) %>%
    summarize(missing_count = sum(missing_coords)) %>%
    mutate(year = year)
}

# Flag missing data for each year
missing_data_2023 <- flag_missing_atms(df_5_2023, 2023)
missing_data_2022 <- flag_missing_atms(df_5_2022, 2022)
missing_data_2019 <- flag_missing_atms(df_3_2019, 2019)

# Combine the missing data counts
missing_data_summary <- bind_rows(missing_data_2023, missing_data_2022, missing_data_2019)

# Plot the missing ATM counts using ggplot2 with counts as text
ggplot(missing_data_summary, aes(x = factor(year), y = missing_count)) +
  geom_bar(stat = "identity", fill = "steelblue", color = "black") +
  geom_text(aes(label = missing_count), vjust = -0.5, size = 3.5) +  # Add text labels
  labs(
    title = "Number of ATMs with Missing Coordinates by Year",
    x = "Year",
    y = "Number of Missing ATMs"
  ) +
  theme_minimal()

Code

data_2019_geocode_updated <- read_csv("data/temp/geocode/data_2019_geocode_updated.csv", guess_max = 40000)

data_2022_geocode_updated <- read_csv("data/temp/geocode/data_2022_geocode_updated.csv", guess_max = 40000)

data_2023_geocode_updated <- read_csv("data/temp/geocode/data_2023_geocode_updated.csv", guess_max = 40000)

The Mastercard data contained observations with missing latitude and longitude coordinates, so we first had to geocode those locations. We used the following variables to geocode the missing coordinates: address, city or town, province, and country. This information was combined into one character vector and sent through the Google Maps API. For example, “1857 Rue Saint Louis, Saint Laurent, QC, Canada” is one address we geocoded.

Step 4: WL ATMs and CU

Prepare CCUA data

First, We remove Desjardins from the 2022 and 2023 CCUA data.

ATMs belonging to the Fédération des Caisses Acadiennes and sponsored by Desjardins are duplicates, since they broke off with Desjardins and are now under the brand Uni Financial. (See https://ici.radio-canada.ca/nouvelle/1967331/uni-cooperation-financiere-desjardins-separation for the news of the separation announcement).

Therefore, we remove Fédération des Caisses Acadiennes from the 2023 Mastercard data and use UNI financial. To avoid duplication, we remove UNI financial from CCUA.

Caisse Alliance ATMs are not available in CCUA data but are available in Vancity and MC (i.e. ALLIANCE DES CAISSES POPULAIRES DE L’ONTARIO). Therefore, we include ALLIANCE DES CAISSES POPULAIRES DE L’ONTARIO atms from MC, and drop the ones in Vancity and CCUA.

Code

# read in ccua data 2022 and clean
ccua_atm_2022 <- read_excel("data/unprocessed/webscraped/ccua_sept16_clean.xlsx", sheet = "full") %>% 
  clean_names() %>% 
  rename(latitude = lat, longitude = long) %>% 
  mutate(latitude = as.double(latitude)) %>% 
  mutate(longitude = as.double(longitude)) %>% 
  mutate(address = str_squish(address)) %>% 
  filter(type == "atm") %>% 
  select(-transit, -bank_legend, -type) %>% 
  rename(owner_name = bank,
         address1 = address) %>% 
  mutate(type = "CU") %>% 
  mutate(
    day = 01,
    month_year = "nov_2022",
    date_dmy = paste0(day, "-", month_year),
    date = dmy(date_dmy),
  ) %>% 
  select(-day, -month_year, -date_dmy) %>% 
  ## REMOVE DESJARDINS from 2022 CCUA DATA
  filter(owner_name != "DESJARDINS")

# read in 2023 ccua
ccua_atm_23 <- read_csv("data/unprocessed/webscraped/ccua_atm_23.csv") %>% 
  rename(latitude = lat, longitude = lng, owner_name = name) %>%
  select(-type) %>% 
  mutate(data = "CCUA") %>% 
  mutate(type = "CU") %>% 
  # remove Desjardins from 2023 data
  filter(owner_name != "Desjardins") %>% 
  # remove Uni Financial Cooperation  
  filter(owner_name != "Uni Financial Cooperation")
# ALLIANCE DES CAISSES POPULAIRES DE L'ONTARIO not avail in CCUA so we will use mastercard

# We will be adding Bruno credit union ATMs to Vancity so we isolate those ATMs.

bu_ccua <- ccua_atm_23 %>% 
  filter(owner_name == "Bruno Savings And Credit Union Limited") %>% 
  mutate(type = ifelse(data == "CCUA", "CU - Bruno Savings", "type"))

Prepare 2023 Vancity data. We remove Desjardins and all FI Bank ATMs from Vancity. And as mentioned above, we remove Caisse Alliance from CCUA 2023.

Code

vancity_dec2023 <- read_excel("data/unprocessed/webscraped/vancity_dec2023.xlsx")

# remove all banks 
vancity_dec2023_cleaned <- vancity_dec2023 %>% 
  select(address = street, latitude = lat, longitude = lon, owner_name = name) %>% mutate(data = "VanCity") %>% 
  mutate(type = "CU") %>% 
  # remove desjardins
  filter(owner_name != "DESJARDINS") %>% 
  filter(owner_name != "Bank of China (Canada)") %>% 
  filter(owner_name != "Canadian Western Bank") %>% 
  filter(owner_name != "HSBC Bank Canada") %>% 
  filter(owner_name != "DESJARDINS") %>% 
  filter(owner_name != "Industrial & Commercial Bank of China (Canada)") %>% 
  filter(owner_name != "Industrial and Commercial Bank of China (Canada)") %>% 
  filter(owner_name != "Korea Exchange Bank of Canada")  %>% 
  filter(owner_name != "Manulife Bank of Canada") %>% 
  filter(owner_name != "National Bank of Canada") %>% 
  filter(owner_name != "Shinhan Bank Canada") %>%
  # remove CAISSE ALLIANCE
  # these seem to be duplicates 
  filter(owner_name != "CAISSE ALLIANCE")  %>% 
  filter(owner_name != "Caisse Alliance") %>% 
  filter(owner_name != "CTC Bank of Canada") %>% 
  filter(owner_name != "First Nations Bank of Canada") %>% 
  filter(owner_name != "ICICI Bank Canada") %>% 
  filter(owner_name != "Laurentian Bank") %>% 
  filter(owner_name != "State Bank of India")

Read in dictionary used for matching

Code

credit_unions <- read_lines("dictionary/credit_unions_list.txt")

banks <- read_lines("dictionary/banks.txt")

2023

Code

mc_atm_2023_cleaned <- data_2023_geocode_updated %>% 
  # first remove FEDERATION DES CAISSES POPULAIRES ACADIENNES
  filter(owner_name != "FEDERATION DES CAISSES POPULAIRES ACADIENNES") %>% 
  mutate(
    type = case_when(
      owner_name %in% credit_unions ~ "CU",
      owner_name %in% banks ~ "FI",
      TRUE ~ "WL"
    ) 
  ) %>% 
  # we are keeping ALLIANCE DES CAISSES POPULAIRES DE L'ONTARIO and UNI
  mutate(type = ifelse(owner_name == "ALLIANCE DES CAISSES POPULAIRES DE L'ONTARIO", "ALLIANCE DES CAISSES", type)) %>% 
  mutate(type = ifelse(owner_name == "UNI FINANCIAL SERVICES AND TECHNOLOGY INC.", "UNI", type)) %>% 
    
  mutate(type = ifelse(owner_name == "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC" | 
                         owner_name == "Desjardins (webscraped)", "Desjardins QC", type)) %>% 
  mutate(type = ifelse(owner_name == "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO", "Desjardins ON", type)) %>% 
  rename(address = address_clean) %>% 
  mutate(address = ifelse(is.na(address), address1, address))

# bind vancity and mc for travel metric data set
# leave in desjardin from MC and UNI from MC
# add BU from ccua 

travel_metric_atms_cleaned <- mc_atm_2023_cleaned %>% 
  ## remove credit unions
  filter(type != "CU") %>% 
  mutate(data = "MC") %>% 
  #
  bind_rows(vancity_dec2023_cleaned) %>% 
  bind_rows(bu_ccua) %>% 
  mutate(
    day = 01,
    month_year = "nov_2023",
    date_dmy = paste0(day, "-", month_year),
    date = dmy(date_dmy)
  ) %>% 
  select(address, latitude, longitude, owner_name, type, data)

travel_metric_atms_cleaned %>% write_csv("data/processed/ATM/atm_23_travel.csv")

atm_2023_final <- mc_atm_2023_cleaned %>% 
  filter(type != "CU") %>% 
  mutate(data = "MC") %>% 
  bind_rows(ccua_atm_23) %>% 
  select(address, latitude, longitude, owner_name, type, data) %>% 
  mutate(type_2 = case_when(
    type == "WL" ~ "WL",
    TRUE ~ "FI"
  )) %>% 
  mutate(
    day = 01,
    month_year = "nov_2023",
    date_dmy = paste0(day, "-", month_year),
    date = dmy(date_dmy)
  )

2022

Code

mc_atm_2022_cleaned <- data_2022_geocode_updated %>% 
  mutate(
    type = case_when(
      owner_name %in% credit_unions ~ "CU",
      owner_name %in% banks ~ "FI",
      TRUE ~ "WL"
    ),
    day = 01,
    month_year = "nov_2022",
    date_dmy = paste0(day, "-", month_year),
    date = dmy(date_dmy)
  ) %>% 
  # flag desjardins
  mutate(type = ifelse(owner_name == "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC" | 
                         owner_name == "Desjardins (webscraped)", "Desjardins QC", type)) %>% 
  mutate(type = ifelse(owner_name == "LA FEDERATION DES CAISSES POPULAIRES DE LONTARIO", "Desjardins ON", type)) %>% 
  rename(address = address_clean) %>% 
  mutate(address = ifelse(is.na(address), address1, address))

atm_2022_final <- mc_atm_2022_cleaned %>% 
  # filter out CU
  filter(type != "CU") %>% 
  select(owner_name, address1, type, latitude, longitude, date) %>% 
  bind_rows(ccua_atm_2022) %>% 
  mutate(type_2 = case_when(
    type == "WL" ~ "WL",
    TRUE ~ "FI"
  ))

2019

Code

atm_2019_final <- data_2019_geocode_updated %>% 
  mutate(
    type = case_when(
      owner_name %in% credit_unions ~ "CU",
      owner_name %in% banks ~ "FI",
      TRUE ~ "WL"),
    day = 01,
    month_year = "nov_2019",
    date_dmy = paste0(day, "-", month_year),
    date = dmy(date_dmy)
  ) %>% 
   mutate(type = ifelse(owner_name == "LA FEDERATION DES CAISSES DESJARDINS DU QUEBEC" | 
                         owner_name == "LA FEDERATION DES CAISSES POPULAIRES DE L'ONTARIO", "Desjardins", type)) %>% 
  
  select(owner_name, address1, type, latitude, longitude, date) %>% 
   mutate(type_2 = case_when(
    type == "WL" ~ "WL",
    TRUE ~ "FI"
  ))

Final

Code

atm_2019_final %>% write_csv("data/processed/ATM/atm_19.csv")

atm_2022_final %>% write_csv("data/processed/ATM/atm_22.csv")

atm_2023_final %>% write_csv("data/processed/ATM/atm_23_density.csv")

# Combine the datasets
atm_combined <- bind_rows(
  atm_2019_final %>% count(type_2, date),
  atm_2022_final %>% count(type_2, date),
  atm_2023_final %>% count(type_2, date)
)

library(scales)

atm_combined %>%
  mutate(date_3 = ifelse(date == "2019-11-01", "2019-Q4", date)) %>%
  mutate(date_3 = ifelse(date == "2022-11-01", "2022-Q4", date_3)) %>%
  mutate(date_3 = ifelse(date == "2023-11-01", "2023-Q4", date_3)) %>%
  ggplot(aes(date_3, n, fill = type_2)) + 
  geom_col(color = "black") +
  geom_text(aes(x = date_3, y = n, label = n, group = type_2),
            position = position_stack(vjust = 0.5)) +
  theme_classic() +
  scale_fill_brewer(palette = "Paired") +
  labs(x = "", y = "# of ABMs", fill = "ABM Type", subtitle = "ABM",
       caption = "FI: Financial Institution ABM\nWL: White-Label ABM") +
  scale_y_continuous(labels = comma) +
  theme(legend.position = "bottom") +
  theme(text = element_text(size = 14.5), axis.text = element_text(size = 13))

Density Metrics

Code

library(tidyverse)
library(janitor)
library(sf)
library(readxl)

# sf s2
sf_use_s2(FALSE)

# load census boundary files
census_subdivision <- st_read("data/unprocessed/csd_boundry/lcsd000b16a_e.shp") %>%
  clean_names() %>%
  st_transform(., crs = 4326)

csd_id <- census_subdivision %>% 
  st_drop_geometry() %>% 
  as_tibble() %>% 
  select(csduid)

branch_2019 <- read_excel("data/processed/density/branch_data.xlsx", sheet = "2019") %>% select(-date)
branch_2022 <- read_excel("data/processed/density/branch_data.xlsx", sheet = "2022") %>% select(-date)
branch_2023 <- read_excel("data/processed/density/branch_data.xlsx", sheet = "2023") %>% select(-date)

atm_19 <- atm_2019_final %>% select(-date) %>% mutate(type = type_2) %>% select(-type_2)
atm_22 <- atm_2022_final %>% select(-date) %>% mutate(type = type_2) %>% select(-type_2)
atm_23 <- atm_2023_final %>% select(-date) %>% mutate(type = type_2) %>% select(-type_2)
#################################################################################################

csd_join <- function(cash_access) {
  
  cash_access_sf <- cash_access %>% 
    st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>% 
    st_join(census_subdivision, join = st_intersects)
  
  csd_col <- census_subdivision %>% colnames()
  
  missing <- cash_access_sf %>% 
    filter(is.na(csduid)) %>% 
    select(-matches(csd_col))
  
  complete <- cash_access_sf %>% 
    filter(!is.na(csduid)) 
  
  missing_joined <- missing %>% 
    st_join(census_subdivision, join = st_nearest_feature) 
  
  complete %>% 
    bind_rows(missing_joined)
}

#####################################

# Process ATMs and Branches for all years
process_data <- function(atm_data, branch_data, atm_year, branch_year) {
  sf_atm <- csd_join(atm_data)
  sf_branch <- csd_join(branch_data)
  
  atm_n <- sf_atm %>% 
    st_drop_geometry() %>% 
    group_by(csduid) %>% 
    summarise(n = n()) %>% 
    ungroup() %>% 
    full_join(csd_id) %>% 
    rename(atm_n = n) %>% 
    mutate(across(where(is.numeric), .fns = ~replace_na(.,0)))
  
  atm_wl <- sf_atm %>% 
    st_drop_geometry() %>% 
    filter(type == "WL") %>% 
    group_by(csduid) %>% 
    summarise(n = n()) %>% 
    ungroup() %>% 
    full_join(csd_id) %>% 
    rename(wl_n = n) %>% 
    mutate(across(where(is.numeric), .fns = ~replace_na(.,0)))
  
  atm_fi <- sf_atm %>% 
    st_drop_geometry() %>% 
    filter(type != "WL") %>% 
    group_by(csduid) %>% 
    summarise(n = n()) %>% 
    ungroup() %>% 
    full_join(csd_id) %>% 
    rename(fi_n = n) %>% 
    mutate(across(where(is.numeric), .fns = ~replace_na(.,0)))
  
  atm_data_processed <- atm_n %>% 
    inner_join(atm_wl) %>% 
    inner_join(atm_fi) %>% 
    mutate(date = atm_year)
  
  branch_n <- sf_branch %>% 
    st_drop_geometry() %>% 
    group_by(csduid) %>% 
    summarise(n = n()) %>% 
    ungroup() %>% 
    full_join(csd_id) %>% 
    rename(branch_n = n) %>% 
    mutate(across(where(is.numeric), .fns = ~replace_na(.,0))) %>% 
    mutate(date = branch_year)
  
  list(atm_data_processed, branch_n)
}

# Process data for all years
atm_2019_data <- process_data(atm_19, branch_2019, "Nov-19", "Nov-19")
atm_2022_data <- process_data(atm_22, branch_2022, "Nov-22", "Nov-22")
atm_2023_data <- process_data(atm_23, branch_2023, "Nov-23", "Nov-23")

# Combine all data
density_data <- atm_2019_data[[2]] %>% 
  inner_join(atm_2019_data[[1]], by = "csduid") %>% 
  bind_rows(
    atm_2022_data[[2]] %>% inner_join(atm_2022_data[[1]], by = "csduid"),
    atm_2023_data[[2]] %>% inner_join(atm_2023_data[[1]], by = "csduid")
  ) %>% 
  select(-date.y) %>% 
  rename(date = date.x) %>% 
  mutate(csduid = as.double(csduid))


# import csd demographic variables
csd_statistics <- read_csv("data/processed/csd_statistics/csd_statistics.csv")

# join to csd cash

cash_density_19_23 <- csd_statistics %>%
  full_join(density_data) %>%
  mutate(class = case_when(
    fi_n > 0 ~ "FI ABM",
    wl_n > 0 & fi_n == 0 ~ "WL ABM Only",
    fi_n == 0 & wl_n == 0 ~ "None",
  )) %>%
  as_tibble() %>% 
  # mutate(date = fct_relevel(date, c("Nov-19", "Nov-22"))) %>% 
  drop_na(atm_n) %>% 
  mutate(class_2 = case_when(
    branch_n > 0 ~ "Branch",
    branch_n == 0 ~ "None",
  )) %>% 
  rename(pop = population) %>% 
  mutate(csduid = as.double(csduid))

cash_density_19_23 %>% write_csv("data/processed/density/cash_density_19_23.csv")

Total - Branch and ATM

Code

library(tidyverse)
library(janitor)
library(sf)
library(readxl)
library(patchwork)

figure_1_data <- atm_19 %>%
  #select(date, type) %>% 
  mutate(date_2 = "Nov-19") %>%
  bind_rows(atm_22 %>% mutate(date_2 = "Nov-22") %>% select(date_2, type)) %>%
  bind_rows(atm_23 %>% mutate(date_2 = "Nov-23") %>% select(date_2, type)) %>% 
  mutate(type_2 = ifelse(type == "WL", "WL", "FI")) %>%
  group_by(date_2, type_2) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  rename(date = date_2, type = type_2) 

figure_1 <- figure_1_data %>%
  mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", date)) %>%
  mutate(date_3 = ifelse(date == "Nov-22", "2022-Q4", date_3)) %>% 
  mutate(date_3 = ifelse(date == "Nov-23", "2023-Q4", date_3)) %>% 
  #mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  ggplot(., aes(date_3, n, fill = type)) + 
  geom_col(color = "black") +
  geom_text(aes(x = date_3, y = n, label = n, group = type),
            position = position_stack(vjust = .5)) +
  theme_classic() +
  scale_fill_brewer(palette = "Paired") +
  labs(x = "", y = "# of ABMs", fill = "ABM Type", subtitle = "ABM",
       caption = "FI: Financial Institution ABM\n WL: White-Label ABM") +
  scale_y_continuous(labels = comma) +
  theme(legend.position="bottom") +
  theme(text = element_text(size = 14.5), axis.text = element_text(size = 13))

figure_2_data <- branch_2019 %>%
 # select(-date) %>% 
  mutate(date_2 = "Nov-19") %>%
  bind_rows(branch_2022 %>% mutate(date_2 = "Nov-22")) %>%
  bind_rows(branch_2023 %>% mutate(date_2 = "Nov-23")) %>% 
  group_by(date_2, bank_or_cu) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  rename(date = date_2)

figure_2 <- figure_2_data %>%
  mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(date_3 = ifelse(date == "Nov-23", "2023-Q4", date_3)) %>% 
  #mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  ggplot(., aes(date_3, n, fill = bank_or_cu)) + 
  geom_col(color = "black") +
  geom_text(aes(x = date_3, y = n, label = n),
            position = position_stack(vjust = .5)) +
  theme_classic() +
  scale_fill_brewer(palette = "Paired") +
  labs(x = "", y = "# of Branches", fill = "Branch Type", subtitle = "Branch",
       caption = "Bank - Bank Branch\nCU: Credit Union Branch") +
  scale_y_continuous(labels = comma) +
  theme(legend.position="bottom") +
  theme(text = element_text(size = 14.5), axis.text = element_text(size = 13))

figure_1_data %>% 
  mutate(cash = "ATM") %>% 
  bind_rows(figure_2_data %>% 
              mutate(type = bank_or_cu) %>% 
              select(-bank_or_cu) %>% 
              mutate(cash = "Branch")) %>% 
  mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(date_3 = ifelse(date == "Nov-23", "2023-Q4", date_3)) %>% 
  #mutate(date_3 = ifelse(date == "Nov-19", "2019", "2022")) %>% 
  mutate(type = ifelse(type == "FI", "FI-ATM", type)) %>% 
  mutate(type = ifelse(type == "WL", "White-Label ATM", type)) %>% 
  mutate(type = ifelse(type == "Bank", "Bank Branch", type)) %>% 
  mutate(type = ifelse(type == "CU", "Credit Union", type)) %>% 
  ggplot(., aes(date_3, n, fill = fct_reorder(type, n))) + 
  geom_col(color = "black") +
  geom_text(aes(x = date_3, y = n, label = n),
            position = position_stack(vjust = .5)) +
  theme_classic() +
  facet_wrap(~cash, scales = "free") +
  scale_fill_brewer(palette = "Paired", breaks=c('White-Label ATM', 'FI-ATM', 'Bank Branch', 'Credit Union')) +
  labs(x = "", y = "", fill = "", subtitle = "") +
  scale_y_continuous(labels = comma) +
  theme(legend.position="bottom") +
  theme(text = element_text(size = 15), axis.text = element_text(size = 15))

Code

chart_2 <- figure_1 + figure_2

We classify CSD locations into urban and rural following Statistics Canada. In this regard, an urban centre has a population of at least 1,000 people and a population density of 400 people or more per square kilometre, based on the Census. All areas outside of such population centres are classified as rural. Taken together, urban and rural areas cover all of Canada. Urban centres are further classified into three groups, according to their populations: small population centres with a population of between 1,000 and 29,999; medium-sized population centres with a population of between 30,000 and 99,999; and large urban population centres with a population of 100,000 or more.

Code

#
cash_density_23 <- read_csv("data/processed/density/cash_density_19_23.csv")

data_3 <- cash_density_23 %>%
  group_by(date, pop_center, class) %>%
  summarise(
    count = n(),
    pop = sum(pop, na.rm = TRUE) / 1000000
  ) %>%
  mutate(
    percent_count = paste(round(count * 100 / (sum(count, na.rm = TRUE)), 1), "%"),
    percent_pop = paste(round(pop * 100 / (sum(pop, na.rm = TRUE)), 1), "%")
  ) %>%
  mutate(class = factor(class, levels = c("FI ABM", "WL ABM Only", "None"))) %>% 
  ungroup() %>% 
  mutate(pop_center = forcats::fct_relevel(pop_center, c("Large", "Medium", "Small", "Rural")))

data_4 <- cash_density_23 %>%
  group_by(date, pop_center, class_2) %>%
  summarise(
    count = n(),
    pop = sum(pop, na.rm = TRUE) / 1000000
  ) %>%
  mutate(
    percent_count = paste(round(count * 100 / (sum(count, na.rm = TRUE)), 1), "%"),
    percent_pop = paste(round(pop * 100 / (sum(pop, na.rm = TRUE)), 1), "%")
  ) %>%
  mutate(class_2 = factor(class_2, levels = c("Branch", "None"))) %>% 
  ungroup() %>% 
  mutate(pop_center = forcats::fct_relevel(pop_center, c("Large", "Medium", "Small", "Rural")))

######################################################################################################

chart_3 <- data_3 %>% 
  mutate(percent_count = ifelse(percent_count == "88 %", "88.0 %", percent_count)) %>% 
  # mutate(class = str_replace(class, "ABM", "ATM")) %>% 
  mutate(class = factor(class, levels = c("FI ABM", "WL ABM Only", "None"))) %>% 
  mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(date_3 = ifelse(date == "Nov-23", "2023-Q4", date_3)) %>% 
  mutate(pop_center_2 = paste0(pop_center, " Urban")) %>% 
  mutate(pop_center_2 = ifelse(pop_center_2 == "Rural Urban", "Rural", pop_center_2)) %>% 
  mutate(pop_center_2 = forcats::fct_relevel(pop_center_2, c("Large Urban", "Medium Urban", "Small Urban", "Rural"))) %>% 
  # mutate(percent_count = ifelse(percent_count == "4.5 %" | percent_count == "3.7 %" | percent_count == "4.1 %", NA, percent_count)) %>% 
  ggplot(data = ., aes(x = date_3, y = count, fill = class)) +
  geom_bar(position = "stack", stat = "identity", alpha = .8, color = "black") +
  # geom_bar_text(position = "stack", reflow = TRUE, aes(label = percent_count)) +
  geom_text(aes(label = percent_count), position = position_stack(vjust = 0.5), size = 3.2, fontface = "bold") +
  theme_classic() +
  labs(
    title = "",
    # caption = paste0(
    # "FI: Financial Institution ABM\nWL ATM: White-Label ABM\n",
    # "Small Urban (2019-Q4): None 4.5%\n",
    # "Small Urban (2022-Q4): None 3.7%\n",
    #  "Small Urban (2019-Q4): None 4.1%"
    # ),
    fill = "",
  ) +
  ylab("Number of CSDs") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 14.5), axis.text = element_text(size = 13)) +
  scale_fill_manual(values = c("forestgreen", "gold2", "firebrick2")) +
  facet_wrap(~pop_center_2, scales = "free_y")

chart_3

Code

#ggsave("output/figures/chart_3.png", height = 5, width = 9, dpi = 300, units = "in")

chart_4 <- data_4 %>% 
  mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(date_3 = ifelse(date == "Nov-23", "2023-Q4", date_3)) %>% 
  #mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(pop_center_2 = paste0(pop_center, " Urban")) %>% 
  mutate(pop_center_2 = ifelse(pop_center_2 == "Rural Urban", "Rural", pop_center_2)) %>% 
  mutate(pop_center_2 = forcats::fct_relevel(pop_center_2, c("Large Urban", "Medium Urban", "Small Urban", "Rural"))) %>% 
  ggplot(data = ., aes(x = date_3, y = count, fill = class_2)) +
  geom_bar(position = "stack", stat = "identity", alpha = .8, color = "black") +
  geom_text(aes(label = percent_count), position = position_stack(vjust = 0.5), size = 3.2, fontface = "bold") +
  theme_classic() +
  labs(
    title = "",
    fill = ""
  ) +
  ylab("Number of CSDs") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(text = element_text(size = 14.5), axis.text = element_text(size = 13)) +
  scale_fill_manual(values = c("forestgreen", "firebrick2")) +
  facet_wrap(~pop_center_2, scales = "free_y")

chart_4

Code

#ggsave("output/figures/chart_4.png", height = 5, width = 9, dpi = 300, units = "in")

# Appendix charts

appendix_branch <- data_4 %>% 
  mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(date_3 = ifelse(date == "Nov-23", "2023-Q4", date_3)) %>% 
  #mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(pop_center_2 = paste0(pop_center, " Urban")) %>% 
  mutate(pop_center_2 = ifelse(pop_center_2 == "Rural Urban", "Rural", pop_center_2)) %>% 
  mutate(pop_center_2 = forcats::fct_relevel(pop_center_2, c("Large Urban", "Medium Urban", "Small Urban", "Rural"))) %>% 
  ggplot(data = ., aes(x = date_3, y = pop, fill = class_2)) +
  geom_bar(position = "stack", stat = "identity", alpha = .8, color = "black") +
  geom_text(aes(label = percent_pop), position = position_stack(vjust = 0.5), size = 3.2, fontface = "bold") +
  theme_classic() +
  labs(
    title = "",
    fill = ""
  ) +
  ylab("Population (Millions)") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(text = element_text(size = 14.5), axis.text = element_text(size = 13)) +
  scale_fill_manual(values = c("forestgreen", "firebrick2")) +
  facet_wrap(~pop_center_2, scales = "free_y")

appendix_branch

Code

#ggsave("output/figures/appendix_1.png", height = 5, width = 9, dpi = 300, units = "in")

appendix_atm <- data_3 %>% 
  mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(date_3 = ifelse(date == "Nov-23", "2023-Q4", date_3)) %>% 
  #mutate(date_3 = ifelse(date == "Nov-19", "2019-Q4", "2022-Q4")) %>% 
  mutate(pop_center_2 = paste0(pop_center, " Urban")) %>% 
  mutate(pop_center_2 = ifelse(pop_center_2 == "Rural Urban", "Rural", pop_center_2)) %>% 
  mutate(pop_center_2 = forcats::fct_relevel(pop_center_2, c("Large Urban", "Medium Urban", "Small Urban", "Rural"))) %>% 
  #mutate(percent_count = ifelse(pop_center == "Small" & class == "WL ABM Only" , NA, percent_count)) %>% 
  ggplot(data = ., aes(x = date_3, y = pop, fill = class)) +
  geom_bar(position = "stack", stat = "identity", alpha = .8, color = "black") +
  geom_text(aes(label = percent_pop), position = position_stack(vjust = 0.5), size = 3.2, fontface = "bold") +
  theme_classic() +
  labs(
    title = "",
    fill = "",
    caption = "FI: Financial Institution ABM\nWL ABM: White-Label ABM"
  ) +
  ylab("Population (Millions)") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(text = element_text(size = 14.5), axis.text = element_text(size = 13)) +
  scale_fill_manual(values = c("forestgreen", "gold2", "firebrick2")) +
  facet_wrap(~pop_center_2, scales = "free_y")

appendix_atm

Travel Metrics

To identify the origin points for our analysis, we use the Pseudo-Household Demographic Distribution provided by Statistics Canada to proxy for Canadians’ home locations.

Code

# import packages

pacman::p_load(tidyverse, here, janitor, sf, nngeo, hereR)

# # sf s2
sf_use_s2(FALSE)

# travel metric ATMs

atm_19 <- read_csv("data/processed/ATM/atm_19.csv") %>% rowid_to_column(., "atm_id")

atm_22 <- read_csv("data/processed/ATM/atm_22.csv") %>% rowid_to_column(., "atm_id")

atm_23 <- read_csv("data/processed/ATM/atm_23_travel.csv") %>% rowid_to_column(., "atm_id")

# reading in phh sample data
phh_sample <- read_csv("data/processed/phh/sample_phh.csv") 

# convert atm data to sf

atm_19_sf <- atm_19 %>% 
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) 

atm_22_sf  <- atm_22 %>% 
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) 

atm_23_sf  <- atm_23 %>% 
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) 

# convert phh to sf

phh_sample_sf <- phh_sample %>% st_as_sf(coords = c("longitude", "latitude"), crs = 4326)

Determine the five nearest ABMs or FI branches by geodesic distance (as the crow flies)

Code

################## get nearest 5 ##########################

# nearest atm 2019
nearest_atm_19 <- st_nn(phh_sample_sf, atm_19_sf,
                     returnDist = T, k =5, progress = TRUE, parallel = 16
)

# nearest atm 2022
nearest_atm_22 <- st_nn(phh_sample_sf, atm_22_sf,
                        returnDist = T, k = 5, progress = TRUE, parallel = 16
)
# nearest atm 2023
nearest_atm_23 <- st_nn(phh_sample_sf, atm_23_sf,
                        returnDist = T, k = 5, progress = TRUE, parallel = 16
)

################# get coordinates for ATMs and PHH ################


atm_19_coords <- atm_19 %>% select(atm_id, atm_lat = latitude, atm_lon = longitude)

atm_22_coords <- atm_22 %>% select(atm_id, atm_lat = latitude, atm_lon = longitude)

atm_23_coords <- atm_23 %>% select(atm_id, atm_lat = latitude, atm_lon = longitude)

phh <- phh_sample %>% rename(phh_lat = latitude, phh_lon = longitude)

Determine the five nearest ABMs or FI branches by geodesic distance (as the crow flies)

Code

# Define a function to process the travel matrix
process_travel_matrix <- function(nearest_atm, phh_sample_sf, phh, atm_coords, year) {
  travel_matrix <- as_tibble(nearest_atm) %>%
    mutate(phh_id = phh_sample_sf$phh_id) %>%
    unnest(c(nn, dist)) %>%
    rename(atm_id = nn) %>%
    inner_join(phh) %>%
    inner_join(atm_coords) %>%
    mutate(geo_dist_km = dist / 1000) %>%
    select(-dist)
  
  # Save to CSV
  write_csv(travel_matrix, paste0("data/temp/routes/travel_matrix_atm_", year, ".csv"))
  
  return(travel_matrix)
}

# Process travel matrices for each year
#travel_matrix_atm_19 <- process_travel_matrix(nearest_atm_19, phh_sample_sf, phh, atm_19_coords, 19)
#travel_matrix_atm_22 <- process_travel_matrix(nearest_atm_22, phh_sample_sf, phh, atm_22_coords, 22)
#travel_matrix_atm_23 <- process_travel_matrix(nearest_atm_23, phh_sample_sf, phh, atm_23_coords, 23)

Code

# Read travel matrices from CSV
travel_matrix_atm_19 <- read_csv("data/temp/routes/travel_matrix_atm_19.csv")

travel_matrix_atm_22 <- read_csv("data/temp/routes/travel_matrix_atm_22.csv")

travel_matrix_atm_23 <- read_csv("data/temp/routes/travel_matrix_atm_23.csv")

# Define a function to process origin and destination
process_origin_destination <- function(travel_matrix, year) {
  origin <- travel_matrix %>%
    select(phh_id, phh_lat, phh_lon) %>%
    st_as_sf(coords = c("phh_lon", "phh_lat"), crs = 4326) %>%
    rowid_to_column(., "trip_id")
  
  destination <- travel_matrix %>%
    select(phh_id, atm_lon, atm_lat) %>%
    st_as_sf(coords = c("atm_lon", "atm_lat"), crs = 4326) %>%
    rowid_to_column(., "trip_id")
  
  list(origin = origin, destination = destination)
}

# Process origins and destinations for each year
origin_destination_atm_19 <- process_origin_destination(travel_matrix_atm_19, 19)
origin_destination_atm_22 <- process_origin_destination(travel_matrix_atm_22, 22)
origin_destination_atm_23 <- process_origin_destination(travel_matrix_atm_23, 23)