fuzzyjoin: Join data frames on inexact matching-Part-1

Library

library(rlang)
library(fuzzyjoin)
library(dplyr)

Data

# Build the two example tables. Most team names in df2 are misspelled
# variants of those in df1, and "Kings" has no counterpart in df1, so an
# exact join on `team` would only match "Heat".
df1 <- data.frame(
  team = c("Mavericks", "Nets", "Warriors", "Heat", "Lakers"),
  points = c(99, 90, 104, 117, 100)
)
df2 <- data.frame(
  team = c("Mavricks", "Warrors", "Heat", "Netts", "Kings", "Lakes"),
  assists = c(22, 29, 17, 40, 32, 30)
)

# View the first data frame (correctly spelled team names with points)
print(df1)

##        team points
## 1 Mavericks     99
## 2      Nets     90
## 3  Warriors    104
## 4      Heat    117
## 5    Lakers    100

# View the second data frame (mostly misspelled team names with assists)
print(df2)

##       team assists
## 1 Mavricks      22
## 2  Warrors      29
## 3     Heat      17
## 4    Netts      40
## 5    Kings      32
## 6    Lakes      30

Join data on inexact matching

Fuzzy string matching in R

# Fuzzy left join of df1 onto df2 by team name, keeping only the closest
# df2 candidate for every df1 team.
#
# Jaro-Winkler ("jw") distances lie in [0, 1], so max_dist = 99 effectively
# disables the distance cutoff: every df1/df2 pair is produced and the
# per-team minimum is picked afterwards with slice_min().
# The result stays grouped by team.x.
stringdist_join(
  df1,
  df2,
  by = "team",           # compare the `team` columns
  method = "jw",         # Jaro-Winkler distance
  mode = "left",         # keep every row of df1
  distance_col = "dist", # store the computed distance in `dist`
  max_dist = 99
) %>%
  group_by(team.x) %>%
  slice_min(dist, n = 1)

## # A tibble: 5 × 5
## # Groups:   team.x [5]
##   team.x    points team.y   assists   dist
##   <chr>      <dbl> <chr>      <dbl>  <dbl>
## 1 Heat         117 Heat          17 0     
## 2 Lakers       100 Lakes         30 0.0556
## 3 Mavericks     99 Mavricks      22 0.0370
## 4 Nets          90 Netts         40 0.0667
## 5 Warriors     104 Warrors       29 0.0417

Correcting misspellings against a dictionary

Data

# Load the `misspellings` example data and wrap the qdapDictionaries
# DICTIONARY word list in a tibble.
data(misspellings)
library(qdapDictionaries)
# as_tibble() (re-exported by dplyr) replaces tbl_df(), which has been
# deprecated since dplyr 1.0.0.
words <- as_tibble(DICTIONARY)

Sample

# Draw a reproducible random sample of 1,000 misspellings; the fixed seed
# keeps the sampled rows stable between runs.
set.seed(2016)
sub_misspellings <- sample_n(misspellings, 1000)
sub_misspellings

## # A tibble: 1,000 × 2
##    misspelling   correct      
##    <chr>         <chr>        
##  1 Sanhedrim     Sanhedrin    
##  2 cyclinder     cylinder     
##  3 beastiality   bestiality   
##  4 consicousness consciousness
##  5 affilate      affiliate    
##  6 repubicans    republicans  
##  7 comitted      committed    
##  8 emmisions     emissions    
##  9 acquited      acquitted    
## 10 decompositing decomposing  
## # … with 990 more rows

Join data

# Inner join: pair each sampled misspelling with every dictionary word whose
# string distance from it is at most 1.
joined <- stringdist_inner_join(
  sub_misspellings,
  words,
  by = c(misspelling = "word"),
  max_dist = 1
)

Count

# Number of dictionary words matched per (misspelling, correct) pair.
count(joined, misspelling, correct)

## # A tibble: 462 × 3
##    misspelling correct         n
##    <chr>       <chr>       <int>
##  1 abilty      ability         1
##  2 accademic   academic        1
##  3 accademy    academy         1
##  4 accension   accession       2
##  5 acceptence  acceptance      1
##  6 acedemic    academic        1
##  7 achive      achieve         4
##  8 acommodate  accommodate     1
##  9 acuracy     accuracy        1
## 10 addmission  admission       1
## # … with 452 more rows

Closest matches (1 or 2 letters away)

# Same inner join with the cutoff widened to a maximum string distance of 2;
# the distance of each matched pair is recorded in a `distance` column.
joined_dists <- stringdist_inner_join(
  sub_misspellings,
  words,
  by = c(misspelling = "word"),
  max_dist = 2,
  distance_col = "distance"
)
joined_dists

## # A tibble: 8,435 × 5
##    misspelling   correct     word          syllables distance
##    <chr>         <chr>       <chr>             <dbl>    <dbl>
##  1 cyclinder     cylinder    cylinder              3        1
##  2 beastiality   bestiality  bestiality            5        1
##  3 affilate      affiliate   affiliate             4        1
##  4 comitted      committed   committee             3        2
##  5 acquited      acquitted   acquire               2        2
##  6 acquited      acquitted   acquit                2        2
##  7 decompositing decomposing decomposition         5        2
##  8 decieved      deceived    deceive               2        2
##  9 asociated     associated  associate             4        2
## 10 supress       suppress    cypress               2        2
## # … with 8,425 more rows

fuzzyjoin: Join data frames on inexact matching-Part-1

Naimul Islam

2022-10-11

Library

Data

Join data on inexact matching

Fuzzy string matching in R

Correcting misspellings against a dictionary

Data

Sample

Join data

Count

Closest matches (1 or 2 letters away)