Libraries
library(rlang)
library(fuzzyjoin)
library(dplyr)
Data
# Build two small example tables whose team names are spelled slightly
# differently, so an exact join on `team` would miss most rows.
df1 <- data.frame(
  team = c("Mavericks", "Nets", "Warriors", "Heat", "Lakers"),
  points = c(99, 90, 104, 117, 100)
)
df2 <- data.frame(
  team = c("Mavricks", "Warrors", "Heat", "Netts", "Kings", "Lakes"),
  assists = c(22, 29, 17, 40, 32, 30)
)

# Show both tables
print(df1)
## team points
## 1 Mavericks 99
## 2 Nets 90
## 3 Warriors 104
## 4 Heat 117
## 5 Lakers 100
print(df2)
## team assists
## 1 Mavricks 22
## 2 Warrors 29
## 3 Heat 17
## 4 Netts 40
## 5 Kings 32
## 6 Lakes 30
Join data on inexact matching
Fuzzy string matching in R
# For every team in df1, find the closest-spelled team name in df2.
stringdist_join(
  df1, df2,
  by = "team",           # compare the `team` columns of both tables
  mode = "left",         # left join: keep every row of df1
  method = "jw",         # Jaro-Winkler distance
  # jw distances never exceed 1, so a threshold of 99 filters nothing out:
  # every df1/df2 pair is kept and the closest one is picked below.
  max_dist = 99,
  distance_col = "dist"  # store the computed distance in `dist`
) %>%
  group_by(team.x) %>%
  # Keep the smallest distance per df1 team (ties, if any, are all kept).
  slice_min(order_by = dist, n = 1) %>%
  # Drop the grouping so later verbs do not silently operate per team.
  ungroup()
## # A tibble: 5 × 5
## team.x points team.y assists dist
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Heat 117 Heat 17 0
## 2 Lakers 100 Lakes 30 0.0556
## 3 Mavericks 99 Mavricks 22 0.0370
## 4 Nets 90 Netts 40 0.0667
## 5 Warriors 104 Warrors 29 0.0417
Correcting misspellings against a dictionary
Data
# Load the `misspellings` example data set into the workspace.
data(misspellings)
library(qdapDictionaries)
# Convert DICTIONARY to a tibble. as_tibble() replaces tbl_df(), which has
# been deprecated since dplyr 1.0.0 and warns on every call.
words <- as_tibble(DICTIONARY)
Sample
# Fix the RNG seed so the same 1,000 rows are drawn on every run.
set.seed(2016)
sub_misspellings <- sample_n(misspellings, 1000)
sub_misspellings
## # A tibble: 1,000 × 2
## misspelling correct
## <chr> <chr>
## 1 Sanhedrim Sanhedrin
## 2 cyclinder cylinder
## 3 beastiality bestiality
## 4 consicousness consciousness
## 5 affilate affiliate
## 6 repubicans republicans
## 7 comitted committed
## 8 emmisions emissions
## 9 acquited acquitted
## 10 decompositing decomposing
## # … with 990 more rows
Join data
# Pair each sampled misspelling with every dictionary word that lies within
# a string distance of 1 (default distance metric).
joined <- stringdist_inner_join(
  sub_misspellings, words,
  by = c(misspelling = "word"),
  max_dist = 1
)
Count
# Number of dictionary words matched per (misspelling, correct) pair
count(joined, misspelling, correct)
## # A tibble: 462 × 3
## misspelling correct n
## <chr> <chr> <int>
## 1 abilty ability 1
## 2 accademic academic 1
## 3 accademy academy 1
## 4 accension accession 2
## 5 acceptence acceptance 1
## 6 acedemic academic 1
## 7 achive achieve 4
## 8 acommodate accommodate 1
## 9 acuracy accuracy 1
## 10 addmission admission 1
## # … with 452 more rows
Closest matches (within a string distance of 2)
# Same join with the threshold widened to 2; the distance of every matched
# pair is recorded in a `distance` column.
joined_dists <- stringdist_inner_join(
  sub_misspellings, words,
  by = c(misspelling = "word"),
  max_dist = 2,
  distance_col = "distance"
)
joined_dists
## # A tibble: 8,435 × 5
## misspelling correct word syllables distance
## <chr> <chr> <chr> <dbl> <dbl>
## 1 cyclinder cylinder cylinder 3 1
## 2 beastiality bestiality bestiality 5 1
## 3 affilate affiliate affiliate 4 1
## 4 comitted committed committee 3 2
## 5 acquited acquitted acquire 2 2
## 6 acquited acquitted acquit 2 2
## 7 decompositing decomposing decomposition 5 2
## 8 decieved deceived deceive 2 2
## 9 asociated associated associate 4 2
## 10 supress suppress cypress 2 2
## # … with 8,425 more rows