Let’s compare how similar our names are! But how?
people <- c('ZuJe', 'ZegRcad', 'SliaDne',
'SollKnn', 'SnhTg', 'SnhRmia', 'SamDea',
'SsdaaRtih', 'SfrZcay', 'RdiuzEcie',
'ReeSme', 'OesHny', 'NtTiht',
'MsoDne', 'MyrCar', 'MLuhiEa',
'MrieGbila','LccMthw', 'IlmMla',
'IpltMcal', 'HmelJsu', 'HrcEi',
'HrasBa', 'HlyEhn', 'GedagJra',
'GnroMrAeada', 'GtcPdo',
'FaknegTlr', 'FruoCril', 'CseCsada',
'CnolJsp', 'CniSa', 'CaVc',
'CmoGbil', 'BrsvDiry', 'AaaoEtbn')
length(people)
## [1] 36
kbl(data.frame('people' = people))
|
people
|
|
ZuJe
|
|
ZegRcad
|
|
SliaDne
|
|
SollKnn
|
|
SnhTg
|
|
SnhRmia
|
|
SamDea
|
|
SsdaaRtih
|
|
SfrZcay
|
|
RdiuzEcie
|
|
ReeSme
|
|
OesHny
|
|
NtTiht
|
|
MsoDne
|
|
MyrCar
|
|
MLuhiEa
|
|
MrieGbila
|
|
LccMthw
|
|
IlmMla
|
|
IpltMcal
|
|
HmelJsu
|
|
HrcEi
|
|
HrasBa
|
|
HlyEhn
|
|
GedagJra
|
|
GnroMrAeada
|
|
GtcPdo
|
|
FaknegTlr
|
|
FruoCril
|
|
CseCsada
|
|
CnolJsp
|
|
CniSa
|
|
CaVc
|
|
CmoGbil
|
|
BrsvDiry
|
|
AaaoEtbn
|
knitr::include_graphics('alphabetWheel.png')
Idea: Use a Manhattan-type score for name pairs.
\(d_{Manhattan}(X, Y) = \mid\mid X - Y \mid\mid _{1}\)
\(= \mid x_{1}-y_{1}\mid + \mid x_{2}-y_{2}\mid + \cdot\cdot\cdot\)
Using the wheel as a model of distance, convert the names to lowercase, start with the first letters in each name pair, see how far apart they are on the wheel, and take the average of the distances for each letter pair, to better compare varying lengths.
name_1: ReeSme
name_2: OesHny
dist(R, O) = 3, dist(e, e) = 0, etc.
Start by converting letters to integers, to make the math easier
# Input a word of mixed-case letters, output a vector of 1-52 equivs,
## optionally lower-cased with second parameter
string2nums <- function(string, makeLower = FALSE) {
chars <- unlist(strsplit(string, ''))
nums <- lapply(chars, function(ch){match(ch, c(letters, LETTERS))})
nums <- as.numeric(nums)
if (makeLower) {nums <- nums %% 26}
nums
}
cat("Convert 'HlyEhn' to integers: ", string2nums('HlyEhn', makeLower = T))
## Convert 'HlyEhn' to integers: 8 12 25 5 8 14
Find the distance between any pair of letters
# how far apart are 2 letters on a wheel?
# Inputs are 2 integers from 1-26, representing a-z, or A-Z
# Image taken from https://kodlogs.com/blog/618/alphabet-wheel
cycleDist <- function(letternum1, letternum2) {
d <- letternum1 - letternum2
if (d > 0) {
d <- min(d, letternum2 + 26 - letternum1)
} else if (d < 0) {
d <- min(-d, letternum1 + 26 - letternum2)
}
d
}
c(cycleDist(1,1), cycleDist(3,22), cycleDist(48,29), cycleDist(5,19))
## [1] 0 7 7 12
Try it out with ‘ReeSme’ and ‘OesHny’
total <- 0
name1 <- string2nums('ReeSme', makeLower = T)
name2 <- string2nums('OesHry', makeLower = T)
for (i in 1:length(name1)) {
total <- total + cycleDist(name1[i], name2[i])
}
cat("'ReeSme' and 'OesHry' have an average distance of",
round(total / length(name1), 2), "per letter pair.")
## 'ReeSme' and 'OesHry' have an average distance of 6.17 per letter pair.
Problem: What does this really mean? Are “p” and “q” actually 7 times more similar than “p” and “i”?
Problem: What about names that have different lengths?
—————————————————
Much Better Idea: Use other distance metrics
\(d_{Jaccard}(X, Y) = 1 - \frac{\mid X \cap Y \mid}{\mid X \cup Y \mid}\)
#jaccard can compare strings directly, without converting to ints
jaccard <- function(string1, string2) {
set1 <- unique(unlist(strsplit(string1, '')))
set2 <- unique(unlist(strsplit(string2, '')))
1 - length(intersect(set1, set2)) / length(union(set1, set2))
}
cat("'carjacked' and 'jaccard' have a jaccard distance of", jaccard('carjacked', 'jaccard'))
## 'carjacked' and 'jaccard' have a jaccard distance of 0.2857143
X = ‘carjacked’, Y = ‘jaccard’
\(\frac{\mid carjacked \cap jaccard \mid}{\mid carjacked \cup jaccard \mid}\)
\(= \frac{\mid (c, a, r, j, d) \mid}{\mid (c, a, r, j, d, e, k) \mid}\)
\(= \frac{5}{7}\)
That’s their similarity, so their distance is 1 - that, as calculated before.
\(d_{cosine}(X, Y) = 1 - \frac{X \cdot Y}{\mid\mid X\mid\mid_{2}\cdot\mid\mid Y\mid\mid_{2}}\)
X = ‘bad dad’
Y = ‘cabbed’
\(X = < 2, 1, 0, 3, 0, 0, ... >\)
\(Y = < 1, 2, 1, 1, 1, 0, ... >\)
\(d_{cosine}(X, Y) = 1 - \frac{< 2, 1, 0, 3, 0, 0, ... > \cdot < 1, 2, 1, 1, 1, 0, ... >}{\sqrt{2^{2}+1^{2}+3^{2}}\cdot \sqrt{1^{2}+2^{2}+1^{2}+1^{2}+1^{2}}}\)
cosine <- function(string1, string2, len = 52) {
nums1 <- string2nums(string1)
nums2 <- string2nums(string2)
vec1 <- rep(0, len)
vec2 <- rep(0, len)
for (n in nums1) {
vec1[n] <- vec1[n] + 1
}
for (n in nums2) {
vec2[n] <- vec2[n] + 1
}
dot <- sum(vec1 * vec2)
mags <- sqrt(sum(vec1 * vec1)) * sqrt(sum(vec2 * vec2))
1 - dot / mags
}
findClosest <- function(stringlist, func) {
names <- stringlist # just copy to initialize return values
scores <- rep(-1, length(stringlist))
for (i in 1:length(stringlist)) {
best <- Inf
name <- ""
string1 <- stringlist[i]
others <- stringlist[-c(i)]
for (other in others) {
score <- func(string1, other)
if (score < best){
best <- score
name <- other
}
}
names[i] <- name
scores[i] <- best
}
paste(stringlist, names, round(scores, 2))
}
cosines <- findClosest(people, cosine)
jaccards <- findClosest(people, jaccard)
lowCosines <- findClosest(tolower(people), cosine)
lowAdists <- findClosest(tolower(people), adist)
all_sims <- data.frame('People' = people, 'Jaccard' = jaccards,
'Cosine' = cosines, 'LowerCosines' = lowCosines)
all_sims %>%
kbl() %>%
kable_material_dark() %>%
row_spec(19:20, background = "#D7261E") # most similar title goes to...
|
People
|
Jaccard
|
Cosine
|
LowerCosines
|
|
ZuJe
|
ZuJe HmelJsu 0.62
|
ZuJe HmelJsu 0.43
|
zuje hmeljsu 0.43
|
|
ZegRcad
|
ZegRcad GedagJra 0.6
|
ZegRcad GedagJra 0.4
|
zegrcad gedagjra 0.24
|
|
SliaDne
|
SliaDne SamDea 0.5
|
SliaDne CniSa 0.32
|
sliadne cnisa 0.32
|
|
SollKnn
|
SollKnn SliaDne 0.67
|
SollKnn SliaDne 0.43
|
sollknn cnoljsp 0.32
|
|
SnhTg
|
SnhTg SnhRmia 0.67
|
SnhTg SnhRmia 0.49
|
snhtg nttiht 0.35
|
|
SnhRmia
|
SnhRmia SsdaaRtih 0.5
|
SnhRmia SsdaaRtih 0.32
|
snhrmia ssdaartih 0.27
|
|
SamDea
|
SamDea SliaDne 0.5
|
SamDea SliaDne 0.33
|
samdea csecsada 0.24
|
|
SsdaaRtih
|
SsdaaRtih SnhRmia 0.5
|
SsdaaRtih SnhRmia 0.32
|
ssdaartih hrasba 0.22
|
|
SfrZcay
|
SfrZcay MyrCar 0.67
|
SfrZcay MyrCar 0.47
|
sfrzcay myrcar 0.33
|
|
RdiuzEcie
|
RdiuzEcie ZegRcad 0.64
|
RdiuzEcie HrcEi 0.46
|
rdiuzecie hrcei 0.26
|
|
ReeSme
|
ReeSme SamDea 0.5
|
ReeSme SamDea 0.49
|
reesme msodne 0.41
|
|
OesHny
|
OesHny MsoDne 0.67
|
OesHny MsoDne 0.5
|
oeshny hlyehn 0.28
|
|
NtTiht
|
NtTiht SsdaaRtih 0.7
|
NtTiht SsdaaRtih 0.57
|
nttiht snhtg 0.35
|
|
MsoDne
|
MsoDne GnroMrAeada 0.64
|
MsoDne OesHny 0.5
|
msodne oeshny 0.33
|
|
MyrCar
|
MyrCar SfrZcay 0.67
|
MyrCar GnroMrAeada 0.36
|
myrcar sfrzcay 0.33
|
|
MLuhiEa
|
MLuhiEa LccMthw 0.7
|
MLuhiEa SsdaaRtih 0.54
|
mluhiea ilmmla 0.28
|
|
MrieGbila
|
MrieGbila GnroMrAeada 0.58
|
MrieGbila SliaDne 0.43
|
mriegbila mluhiea 0.32
|
|
LccMthw
|
LccMthw MLuhiEa 0.7
|
LccMthw IpltMcal 0.58
|
lccmthw ipltmcal 0.37
|
|
IlmMla
|
IlmMla IpltMcal 0.5
|
IlmMla IpltMcal 0.22
|
ilmmla ipltmcal 0.2
|
|
IpltMcal
|
IpltMcal IlmMla 0.5
|
IpltMcal IlmMla 0.22
|
ipltmcal ilmmla 0.2
|
|
HmelJsu
|
HmelJsu ZuJe 0.62
|
HmelJsu ZuJe 0.43
|
hmeljsu mluhiea 0.29
|
|
HrcEi
|
HrcEi RdiuzEcie 0.7
|
HrcEi RdiuzEcie 0.46
|
hrcei rdiuzecie 0.26
|
|
HrasBa
|
HrasBa BrsvDiry 0.67
|
HrasBa CseCsada 0.43
|
hrasba ssdaartih 0.22
|
|
HlyEhn
|
HlyEhn OesHny 0.67
|
HlyEhn OesHny 0.5
|
hlyehn oeshny 0.28
|
|
GedagJra
|
GedagJra GnroMrAeada 0.55
|
GedagJra GnroMrAeada 0.27
|
gedagjra gnromraeada 0.21
|
|
GnroMrAeada
|
GnroMrAeada GedagJra 0.55
|
GnroMrAeada GedagJra 0.27
|
gnromraeada gedagjra 0.21
|
|
GtcPdo
|
GtcPdo GnroMrAeada 0.75
|
GtcPdo LccMthw 0.59
|
gtcpdo zegrcad 0.54
|
|
FaknegTlr
|
FaknegTlr SliaDne 0.67
|
FaknegTlr GedagJra 0.47
|
faknegtlr gnromraeada 0.39
|
|
FruoCril
|
FruoCril CmoGbil 0.6
|
FruoCril MyrCar 0.44
|
fruocril hrcei 0.43
|
|
CseCsada
|
CseCsada ZegRcad 0.67
|
CseCsada HrasBa 0.43
|
csecsada samdea 0.24
|
|
CnolJsp
|
CnolJsp SollKnn 0.67
|
CnolJsp SollKnn 0.43
|
cnoljsp sollknn 0.32
|
|
CniSa
|
CniSa SliaDne 0.5
|
CniSa SliaDne 0.32
|
cnisa csecsada 0.28
|
|
CaVc
|
CaVc MyrCar 0.71
|
CaVc CseCsada 0.47
|
cavc csecsada 0.35
|
|
CmoGbil
|
CmoGbil FruoCril 0.6
|
CmoGbil MrieGbila 0.43
|
cmogbil mriegbila 0.32
|
|
BrsvDiry
|
BrsvDiry HrasBa 0.67
|
BrsvDiry MyrCar 0.44
|
brsvdiry myrcar 0.44
|
|
AaaoEtbn
|
AaaoEtbn GnroMrAeada 0.67
|
AaaoEtbn GnroMrAeada 0.43
|
aaaoetbn gnromraeada 0.26
|
What about preserving the order of the letters in the names? At least the failed Manhattan distance gave some notion of that.
One solution: Edit distance
Focus of edit distance: Alignment/sequence of letters matters.
R does provide an edit distance function, aDist, in utils.
knitr::include_graphics('ninetyNineMice.png')

all_adists <- data.frame('People' = people, 'Edit Distances' = lowAdists)
all_adists %>%
kbl() %>%
kable_material_dark() %>%
row_spec(33, background = "#D7261E") # tie for most similar title goes to...
|
People
|
Edit.Distances
|
|
ZuJe
|
zuje hrcei 4
|
|
ZegRcad
|
zegrcad myrcar 4
|
|
SliaDne
|
sliadne msodne 4
|
|
SollKnn
|
sollknn sliadne 5
|
|
SnhTg
|
snhtg snhrmia 4
|
|
SnhRmia
|
snhrmia snhtg 4
|
|
SamDea
|
samdea ilmmla 4
|
|
SsdaaRtih
|
ssdaartih snhrmia 6
|
|
SfrZcay
|
sfrzcay myrcar 4
|
|
RdiuzEcie
|
rdiuzecie reesme 6
|
|
ReeSme
|
reesme zuje 5
|
|
OesHny
|
oeshny sollknn 5
|
|
NtTiht
|
nttiht snhtg 5
|
|
MsoDne
|
msodne sliadne 4
|
|
MyrCar
|
myrcar zegrcad 4
|
|
MLuhiEa
|
mluhiea zuje 5
|
|
MrieGbila
|
mriegbila cmogbil 5
|
|
LccMthw
|
lccmthw snhtg 6
|
|
IlmMla
|
ilmmla samdea 4
|
|
IpltMcal
|
ipltmcal ilmmla 4
|
|
HmelJsu
|
hmeljsu cnoljsp 4
|
|
HrcEi
|
hrcei zuje 4
|
|
HrasBa
|
hrasba hrcei 4
|
|
HlyEhn
|
hlyehn hrcei 4
|
|
GedagJra
|
gedagjra zegrcad 6
|
|
GnroMrAeada
|
gnromraeada zegrcad 8
|
|
GtcPdo
|
gtcpdo zegrcad 5
|
|
FaknegTlr
|
faknegtlr aaaoetbn 6
|
|
FruoCril
|
fruocril hrcei 5
|
|
CseCsada
|
csecsada cnisa 5
|
|
CnolJsp
|
cnoljsp hmeljsu 4
|
|
CniSa
|
cnisa snhtg 4
|
|
CaVc
|
cavc zuje 4
|
|
CmoGbil
|
cmogbil mriegbila 5
|
|
BrsvDiry
|
brsvdiry oeshny 6
|
|
AaaoEtbn
|
aaaoetbn samdea 6
|
Keyboard neighbors as weightings for penalties, to assist in spell-correcting, e.g.
a-neighbors: c('q','w','s','z'), b-neighbors: c('v', 'g', 'h', 'n'), c-neighbors: c('x', 'd', 'f', 'v'), d-neighbors: c('e', 'r', 'f', 'c', 'x', 's'), e-neighbors: c('w', 's', 'd', 'r'), ..., z-neighbors: c('a', 's', 'x')
You can enter custom costs: adist(str1, str2, costs = c(insert=1,delete=1,substitute=1.5)) but you can’t enter a function to calculate a cost, e.g. based on whether the letter’s in the neighbors list shown above. If you want to implement your own edit distance metric for something like that (involving dynamic programming), I recommend this video:
Tim Roughgarden’s dynamic programming video for sequence alignment, on Coursera
This week’s chapter only mentions Levershtein metric for gene alignment, but Needleman-Wunsch score is more common, and helps with our names similarity here more. Needleman-Wunsch scores +1 for every match, and -1 for any mismatch or skip. This will help the scores of longer names that actually match on some letters. But we need to negate the score, to keep thinking “lower is more similar”.
#install.packages('NameNeedle')
library(NameNeedle)
n <- needles('fruocril', 'hrcei')
-n[[1]]
## [1] 2
n[[2]]
## [1] "fruocril"
n[[3]]
## [1] "hr**cei*"
findNeedles <- function(stringlist) {
names1 <- stringlist # just copy to initialize return values
names2 <- stringlist
scores <- rep(-1, length(stringlist))
result <- c(names1, names2, scores)
myParams <- defaultNeedleParams
myParams$GAPCHAR <- '=' #the default asterisk triggers markdown in kable frames
for (i in 1:length(stringlist)) {
best <- -Inf
name1 <- ""
name2 <- ""
string1 <- stringlist[i]
others <- stringlist[-c(i)]
for (other in others) {
score <- needles(string1, other, myParams)
if (score[[1]] > best){
best <- score[[1]]
name1 <- score[[2]]
name2 <- score[[3]]
}
}
result[i*3-2] <- name1
result[i*3-1] <- name2
result[i*3] <- best * -1
}
result
}
needlenames <- findNeedles(tolower(people))
needy <- data.frame("People" = rep(people, each=3), "Levenshtein" = rep(lowAdists, each=3), "Needles" = needlenames)
needy %>%
kbl() %>%
collapse_rows(columns = 1:2, valign = "top") %>%
kable_material_dark() %>%
row_spec(c(7:9,40:42,49:51,55:60,100:102), background = "#D7261E") # tie for most similar title goes to...
|
People
|
Levenshtein
|
Needles
|
|
ZuJe
|
zuje hrcei 4
|
=zu=je=
|
|
mluhiea
|
|
3
|
|
ZegRcad
|
zegrcad myrcar 4
|
zegrcad
|
|
=myrcar
|
|
1
|
|
SliaDne
|
sliadne msodne 4
|
=sliadne
|
|
ms==odne
|
|
0
|
|
SollKnn
|
sollknn sliadne 5
|
sol=lknn
|
|
s=liadne
|
|
2
|
|
SnhTg
|
snhtg snhrmia 4
|
snh==tg
|
|
snhrmia
|
|
1
|
|
SnhRmia
|
snhrmia snhtg 4
|
snhrmia
|
|
snh==tg
|
|
1
|
|
SamDea
|
samdea ilmmla 4
|
s==amd=ea
|
|
slia=dne=
|
|
1
|
|
SsdaaRtih
|
ssdaartih snhrmia 6
|
ssdaartih
|
|
=s=nhrmia
|
|
3
|
|
SfrZcay
|
sfrzcay myrcar 4
|
sfrzcay
|
|
myr=car
|
|
1
|
|
RdiuzEcie
|
rdiuzecie reesme 6
|
rdiuzecie
|
|
r===eesme
|
|
3
|
|
ReeSme
|
reesme zuje 5
|
rees=m=e=
|
|
===samdea
|
|
3
|
|
OesHny
|
oeshny sollknn 5
|
==oeshny
|
|
hlye=hn=
|
|
2
|
|
NtTiht
|
nttiht snhtg 5
|
=nttiht=
|
|
sn===htg
|
|
2
|
|
MsoDne
|
msodne sliadne 4
|
ms==odne
|
|
=sliadne
|
|
0
|
|
MyrCar
|
myrcar zegrcad 4
|
=myrcar
|
|
zegrcad
|
|
1
|
|
MLuhiEa
|
mluhiea zuje 5
|
mluhiea
|
|
=zu=je=
|
|
3
|
|
MrieGbila
|
mriegbila cmogbil 5
|
=mriegbila
|
|
cm==ogbil=
|
|
0
|
|
LccMthw
|
lccmthw snhtg 6
|
lccmt=hw
|
|
==nttiht
|
|
4
|
|
IlmMla
|
ilmmla samdea 4
|
i=lmmla=
|
|
ipltmcal
|
|
0
|
|
IpltMcal
|
ipltmcal ilmmla 4
|
ipltmcal
|
|
i=lmmla=
|
|
0
|
|
HmelJsu
|
hmeljsu cnoljsp 4
|
hmeljsu
|
|
cnoljsp
|
|
1
|
|
HrcEi
|
hrcei zuje 4
|
==hrcei=
|
|
snhr=mia
|
|
2
|
|
HrasBa
|
hrasba hrcei 4
|
==hrasba
|
|
snhr=mia
|
|
2
|
|
HlyEhn
|
hlyehn hrcei 4
|
hlye=hn=
|
|
==oeshny
|
|
2
|
|
GedagJra
|
gedagjra zegrcad 6
|
gedagjr=a=
|
|
ze==g=rcad
|
|
2
|
|
GnroMrAeada
|
gnromraeada zegrcad 8
|
gn=romraeada
|
|
snhr=m====ia
|
|
4
|
|
GtcPdo
|
gtcpdo zegrcad 5
|
==gtcpdo
|
|
zegrcad=
|
|
2
|
|
FaknegTlr
|
faknegtlr aaaoetbn 6
|
faknegtlr
|
|
aaaoe=tbn
|
|
3
|
|
FruoCril
|
fruocril hrcei 5
|
fruocril
|
|
hr==cei=
|
|
2
|
|
CseCsada
|
csecsada cnisa 5
|
cse==csada
|
|
=zegrc=ad=
|
|
2
|
|
CnolJsp
|
cnoljsp hmeljsu 4
|
cnoljsp
|
|
hmeljsu
|
|
1
|
|
CniSa
|
cnisa snhtg 4
|
cn==isa
|
|
cnoljsp
|
|
1
|
|
CaVc
|
cavc zuje 4
|
===cavc
|
|
myrca=r
|
|
3
|
|
CmoGbil
|
cmogbil mriegbila 5
|
cm==ogbil=
|
|
=mriegbila
|
|
0
|
|
BrsvDiry
|
brsvdiry oeshny 6
|
brsvdiry
|
|
oes==hny
|
|
4
|
|
AaaoEtbn
|
aaaoetbn samdea 6
|
aaaoetbn=
|
|
===oeshny
|
|
3
|