Let’s compare how similar our names are! But how?

people <- c('ZuJe', 'ZegRcad', 'SliaDne',
              'SollKnn', 'SnhTg', 'SnhRmia', 'SamDea', 
              'SsdaaRtih', 'SfrZcay', 'RdiuzEcie',
              'ReeSme', 'OesHny', 'NtTiht',
              'MsoDne', 'MyrCar', 'MLuhiEa', 
              'MrieGbila','LccMthw', 'IlmMla', 
              'IpltMcal', 'HmelJsu', 'HrcEi',
              'HrasBa', 'HlyEhn', 'GedagJra', 
              'GnroMrAeada', 'GtcPdo',
              'FaknegTlr', 'FruoCril', 'CseCsada', 
              'CnolJsp', 'CniSa', 'CaVc',
              'CmoGbil', 'BrsvDiry', 'AaaoEtbn')
length(people)
## [1] 36
kbl(data.frame('people' = people))
people
ZuJe
ZegRcad
SliaDne
SollKnn
SnhTg
SnhRmia
SamDea
SsdaaRtih
SfrZcay
RdiuzEcie
ReeSme
OesHny
NtTiht
MsoDne
MyrCar
MLuhiEa
MrieGbila
LccMthw
IlmMla
IpltMcal
HmelJsu
HrcEi
HrasBa
HlyEhn
GedagJra
GnroMrAeada
GtcPdo
FaknegTlr
FruoCril
CseCsada
CnolJsp
CniSa
CaVc
CmoGbil
BrsvDiry
AaaoEtbn
knitr::include_graphics('alphabetWheel.png')

Idea: Use a Manhattan-type score for name pairs.

\(d_{Manhattan}(X, Y) = \mid\mid X - Y \mid\mid _{1}\)

\(= \mid x_{1}-y_{1}\mid + \mid x_{2}-y_{2}\mid + \cdot\cdot\cdot\)

Using the wheel as a model of distance, convert the names to lowercase, start with the first letters in each name pair, see how far apart they are on the wheel, and take the average of the distances for each letter pair, to better compare varying lengths.

name_1: ReeSme

name_2: OesHny

dist(R, O) = 3, dist(e, e) = 0, etc.

Start by converting letters to integers, to make the math easier

# Input a word of mixed-case letters, output a vector of 1-52 equivs, 
##  optionally lower-cased with second parameter
string2nums <- function(string, makeLower = FALSE) {
  chars <- unlist(strsplit(string, ''))
  nums <- lapply(chars, function(ch){match(ch, c(letters, LETTERS))})
  nums <- as.numeric(nums)
  if (makeLower) {nums <- nums %% 26}
  nums
}
cat("Convert 'HlyEhn' to integers: ", string2nums('HlyEhn', makeLower = T))
## Convert 'HlyEhn' to integers:  8 12 25 5 8 14

Find the distance between any pair of letters

# how far apart are 2 letters on a wheel?
#  Inputs are 2 integers from 1-26, representing a-z, or A-Z
#  Image taken from https://kodlogs.com/blog/618/alphabet-wheel
cycleDist <- function(letternum1, letternum2) {
  d <- letternum1 - letternum2
  if (d > 0) {
    d <- min(d, letternum2 + 26 - letternum1)
  } else if (d < 0) {
    d <- min(-d, letternum1 + 26 - letternum2)
  }
  d
}
c(cycleDist(1,1), cycleDist(3,22), cycleDist(48,29), cycleDist(5,19))
## [1]  0  7  7 12

Try it out with ‘ReeSme’ and ‘OesHny’

total <- 0
name1 <- string2nums('ReeSme', makeLower = T)
name2 <- string2nums('OesHry', makeLower = T)
for (i in 1:length(name1)) {
  total <- total + cycleDist(name1[i], name2[i])
}
cat("'ReeSme' and 'OesHry' have an average distance of",
    round(total / length(name1), 2), "per letter pair.")
## 'ReeSme' and 'OesHry' have an average distance of 6.17 per letter pair.

Problem: What does this really mean? Are “p” and “q” actually 7 times more similar than “p” and “i”?

Problem: What about names that have different lengths?

—————————————————

Much Better Idea: Use other distance metrics

\(d_{Jaccard}(X, Y) = 1 - \frac{\mid X \cap Y \mid}{\mid X \cup Y \mid}\)

#jaccard can compare strings directly, without converting to ints
jaccard <- function(string1, string2) {
  set1 <- unique(unlist(strsplit(string1, '')))
  set2 <- unique(unlist(strsplit(string2, '')))
  1 - length(intersect(set1, set2)) / length(union(set1, set2))
}
cat("'carjacked' and 'jaccard' have a jaccard distance of", jaccard('carjacked', 'jaccard'))
## 'carjacked' and 'jaccard' have a jaccard distance of 0.2857143

X = ‘carjacked’, Y = ‘jaccard’

\(\frac{\mid carjacked \cap jaccard \mid}{\mid carjacked \cup jaccard \mid}\)

\(= \frac{\mid (c, a, r, j, d) \mid}{\mid (c, a, r, j, d, e, k) \mid}\)

\(= \frac{5}{7}\)

That’s their similarity, so their distance is 1 - that, as calculated before.

\(d_{cosine}(X, Y) = 1 - \frac{X \cdot Y}{\mid\mid X\mid\mid_{2}\cdot\mid\mid Y\mid\mid_{2}}\)

X = ‘bad dad’

Y = ‘cabbed’

\(X = < 2, 1, 0, 3, 0, 0, ... >\)

\(Y = < 1, 2, 1, 1, 1, 0, ... >\)

\(d_{cosine}(X, Y) = 1 - \frac{< 2, 1, 0, 3, 0, 0, ... > \cdot < 1, 2, 1, 1, 1, 0, ... >}{\sqrt{2^{2}+1^{2}+3^{2}}\cdot \sqrt{1^{2}+2^{2}+1^{2}+1^{2}+1^{2}}}\)

cosine <- function(string1, string2, len = 52) {
  nums1 <- string2nums(string1)
  nums2 <- string2nums(string2)
  vec1 <- rep(0, len)
  vec2 <- rep(0, len)
  for (n in nums1) {
    vec1[n] <- vec1[n] + 1
  }
  for (n in nums2) {
    vec2[n] <- vec2[n] + 1
  }
  dot <- sum(vec1 * vec2)
  mags <- sqrt(sum(vec1 * vec1)) * sqrt(sum(vec2 * vec2))
  1 - dot / mags
}

findClosest <- function(stringlist, func) {
  names <- stringlist  # just copy to initialize return values
  scores <- rep(-1, length(stringlist))
  for (i in 1:length(stringlist)) {
    best <- Inf
    name <- ""
    string1 <- stringlist[i]
    others <- stringlist[-c(i)]
    for (other in others) {
      score <- func(string1, other)
      if (score < best){
        best <- score
        name <- other
      }
    }
    names[i] <- name
    scores[i] <- best
  }
  paste(stringlist, names, round(scores, 2))
}
cosines <- findClosest(people, cosine)
jaccards <- findClosest(people, jaccard)
lowCosines <- findClosest(tolower(people), cosine)
lowAdists <- findClosest(tolower(people), adist)
all_sims <- data.frame('People' = people, 'Jaccard' = jaccards,
                       'Cosine' = cosines, 'LowerCosines' = lowCosines)
all_sims %>%
  kbl() %>%
  kable_material_dark() %>%
  row_spec(19:20, background = "#D7261E") # most similar title goes to...
People Jaccard Cosine LowerCosines
ZuJe ZuJe HmelJsu 0.62 ZuJe HmelJsu 0.43 zuje hmeljsu 0.43
ZegRcad ZegRcad GedagJra 0.6 ZegRcad GedagJra 0.4 zegrcad gedagjra 0.24
SliaDne SliaDne SamDea 0.5 SliaDne CniSa 0.32 sliadne cnisa 0.32
SollKnn SollKnn SliaDne 0.67 SollKnn SliaDne 0.43 sollknn cnoljsp 0.32
SnhTg SnhTg SnhRmia 0.67 SnhTg SnhRmia 0.49 snhtg nttiht 0.35
SnhRmia SnhRmia SsdaaRtih 0.5 SnhRmia SsdaaRtih 0.32 snhrmia ssdaartih 0.27
SamDea SamDea SliaDne 0.5 SamDea SliaDne 0.33 samdea csecsada 0.24
SsdaaRtih SsdaaRtih SnhRmia 0.5 SsdaaRtih SnhRmia 0.32 ssdaartih hrasba 0.22
SfrZcay SfrZcay MyrCar 0.67 SfrZcay MyrCar 0.47 sfrzcay myrcar 0.33
RdiuzEcie RdiuzEcie ZegRcad 0.64 RdiuzEcie HrcEi 0.46 rdiuzecie hrcei 0.26
ReeSme ReeSme SamDea 0.5 ReeSme SamDea 0.49 reesme msodne 0.41
OesHny OesHny MsoDne 0.67 OesHny MsoDne 0.5 oeshny hlyehn 0.28
NtTiht NtTiht SsdaaRtih 0.7 NtTiht SsdaaRtih 0.57 nttiht snhtg 0.35
MsoDne MsoDne GnroMrAeada 0.64 MsoDne OesHny 0.5 msodne oeshny 0.33
MyrCar MyrCar SfrZcay 0.67 MyrCar GnroMrAeada 0.36 myrcar sfrzcay 0.33
MLuhiEa MLuhiEa LccMthw 0.7 MLuhiEa SsdaaRtih 0.54 mluhiea ilmmla 0.28
MrieGbila MrieGbila GnroMrAeada 0.58 MrieGbila SliaDne 0.43 mriegbila mluhiea 0.32
LccMthw LccMthw MLuhiEa 0.7 LccMthw IpltMcal 0.58 lccmthw ipltmcal 0.37
IlmMla IlmMla IpltMcal 0.5 IlmMla IpltMcal 0.22 ilmmla ipltmcal 0.2
IpltMcal IpltMcal IlmMla 0.5 IpltMcal IlmMla 0.22 ipltmcal ilmmla 0.2
HmelJsu HmelJsu ZuJe 0.62 HmelJsu ZuJe 0.43 hmeljsu mluhiea 0.29
HrcEi HrcEi RdiuzEcie 0.7 HrcEi RdiuzEcie 0.46 hrcei rdiuzecie 0.26
HrasBa HrasBa BrsvDiry 0.67 HrasBa CseCsada 0.43 hrasba ssdaartih 0.22
HlyEhn HlyEhn OesHny 0.67 HlyEhn OesHny 0.5 hlyehn oeshny 0.28
GedagJra GedagJra GnroMrAeada 0.55 GedagJra GnroMrAeada 0.27 gedagjra gnromraeada 0.21
GnroMrAeada GnroMrAeada GedagJra 0.55 GnroMrAeada GedagJra 0.27 gnromraeada gedagjra 0.21
GtcPdo GtcPdo GnroMrAeada 0.75 GtcPdo LccMthw 0.59 gtcpdo zegrcad 0.54
FaknegTlr FaknegTlr SliaDne 0.67 FaknegTlr GedagJra 0.47 faknegtlr gnromraeada 0.39
FruoCril FruoCril CmoGbil 0.6 FruoCril MyrCar 0.44 fruocril hrcei 0.43
CseCsada CseCsada ZegRcad 0.67 CseCsada HrasBa 0.43 csecsada samdea 0.24
CnolJsp CnolJsp SollKnn 0.67 CnolJsp SollKnn 0.43 cnoljsp sollknn 0.32
CniSa CniSa SliaDne 0.5 CniSa SliaDne 0.32 cnisa csecsada 0.28
CaVc CaVc MyrCar 0.71 CaVc CseCsada 0.47 cavc csecsada 0.35
CmoGbil CmoGbil FruoCril 0.6 CmoGbil MrieGbila 0.43 cmogbil mriegbila 0.32
BrsvDiry BrsvDiry HrasBa 0.67 BrsvDiry MyrCar 0.44 brsvdiry myrcar 0.44
AaaoEtbn AaaoEtbn GnroMrAeada 0.67 AaaoEtbn GnroMrAeada 0.43 aaaoetbn gnromraeada 0.26

What about preserving the order of the letters in the names? At least the failed Manhattan distance gave some notion of that.

One solution: Edit distance

Focus of edit distance: Alignment/sequence of letters matters.

R does provide an edit distance function, aDist, in utils.

knitr::include_graphics('ninetyNineMice.png')

all_adists <- data.frame('People' = people, 'Edit Distances' = lowAdists)
all_adists %>%
  kbl() %>%
  kable_material_dark() %>%
  row_spec(33, background = "#D7261E") # tie for most similar title goes to...
People Edit.Distances
ZuJe zuje hrcei 4
ZegRcad zegrcad myrcar 4
SliaDne sliadne msodne 4
SollKnn sollknn sliadne 5
SnhTg snhtg snhrmia 4
SnhRmia snhrmia snhtg 4
SamDea samdea ilmmla 4
SsdaaRtih ssdaartih snhrmia 6
SfrZcay sfrzcay myrcar 4
RdiuzEcie rdiuzecie reesme 6
ReeSme reesme zuje 5
OesHny oeshny sollknn 5
NtTiht nttiht snhtg 5
MsoDne msodne sliadne 4
MyrCar myrcar zegrcad 4
MLuhiEa mluhiea zuje 5
MrieGbila mriegbila cmogbil 5
LccMthw lccmthw snhtg 6
IlmMla ilmmla samdea 4
IpltMcal ipltmcal ilmmla 4
HmelJsu hmeljsu cnoljsp 4
HrcEi hrcei zuje 4
HrasBa hrasba hrcei 4
HlyEhn hlyehn hrcei 4
GedagJra gedagjra zegrcad 6
GnroMrAeada gnromraeada zegrcad 8
GtcPdo gtcpdo zegrcad 5
FaknegTlr faknegtlr aaaoetbn 6
FruoCril fruocril hrcei 5
CseCsada csecsada cnisa 5
CnolJsp cnoljsp hmeljsu 4
CniSa cnisa snhtg 4
CaVc cavc zuje 4
CmoGbil cmogbil mriegbila 5
BrsvDiry brsvdiry oeshny 6
AaaoEtbn aaaoetbn samdea 6

Problem: Edit distance rewards short inputs too much. What about different weightings?

knitr::include_graphics('keysPhoto.png')

Keyboard neighbors as weightings for penalties, to assist in spell-correcting, e.g.

a-neighbors: c('q','w','s','z'), b-neighbors: c('v', 'g', 'h', 'n'), c-neighbors: c('x', 'd', 'f', 'v'), d-neighbors: c('e', 'r', 'f', 'c', 'x', 's'), e-neighbors: c('w', 's', 'd', 'r'), ..., z-neighbors: c('a', 's', 'x')

You can enter custom costs: adist(str1, str2, costs = c(insert=1,delete=1,substitute=1.5)) but you can’t enter a function to calculate a cost, e.g. based on whether the letter’s in the neighbors list shown above. If you want to implement your own edit distance metric for something like that (involving dynamic programming), I recommend this video:

Tim Roughgarden’s dynamic programming video for sequence alignment, on Coursera

This week’s chapter only mentions Levershtein metric for gene alignment, but Needleman-Wunsch score is more common, and helps with our names similarity here more. Needleman-Wunsch scores +1 for every match, and -1 for any mismatch or skip. This will help the scores of longer names that actually match on some letters. But we need to negate the score, to keep thinking “lower is more similar”.
#install.packages('NameNeedle')
library(NameNeedle)
n <- needles('fruocril', 'hrcei')
-n[[1]]
## [1] 2
n[[2]]
## [1] "fruocril"
n[[3]]
## [1] "hr**cei*"
findNeedles <- function(stringlist) {
  names1 <- stringlist  # just copy to initialize return values
  names2 <- stringlist
  scores <- rep(-1, length(stringlist))
  result <- c(names1, names2, scores)
  myParams <- defaultNeedleParams
  myParams$GAPCHAR <- '=' #the default asterisk triggers markdown in kable frames
  for (i in 1:length(stringlist)) {
    best <- -Inf
    name1 <- ""
    name2 <- ""
    string1 <- stringlist[i]
    others <- stringlist[-c(i)]
    for (other in others) {
      score <- needles(string1, other, myParams)
      if (score[[1]] > best){
        best <- score[[1]]
        name1 <- score[[2]]
        name2 <- score[[3]]
      }
    }
    result[i*3-2] <- name1
    result[i*3-1] <- name2
    result[i*3] <- best * -1
  }
  result
}
needlenames <- findNeedles(tolower(people))
needy <- data.frame("People" = rep(people, each=3), "Levenshtein" = rep(lowAdists, each=3), "Needles" = needlenames)
needy %>% 
  kbl() %>% 
  collapse_rows(columns = 1:2, valign = "top") %>%
  kable_material_dark() %>%
  row_spec(c(7:9,40:42,49:51,55:60,100:102), background = "#D7261E") # tie for most similar title goes to...
People Levenshtein Needles
ZuJe zuje hrcei 4 =zu=je=
mluhiea
3
ZegRcad zegrcad myrcar 4 zegrcad
=myrcar
1
SliaDne sliadne msodne 4 =sliadne
ms==odne
0
SollKnn sollknn sliadne 5 sol=lknn
s=liadne
2
SnhTg snhtg snhrmia 4 snh==tg
snhrmia
1
SnhRmia snhrmia snhtg 4 snhrmia
snh==tg
1
SamDea samdea ilmmla 4 s==amd=ea
slia=dne=
1
SsdaaRtih ssdaartih snhrmia 6 ssdaartih
=s=nhrmia
3
SfrZcay sfrzcay myrcar 4 sfrzcay
myr=car
1
RdiuzEcie rdiuzecie reesme 6 rdiuzecie
r===eesme
3
ReeSme reesme zuje 5 rees=m=e=
===samdea
3
OesHny oeshny sollknn 5 ==oeshny
hlye=hn=
2
NtTiht nttiht snhtg 5 =nttiht=
sn===htg
2
MsoDne msodne sliadne 4 ms==odne
=sliadne
0
MyrCar myrcar zegrcad 4 =myrcar
zegrcad
1
MLuhiEa mluhiea zuje 5 mluhiea
=zu=je=
3
MrieGbila mriegbila cmogbil 5 =mriegbila
cm==ogbil=
0
LccMthw lccmthw snhtg 6 lccmt=hw
==nttiht
4
IlmMla ilmmla samdea 4 i=lmmla=
ipltmcal
0
IpltMcal ipltmcal ilmmla 4 ipltmcal
i=lmmla=
0
HmelJsu hmeljsu cnoljsp 4 hmeljsu
cnoljsp
1
HrcEi hrcei zuje 4 ==hrcei=
snhr=mia
2
HrasBa hrasba hrcei 4 ==hrasba
snhr=mia
2
HlyEhn hlyehn hrcei 4 hlye=hn=
==oeshny
2
GedagJra gedagjra zegrcad 6 gedagjr=a=
ze==g=rcad
2
GnroMrAeada gnromraeada zegrcad 8 gn=romraeada
snhr=m====ia
4
GtcPdo gtcpdo zegrcad 5 ==gtcpdo
zegrcad=
2
FaknegTlr faknegtlr aaaoetbn 6 faknegtlr
aaaoe=tbn
3
FruoCril fruocril hrcei 5 fruocril
hr==cei=
2
CseCsada csecsada cnisa 5 cse==csada
=zegrc=ad=
2
CnolJsp cnoljsp hmeljsu 4 cnoljsp
hmeljsu
1
CniSa cnisa snhtg 4 cn==isa
cnoljsp
1
CaVc cavc zuje 4 ===cavc
myrca=r
3
CmoGbil cmogbil mriegbila 5 cm==ogbil=
=mriegbila
0
BrsvDiry brsvdiry oeshny 6 brsvdiry
oes==hny
4
AaaoEtbn aaaoetbn samdea 6 aaaoetbn=
===oeshny
3