# Build two word lists, each holding 10,000 random "words" made of ten
# lower-case letters (letters are drawn with replacement).
wl1 <- vapply(
  seq_len(1e4),
  \(i) paste(sample(letters, size = 10, replace = TRUE), collapse = ""),
  character(1)
)
head(wl1, n = 6)

## [1] "pyzepwbehj" "tosphrleny" "fkyazfekcd" "jxdhpagxxv" "qsppdoorsa"
## [6] "balhpjqaki"

wl2 <- vapply(
  seq_len(1e4),
  \(i) paste(sample(letters, size = 10, replace = TRUE), collapse = ""),
  character(1)
)
head(wl2, n = 10)

##  [1] "jhqyufjvhq" "hskqecvfvy" "kefwuxyfnl" "bccrdzibzn" "ojrfosndvy"
##  [6] "swhehnmnpg" "dzfhstbjgj" "fbwgtrszlb" "meydovynpa" "rgqayonhcx"

# Make sure about 1000 words occur in both lists: overwrite 1000 randomly
# chosen slots of wl2 with 1000 randomly chosen words of wl1.
wl2 <- replace(
  wl2,
  list = sample.int(1e4, 1000),
  values = wl1[sample.int(1e4, 1000)]
)

# An example with an explicit loop and a hash table (hashtab) -- this can
# definitely be improved on.
hash <- hashtab(size = 10000)

# Store every word of wl1 as a key with the value TRUE; the for loop is used
# here just for fun.
for (word in wl1) sethash(hash, word, TRUE)

# Look up every word of wl2 in the table; words that are not a key give FALSE.
# vapply() pins the result to exactly one logical per word, so the return
# type cannot silently change the way it can with sapply().
wl2.in.wl1 <- vapply(
  wl2,
  \(word) gethash(hash, word, nomatch = FALSE),
  logical(1)
)
head(wl2.in.wl1, 10)

## jhqyufjvhq hskqecvfvy kefwuxyfnl bccrdzibzn ojrfosndvy swhehnmnpg dzfhstbjgj 
##      FALSE      FALSE      FALSE      FALSE      FALSE      FALSE      FALSE 
## fbwgtrszlb lrvoachjmo rgqayonhcx 
##      FALSE       TRUE      FALSE

# How many words of wl2 did we find in wl1?
sum(wl2.in.wl1)

## [1] 1000

# Time both phases of the hash-table approach, 1000 repetitions each:
#   store  -- write every word of wl1 into the table `hash`
#   lookup -- search every word of wl2 and assign the result to wl2.in.wl1
# The lookup uses vapply() so the benchmarked call is type-stable (always one
# logical per word).
microbenchmark::microbenchmark(
  store = for (word in wl1) sethash(hash, word, TRUE),
  lookup = wl2.in.wl1 <- vapply(
    wl2,
    \(word) gethash(hash, word, nomatch = FALSE),
    logical(1)
  ),
  times = 1000
)

## Unit: milliseconds
##    expr     min      lq     mean  median      uq     max neval cld
##   store 11.0593 11.7742 13.05586 12.7431 13.8254 52.5560  1000  a 
##  lookup 15.9881 17.0171 18.63056 18.3363 19.4883 60.6511  1000   b

Without a loop

# Set intersection of the two word lists in a single vectorised call --
# no explicit loop and no hash table.
common.words <- intersect(x = wl1, y = wl2)
length(common.words)

## [1] 1000

# Time the same call.
microbenchmark::microbenchmark(
  intersect = intersect(wl1, wl2)
)

## Unit: microseconds
##       expr     min      lq    mean   median       uq      max neval
##  intersect 528.001 726.201 864.874 744.7505 831.4515 6262.901   100

# Same search via a data frame: wrap wl1 in a one-column data frame and keep
# the rows whose word also occurs in wl2.
# The condition must test the data frame's own words (wl1 %in% wl2). The
# reverse test, wl2 %in% wl1, flags positions of wl2; applied to the rows of
# wl1 it returns the same number of rows but not the shared words.
common.words <- wl1 |>
  data.frame() |>
  subset(wl1 %in% wl2)
nrow(common.words)

## [1] 1000

# Time the same pipeline.
microbenchmark::microbenchmark(
  subset = wl1 |> data.frame() |> subset(wl1 %in% wl2)
)

## Unit: microseconds
##    expr     min       lq   mean   median      uq    max neval
##  subset 718.201 792.4505 949.36 861.6015 923.851 6206.7   100

How to search 10000 words in 10000 words

2022-09-10

Without a loop