# Build two word lists (wl1 and wl2) of 10,000 random words each; every word
# is 10 lowercase letters drawn with replacement.
wl1 <- vapply(
  seq_len(1e4),
  \(i) paste(sample(letters, size = 10, replace = TRUE), collapse = ""),
  character(1)
)
head(wl1)
## [1] "pyzepwbehj" "tosphrleny" "fkyazfekcd" "jxdhpagxxv" "qsppdoorsa"
## [6] "balhpjqaki"
wl2 <- vapply(
  seq_len(1e4),
  \(i) paste(sample(letters, size = 10, replace = TRUE), collapse = ""),
  character(1)
)
head(wl2, 10)
## [1] "jhqyufjvhq" "hskqecvfvy" "kefwuxyfnl" "bccrdzibzn" "ojrfosndvy"
## [6] "swhehnmnpg" "dzfhstbjgj" "fbwgtrszlb" "meydovynpa" "rgqayonhcx"
# Make sure about 1000 words are in both lists: overwrite 1000 randomly chosen
# slots of wl2 with 1000 randomly chosen words from wl1.
wl2[sample.int(n = 1e4, size = 1000)] <- wl1[sample.int(n = 1e4, size = 1000)]
# An example with explicit iteration and a hash table (utils::hashtab) -- this
# can definitely be improved upon.
hash <- hashtab(size = 10000)
# Put every word of wl1 into the hash table as a key (value TRUE); done with a
# plain for loop, just for fun.
for (word in wl1) {
  sethash(hash, word, TRUE)
}
# Look up each word of wl2 in the table: gethash() yields the stored TRUE on a
# hit and the `nomatch` value FALSE on a miss. vapply() pins the result to a
# logical vector (named by word) regardless of the input, whereas sapply()
# would return list() for an empty wl2 and break the sum() further down.
wl2.in.wl1 <- vapply(
  wl2,
  \(word) gethash(hash, word, nomatch = FALSE),
  logical(1)
)
head(wl2.in.wl1, 10)
## jhqyufjvhq hskqecvfvy kefwuxyfnl bccrdzibzn ojrfosndvy swhehnmnpg dzfhstbjgj
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## fbwgtrszlb lrvoachjmo rgqayonhcx
## FALSE TRUE FALSE
# how many did we retrieve?
sum(wl2.in.wl1)
## [1] 1000
# Time both phases. NOTE: `hash` is already populated at this point, so `store`
# measures overwriting existing keys, not inserting into an empty table.
microbenchmark::microbenchmark(
  store = for (word in wl1) sethash(hash, word, TRUE),
  lookup = wl2.in.wl1 <- vapply(
    wl2,
    \(word) gethash(hash, word, nomatch = FALSE),
    logical(1)
  ),
  times = 1000
)
# NOTE: the timings shown here were recorded with an sapply()-based lookup;
# re-run the benchmark to refresh them.
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## store 11.0593 11.7742 13.05586 12.7431 13.8254 52.5560 1000 a
## lookup 15.9881 17.0171 18.63056 18.3363 19.4883 60.6511 1000 b
# Without a loop
# Vectorised alternative: base R's set intersection yields the words that
# occur in both lists.
common.words <- intersect(x = wl1, y = wl2)
length(common.words)
## [1] 1000
microbenchmark::microbenchmark(
  intersect = intersect(x = wl1, y = wl2)
)
## Unit: microseconds
## expr min lq mean median uq max neval
## intersect 528.001 726.201 864.874 744.7505 831.4515 6262.901 100
# Data-frame variant: wrap wl1 in a one-column data frame (column `wl1`) and
# keep the rows whose word also occurs in wl2.
# The filter has to be `wl1 %in% wl2`, a mask over wl1's own rows. The reversed
# form `wl2 %in% wl1` is a mask over positions in wl2; applied to wl1's rows it
# keeps the right *number* of rows but unrelated words.
common.words <- data.frame(wl1 = wl1) |>
  subset(wl1 %in% wl2)
nrow(common.words)
## [1] 1000
microbenchmark::microbenchmark(
  subset = data.frame(wl1 = wl1) |> subset(wl1 %in% wl2)
)
## Unit: microseconds
## expr min lq mean median uq max neval
## subset 718.201 792.4505 949.36 861.6015 923.851 6206.7 100