Common string Problem

Common string

match function in R will find the index of common elements of two items. match(A,B).
Thus, the common strings can be found by testing match\((A_i,A_{\neg i})\).
When we have \(N\) items, this requires \(D \times N\) Matrix.
By putting shortest string at the front, \((\min {D_i}) \times N\) matrix.
Below code will show how to do this, and this is actually good because it doesn’t require all the possible combinations of sub-strings.

Test string

library(dplyr)
library(magrittr)
rm(list = ls())
x = c("나는 정말로 행복합니다.", "너도 행복하니 정말로?", "이곳은 정말로 행복한 세상", 
    "너는 정말로 행복하니?", "그곳은 정말로 행복한가요?")
print(x)

## [1] "나는 정말로 행복합니다."   "너도 행복하니 정말로?"    
## [3] "이곳은 정말로 행복한 세상" "너는 정말로 행복하니?"    
## [5] "그곳은 정말로 행복한가요?"

Shortest string

shortest_x is the shortest string.

shortest_ix <- which.min(sapply(x, nchar))  # find shortest charater set (index)
shortest_x <- x[shortest_ix]  # find shortest charater set
s_x_splt <- unlist(strsplit(shortest_x, split = ""))  # splitstr
sprintf("Shortest string is: %s", shortest_x)

## [1] "Shortest string is: 너도 행복하니 정말로?"

print("Splitted version: ")

## [1] "Splitted version: "

print(s_x_splt)

##  [1] "너" "도" " "  "행" "복" "하" "니" " "  "정" "말" "로" "?"

# if want to exclude ' ', then include below line.  s_x_splt[s_x_splt=='
# ']<-'&' # This is to remove ' '. This works only for simple case...
x <- x[-shortest_ix]  # remove shortest one  # This is A_{-i}

Match over the matrix

In the input, we have 5 strings. I removed the shortest one from x matrix.
The shortest \(shortest_x\) will be compared with other elements in \(\textbf{X}=[\textbf{x}_1,\textbf{x}_2,\textbf{x}_3,\textbf{x}_4]\) matrix using match function.
Below tables show the results. Each rows indicates the the element in shortest x is matched with the other elemnt in \(\textbf{X}\) matrix one by one. For example, the first element in shortest x, is observed only in \(\textbf{x}_3\).

na_mat <- sapply(seq(1, length(x)), function(i) {
    match(s_x_splt, unlist(strsplit(x[i], split = "")))
})
knitr::kable(data.frame(na_mat) %>% mutate(shortest = s_x_splt), row.names = T)

	X1	X2	X3	X4	shortest
1	NA	NA	1	NA	너
2	NA	NA	NA	NA	도
3	3	4	3	4
4	8	9	8	9	행
5	9	10	9	10	복
6	NA	NA	10	NA	하
7	11	NA	11	NA	니
8	3	4	3	4
9	4	5	4	5	정
10	5	6	5	6	말
11	6	7	6	7	로
12	NA	NA	12	14	?

The remove_na shows that the location of common string. In the below example, you can oberve numeric in 3, 4, 5 and 8,9,10,11 rows.
This indicates there are two common words {3,4,5} and {8,9,10,11} over the dataset. and , 정, 말, 로
Now our task is to find the longest one and get the string from the shortest string.

remove_na <- rowMeans(na_mat)  # find common strings (non-na)
knitr::kable(data.frame(remove_na) %>% mutate(shortest = s_x_splt) %>% set_rownames(seq(1, 
    length(remove_na))), row.names = T)

	remove_na	shortest
1	NA	너
2	NA	도
3	3.5
4	8.5	행
5	9.5	복
6	NA	하
7	NA	니
8	3.5
9	4.5	정
10	5.5	말
11	6.5	로
12	NA	?

I am not going to explain below codes since it is really bad. There might be bettwe ways..
But, as you can see, we can get correct output.

# remove_01<-sapply(remove_na,function(x){ifelse(is.na(remove_na[x]),0,1)})
# below is to find the longest substring.. there might be better ways.. like
# rle function
remove_na_bk <- remove_na
remove_na[!is.na(remove_na_bk)] <- 1
remove_na[is.na(remove_na_bk)] <- 0

d_r_na <- diff(remove_na)
change_point <- which(d_r_na == -1)
start_point <- which(d_r_na == 1)
if (remove_na[1] == 1) {
    startpoint = c(0, start_point)
}
lengths <- change_point - start_point
lenn <- lengths[which.max(lengths)]
endp <- change_point[which.max(lengths)]
output <- s_x_splt[(endp - lenn + 1):(endp)]
print(paste(output, collapse = ""))

## [1] " 정말로"

Comparison

# mhLCS().R
rm(list = ls())
### Original code: http://rpubs.com/cardiomoon/280342

allsubstr = function(x) {
    res = c()
    for (i in 1:nchar(x)) res = c(res, unique(substring(x, i, i:nchar(x))))
    res
}


myLCS_sub = function(a, b) {
    result = intersect(allsubstr(a), allsubstr(b))
    if (length(result) == 0) 
        return(NULL)
    result[which.max(nchar(result))]
}



# @ ------
myLCS = function(x) {
    Reduce(myLCS_sub, x)
}
# myLCS

allsubstr1 = function(x) {
    unlist(sapply(seq_len(nchar(x)), function(i) unique(substring(x, i, i:nchar(x)))))
    
}
myLCS_sub1 = function(a, b) {
    result = intersect(allsubstr1(a), allsubstr1(b))
    if (length(result) == 0) 
        return(NULL)
    result[which.max(nchar(result))]
}
myLCS1 = function(x) {
    Reduce(myLCS_sub1, x)
}
######### myLCS1(x)

allsubstr2 = function(x, max = NULL) {
    res = c()
    if (is.null(max)) 
        max = nchar(x)
    for (i in 1:max) res = c(res, unique(substring(x, 1:(nchar(x) - i + 1), 
        i:nchar(x))))
    res
}


myLCS_sub2 = function(a, b) {
    n = min(nchar(a), nchar(b))
    result = intersect(allsubstr2(a, n), allsubstr2(b, n))
    if (length(result) == 0) 
        return(NULL)
    result[which.max(nchar(result))]
}
myLCS2 = function(x) {
    Reduce(myLCS_sub2, x)
}


# @ mh -----
mhLCS_sub = function(a, b) {
    if (nchar(a) < nchar(b)) {
        tmp = a
        a = b
        b = tmp
    }
    a.vec = unlist(strsplit(a, ""))
    b.vec = unlist(strsplit(b, ""))
    res = c()
    for (i in 1:length(a.vec)) {
        for (j in which(b.vec == a.vec[i])) {
            k = kk = 0
            while (kk == 0) {
                kk = 1
                if ((i + k + 1) <= length(a.vec)) {
                  if ((j + k + 1) <= length(b.vec)) {
                    if (identical(a.vec[i:(i + k + 1)], b.vec[j:(j + k + 1)])) {
                      k = k + 1
                      kk = 0
                    }
                  }
                }
            }
            res = c(res, Reduce(paste0, a.vec[i:(i + k)]))
        }
    }
    res[which.max(nchar(res))]
}


mhLCS = function(x) {
    Reduce(mhLCS_sub, x)
}

###### mhLCS##########


# library(magrittr)
my_str <- function(x) {
    shortest_ix <- which.min(sapply(x, nchar))  # find shortest charater set (index)
    shortest_x <- x[shortest_ix]  # find shortest charater set
    s_x_splt <- unlist(strsplit(shortest_x, split = ""))  # splitstr
    # if want to exclude ' ', then include below line.  s_x_splt[s_x_splt=='
    # ']<-'&' # This is to remove ' '. This works only for simple case...
    x <- x[-shortest_ix]  # remove shortest one
    
    # this function compares splited shortest character vs the other one (use
    # match function)
    
    # mf<-function(s_x_splt,x_i){
    # return(match(s_x_splt,unlist(strsplit(x_i,split = '')))) } # use sapply.
    # This takes large Matrix O(shortest dim x data number)
    # na_mat<-sapply(seq(1,length(x)),function(i){mf(s_x_splt,x[i])}) below
    # na_mat ~ code is equivalent to above 5 lines.
    na_mat <- sapply(seq(1, length(x)), function(i) {
        match(s_x_splt, unlist(strsplit(x[i], split = "")))
    })
    
    remove_na <- rowMeans(na_mat)  # find common strings (non-na)
    
    # remove_01<-sapply(remove_na,function(x){ifelse(is.na(remove_na[x]),0,1)})
    # below is to find the longest substring.. there might be better ways.. like
    # rle function
    remove_na_bk <- remove_na
    remove_na[!is.na(remove_na_bk)] <- 1
    remove_na[is.na(remove_na_bk)] <- 0
    
    d_r_na <- diff(remove_na)
    change_point <- which(d_r_na == -1)
    start_point <- which(d_r_na == 1)
    if (remove_na[1] == 1) {
        startpoint = c(0, start_point)
    }
    lengths <- change_point - start_point
    lenn <- lengths[which.max(lengths)]
    endp <- change_point[which.max(lengths)]
    output <- s_x_splt[(endp - lenn + 1):(endp)]
    return(paste(output, collapse = ""))
}


x = c("나는 정말로 행복합니다.", "너도 행복하니 정말로?", "이곳은 정말로 행복한 세상", 
    "너는 정말로 행복하니?", "그곳은 정말로 행복한가요?")
x <- paste0(1:100, x)
# x=paste0(1:100,'행복',100:1)


# 
library(microbenchmark)
res <- microbenchmark(myLCS(x), myLCS2(x), mhLCS(x), my_str(x))
library(ggplot2)
print(res)

## Unit: microseconds
##       expr       min        lq       mean    median        uq      max
##   myLCS(x) 22238.621 23554.827 24327.3778 24088.310 24892.665 34774.49
##  myLCS2(x) 10873.077 11682.991 12462.0205 12066.691 12734.933 20896.13
##   mhLCS(x)  5411.149  5855.843  6560.3561  6194.773  6901.476 23761.40
##  my_str(x)   640.901   737.802   974.6081   779.868   825.690 15829.32
##  neval
##    100
##    100
##    100
##    100

autoplot(res)