match function in R will find the index of common elements of two items. match(A,B).
Thus, the common strings can be found by testing match\((A_i,A_{\neg i})\).
When we have \(N\) items, this requires \(D \times N\) Matrix.
By putting shortest string at the front, \((\min {D_i}) \times N\) matrix.
Below code will show how to do this, and this is actually good because it doesn’t require all the possible combinations of sub-strings.
library(dplyr)
library(magrittr)
rm(list = ls())
x = c("나는 정말로 행복합니다.", "너도 행복하니 정말로?", "이곳은 정말로 행복한 세상",
"너는 정말로 행복하니?", "그곳은 정말로 행복한가요?")
print(x)
## [1] "나는 정말로 행복합니다." "너도 행복하니 정말로?"
## [3] "이곳은 정말로 행복한 세상" "너는 정말로 행복하니?"
## [5] "그곳은 정말로 행복한가요?"
shortest_x is the shortest string.
shortest_ix <- which.min(sapply(x, nchar)) # find shortest charater set (index)
shortest_x <- x[shortest_ix] # find shortest charater set
s_x_splt <- unlist(strsplit(shortest_x, split = "")) # splitstr
sprintf("Shortest string is: %s", shortest_x)
## [1] "Shortest string is: 너도 행복하니 정말로?"
print("Splitted version: ")
## [1] "Splitted version: "
print(s_x_splt)
## [1] "너" "도" " " "행" "복" "하" "니" " " "정" "말" "로" "?"
# if want to exclude ' ', then include below line. s_x_splt[s_x_splt=='
# ']<-'&' # This is to remove ' '. This works only for simple case...
x <- x[-shortest_ix] # remove shortest one # This is A_{-i}
In the input, we have 5 strings. I removed the shortest one from x matrix.
The shortest \(shortest_x\) will be compared with other elements in \(\textbf{X}=[\textbf{x}_1,\textbf{x}_2,\textbf{x}_3,\textbf{x}_4]\) matrix using match function.
Below tables show the results. Each rows indicates the the element in shortest x is matched with the other elemnt in \(\textbf{X}\) matrix one by one. For example, the first element in shortest x, is observed only in \(\textbf{x}_3\).
na_mat <- sapply(seq(1, length(x)), function(i) {
match(s_x_splt, unlist(strsplit(x[i], split = "")))
})
knitr::kable(data.frame(na_mat) %>% mutate(shortest = s_x_splt), row.names = T)
| X1 | X2 | X3 | X4 | shortest | |
|---|---|---|---|---|---|
| 1 | NA | NA | 1 | NA | 너 |
| 2 | NA | NA | NA | NA | 도 |
| 3 | 3 | 4 | 3 | 4 | |
| 4 | 8 | 9 | 8 | 9 | 행 |
| 5 | 9 | 10 | 9 | 10 | 복 |
| 6 | NA | NA | 10 | NA | 하 |
| 7 | 11 | NA | 11 | NA | 니 |
| 8 | 3 | 4 | 3 | 4 | |
| 9 | 4 | 5 | 4 | 5 | 정 |
| 10 | 5 | 6 | 5 | 6 | 말 |
| 11 | 6 | 7 | 6 | 7 | 로 |
| 12 | NA | NA | 12 | 14 | ? |
The remove_na shows that the location of common string. In the below example, you can oberve numeric in 3, 4, 5 and 8,9,10,11 rows.
This indicates there are two common words {3,4,5} and {8,9,10,11} over the dataset. and , 정, 말, 로
Now our task is to find the longest one and get the string from the shortest string.
remove_na <- rowMeans(na_mat) # find common strings (non-na)
knitr::kable(data.frame(remove_na) %>% mutate(shortest = s_x_splt) %>% set_rownames(seq(1,
length(remove_na))), row.names = T)
| remove_na | shortest | |
|---|---|---|
| 1 | NA | 너 |
| 2 | NA | 도 |
| 3 | 3.5 | |
| 4 | 8.5 | 행 |
| 5 | 9.5 | 복 |
| 6 | NA | 하 |
| 7 | NA | 니 |
| 8 | 3.5 | |
| 9 | 4.5 | 정 |
| 10 | 5.5 | 말 |
| 11 | 6.5 | 로 |
| 12 | NA | ? |
I am not going to explain below codes since it is really bad. There might be bettwe ways..
But, as you can see, we can get correct output.
# remove_01<-sapply(remove_na,function(x){ifelse(is.na(remove_na[x]),0,1)})
# below is to find the longest substring.. there might be better ways.. like
# rle function
remove_na_bk <- remove_na
remove_na[!is.na(remove_na_bk)] <- 1
remove_na[is.na(remove_na_bk)] <- 0
d_r_na <- diff(remove_na)
change_point <- which(d_r_na == -1)
start_point <- which(d_r_na == 1)
if (remove_na[1] == 1) {
startpoint = c(0, start_point)
}
lengths <- change_point - start_point
lenn <- lengths[which.max(lengths)]
endp <- change_point[which.max(lengths)]
output <- s_x_splt[(endp - lenn + 1):(endp)]
print(paste(output, collapse = ""))
## [1] " 정말로"
# mhLCS().R
rm(list = ls())
### Original code: http://rpubs.com/cardiomoon/280342
allsubstr = function(x) {
res = c()
for (i in 1:nchar(x)) res = c(res, unique(substring(x, i, i:nchar(x))))
res
}
myLCS_sub = function(a, b) {
result = intersect(allsubstr(a), allsubstr(b))
if (length(result) == 0)
return(NULL)
result[which.max(nchar(result))]
}
# @ ------
myLCS = function(x) {
Reduce(myLCS_sub, x)
}
# myLCS
allsubstr1 = function(x) {
unlist(sapply(seq_len(nchar(x)), function(i) unique(substring(x, i, i:nchar(x)))))
}
myLCS_sub1 = function(a, b) {
result = intersect(allsubstr1(a), allsubstr1(b))
if (length(result) == 0)
return(NULL)
result[which.max(nchar(result))]
}
myLCS1 = function(x) {
Reduce(myLCS_sub1, x)
}
######### myLCS1(x)
allsubstr2 = function(x, max = NULL) {
res = c()
if (is.null(max))
max = nchar(x)
for (i in 1:max) res = c(res, unique(substring(x, 1:(nchar(x) - i + 1),
i:nchar(x))))
res
}
myLCS_sub2 = function(a, b) {
n = min(nchar(a), nchar(b))
result = intersect(allsubstr2(a, n), allsubstr2(b, n))
if (length(result) == 0)
return(NULL)
result[which.max(nchar(result))]
}
myLCS2 = function(x) {
Reduce(myLCS_sub2, x)
}
# @ mh -----
mhLCS_sub = function(a, b) {
if (nchar(a) < nchar(b)) {
tmp = a
a = b
b = tmp
}
a.vec = unlist(strsplit(a, ""))
b.vec = unlist(strsplit(b, ""))
res = c()
for (i in 1:length(a.vec)) {
for (j in which(b.vec == a.vec[i])) {
k = kk = 0
while (kk == 0) {
kk = 1
if ((i + k + 1) <= length(a.vec)) {
if ((j + k + 1) <= length(b.vec)) {
if (identical(a.vec[i:(i + k + 1)], b.vec[j:(j + k + 1)])) {
k = k + 1
kk = 0
}
}
}
}
res = c(res, Reduce(paste0, a.vec[i:(i + k)]))
}
}
res[which.max(nchar(res))]
}
mhLCS = function(x) {
Reduce(mhLCS_sub, x)
}
###### mhLCS##########
# library(magrittr)
my_str <- function(x) {
shortest_ix <- which.min(sapply(x, nchar)) # find shortest charater set (index)
shortest_x <- x[shortest_ix] # find shortest charater set
s_x_splt <- unlist(strsplit(shortest_x, split = "")) # splitstr
# if want to exclude ' ', then include below line. s_x_splt[s_x_splt=='
# ']<-'&' # This is to remove ' '. This works only for simple case...
x <- x[-shortest_ix] # remove shortest one
# this function compares splited shortest character vs the other one (use
# match function)
# mf<-function(s_x_splt,x_i){
# return(match(s_x_splt,unlist(strsplit(x_i,split = '')))) } # use sapply.
# This takes large Matrix O(shortest dim x data number)
# na_mat<-sapply(seq(1,length(x)),function(i){mf(s_x_splt,x[i])}) below
# na_mat ~ code is equivalent to above 5 lines.
na_mat <- sapply(seq(1, length(x)), function(i) {
match(s_x_splt, unlist(strsplit(x[i], split = "")))
})
remove_na <- rowMeans(na_mat) # find common strings (non-na)
# remove_01<-sapply(remove_na,function(x){ifelse(is.na(remove_na[x]),0,1)})
# below is to find the longest substring.. there might be better ways.. like
# rle function
remove_na_bk <- remove_na
remove_na[!is.na(remove_na_bk)] <- 1
remove_na[is.na(remove_na_bk)] <- 0
d_r_na <- diff(remove_na)
change_point <- which(d_r_na == -1)
start_point <- which(d_r_na == 1)
if (remove_na[1] == 1) {
startpoint = c(0, start_point)
}
lengths <- change_point - start_point
lenn <- lengths[which.max(lengths)]
endp <- change_point[which.max(lengths)]
output <- s_x_splt[(endp - lenn + 1):(endp)]
return(paste(output, collapse = ""))
}
x = c("나는 정말로 행복합니다.", "너도 행복하니 정말로?", "이곳은 정말로 행복한 세상",
"너는 정말로 행복하니?", "그곳은 정말로 행복한가요?")
x <- paste0(1:100, x)
# x=paste0(1:100,'행복',100:1)
#
library(microbenchmark)
res <- microbenchmark(myLCS(x), myLCS2(x), mhLCS(x), my_str(x))
library(ggplot2)
print(res)
## Unit: microseconds
## expr min lq mean median uq max
## myLCS(x) 22238.621 23554.827 24327.3778 24088.310 24892.665 34774.49
## myLCS2(x) 10873.077 11682.991 12462.0205 12066.691 12734.933 20896.13
## mhLCS(x) 5411.149 5855.843 6560.3561 6194.773 6901.476 23761.40
## my_str(x) 640.901 737.802 974.6081 779.868 825.690 15829.32
## neval
## 100
## 100
## 100
## 100
autoplot(res)