回覆 https://www.ptt.cc/bbs/R_Language/M.1514527291.A.C47.html

改自原問題的程式碼

產生測試資料

library(magrittr)

p.combination = character(1000)
for(i in 1:length(p.combination)){
  p.combination[i] = LETTERS[sample(1:26,5)] %>% paste0(collapse = ", ")
}

p.com.allowed = character(1000)
for(i in 1:length(p.com.allowed)){
  p.com.allowed[i] = LETTERS[sample(1:26,5)] %>% paste0(collapse = ", ")
}

data = data.frame(
  stringsAsFactors = FALSE,
  p.combination = p.combination, 
  p.com.allowed = p.com.allowed)
player = LETTERS[1:26]

解決問題

input.matrix0 = function(data, player, off){
  X = matrix(ncol = length(player), nrow = dim(data)[1])
  for(i in 1:dim(data)[1]){
    if(off) {
      colnames(X) = paste0("O_",player)
      coding = 1
      pp = data$p.combination
    } else {
      colnames(X) = paste0("D_",player)
      coding = -1
      pp = data$p.com.allowed
    }
    player.temp = pp[i] %>% gsub(", ", "|",.)
    index = grep(player.temp, player)
    X[i,index] = coding
    X[i,-index] = 0
  }
  return(X)
}

mowgur = function(data, player){
  X.off = input.matrix0(data, player, T)
  X.def = input.matrix0(data, player, F)
  return(cbind(X.off, X.def))
}

tan800630 的方法

w2long=function(array){
    lapply(c(1:length(array)),function(i) {
    data.table("id"=i,"player"=as.character(array[i]) %>%
    strsplit(", ") %>% .[[1]])
    }) %>% rbindlist()
}
tan800630 <- function(data, player) {
  out=merge(
  w2long(data$p.combination) %>% mutate(show=1) %>%
  dcast(id~player,value.var="show"),
  w2long(data$p.com.allowed) %>% mutate(show=-1) %>%
  dcast(id~player,value.var="show"),by="id",suffix=c("_O","_D")
  )
  out[is.na(out)]=0
  out
}

改自Andrew 的方法

library(magrittr)
library(plyr)
library(data.table)

andrew <- function(data, name.player) {
  colnames(data) <- c("p.attack", "p.defence")
  dt <- data.table(data)
  out.attack <-
    strsplit(dt$p.attack, ", ") %>%
    sapply(., function(x) {
      name.player %in% x
    }) %>%
    t %>%
    set_colnames(paste0("att_", name.player)) %>%
    mapvalues(., c(T, F), c(1L, 0L))
  out.defence <-
    strsplit(dt$p.attack, ", ") %>%
    sapply(., function(x) {
      name.player %in% x
    }) %>%
    t %>%
    set_colnames(paste0("def_", name.player)) %>%
    mapvalues(., c(T, F), c(-1L, 0L))
  cbind(out.attack, out.defence)
}

text2vec + sparse matrix

library(Matrix)
library(text2vec)
wush <- function(data, player) {
  it <- itoken(data[[1]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
  it2 <- itoken(data[[2]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
  vocab <- create_vocabulary(player)
  vectorizer <- vocab_vectorizer(vocab)
  m1 <- create_dtm(it, vectorizer)
  m2 <- create_dtm(it2, vectorizer)
  m2@x[] <- -1
  cbind(m1, m2)
}

Benchmark

library(rbenchmark)
result <- benchmark(
  replications = 10,
  out1 <- mowgur(data, player),
  out2 <- tan800630(data, player),
  out3 <- andrew(data, player),
  out4 <- wush(data, player)
)
knitr::kable(result)
test replications elapsed relative user.self sys.self user.child sys.child
out1 <- mowgur(data, player) 10 4.050 30.916 4.032 0.008 0 0
out2 <- tan800630(data, player) 10 8.272 63.145 8.204 0.024 0 0
out3 <- andrew(data, player) 10 0.131 1.000 0.128 0.000 0 0
out4 <- wush(data, player) 10 0.490 3.740 0.456 0.028 0 0
out2 <- out2[,-1]
colnames(out1) <- colnames(out2) <- colnames(out3) <- colnames(out4) <- rep(player, 2)
which(out1[1,] == 1)

D P Q S T 4 16 17 19 20

which(out2[1,] == 1)

[1] 4 16 17 19 20

which(out3[1,] == 1)

D P Q S T 4 16 17 19 20

which(out4[1,] == 1)

D P Q S T 4 16 17 19 20

which(out1[1,] == -1)

D I N X Z 30 35 40 50 52

which(out2[1,] == -1)

[1] 30 35 40 50 52

which(out3[1,] == -1)

D P Q S T 30 42 43 45 46

which(out4[1,] == -1)

D I N X Z 30 35 40 50 52

Benchmark bigger matrix with parallelized text2vec iterator

library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoParallel(8)
wush <- function(data, player) {
  it <- itoken_parallel(data[[1]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
  it2 <- itoken_parallel(data[[2]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
  vocab <- create_vocabulary(player)
  vectorizer <- vocab_vectorizer(vocab)
  m1 <- create_dtm(it, vectorizer)
  m2 <- create_dtm(it2, vectorizer)
  m2@x[] <- -1
  cbind(m1, m2)
}
n <- 1e+5
p.combination = character(n)
for(i in 1:length(p.combination)){
  p.combination[i] = LETTERS[sample(1:26,5)] %>% paste0(collapse = ", ")
}

p.com.allowed = character(n)
for(i in 1:length(p.com.allowed)){
  p.com.allowed[i] = LETTERS[sample(1:26,5)] %>% paste0(collapse = ", ")
}

data = data.frame(
  stringsAsFactors = FALSE,
  p.combination = p.combination, 
  p.com.allowed = p.com.allowed)
player = LETTERS[1:26]
result2 <- benchmark(
  replications = 10,
#  out1 <- mowgur(data, player),
#  out2 <- tan800630(data, player),
  out3 <- andrew(data, player),
  out4 <- wush(data, player)
)
knitr::kable(result2)
test replications elapsed relative user.self sys.self user.child sys.child
out3 <- andrew(data, player) 10 16.916 1.528 16.78 0.128 0.000 0.000
out4 <- wush(data, player) 10 11.071 1.000 2.04 1.556 39.584 7.348
sparse 比較省記憶體
object.size(out3)
## 20803320 bytes
object.size(out4)
## 13283376 bytes