1. 개요

2. 영화평 데이터 설명

Ignoring unknown parameters: binwidth, bins, pad

3. Pre-trained Vectors 설명

4. 결과

      test.y
y.pred   0   1   2   3   4   5   6   7   8   9
     0 682 377 326 253 210 107  95  92  82 143
     1 249 294 309 241 200 145  81  67  63  52
     2 169 266 248 231 189 151  88  53  39  43
     3  94 211 192 252 241 168 126  65  42  27
     4  55  89 121 114 126 146  93  52  32  25
     5 101 155 214 250 278 351 314 187  85  41
     6  38  67  84 127 144 241 258 212 141  70
     7  44  66  83 106 133 226 343 444 330 177
     8  75  71 101 101 122 153 257 382 489 370
     9 407 340 262 288 302 244 296 432 587 936
# Hand-written reviews used as a quick sanity check of the trained model.
test.sentence <- c(
    "잼나요흥미있고 즐거운시간이었어요",
    "재미없다",
    "쓰레기 영화",
    "볼만함",
    "스토리 구성이 억지스럽다."
)

# Turn the raw sentences into model input with the pipeline helper
# (defined outside this section).
tmp <- multiple.input.pipeline(test.sentence, dict, max.seq)

# `dropout.p` is wired to `keep_prob` of the dropout layer, so feeding 1.0
# keeps every unit, i.e. dropout is switched off for prediction.
sess$run(
    y_pred,
    feed_dict = dict(X = tmp, dropout.p = 1.0)
)
[1] 9 0 0 9 2

실제로 임의의 영화평을 직접 만들어 넣어 보니, 학습이 어느 정도 되고 있는 것으로 보인다.

5. 결론

  1. 어느 정도 인터넷 용어들이 들어가 있긴 하지만, 여전히 많은 단어들이 Missing 상태다.
  2. 현재로서는 다른 선택지가 없으니, 한동안 한국어 NLP 처리 시에는 그냥 Pre-trained Vectors를 사용하게 될 것 같다.

6. Appendix

library(tensorflow)
# Drop any stored RNG state so R creates a fresh seed on the next random draw.
# Guarded because rm() emits a warning when `.Random.seed` does not exist yet
# (e.g. in a fresh session where no random number has been drawn).
if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
    rm(list = ".Random.seed", envir = globalenv())
}

# Hyper-parameters ----
batch.size <- 128L   # examples fed per training step
max.step <- 500L     # number of training steps
print.every <- 10L   # log loss/accuracy every this many steps
hidden.dim <- 64L    # LSTM units and width of the fully connected layer
dropout <- 0.5       # value fed to `dropout.p` (used as keep_prob) in training

# Graph inputs ----
tf$reset_default_graph()
# X: int64 indices into the embedding matrix, one row of `max.seq` entries per
# example; y: int64 class labels; dropout.p: scalar fed at run time.
X <- tf$placeholder(tf$int64, shape(NULL, max.seq))
y <- tf$placeholder(tf$int64, shape(NULL))
dropout.p <- tf$placeholder(tf$float32)

# Embedding ----
embed.W <- tf$Variable(embedding_m) # pretrained vectors
after.embed <- tf$nn$embedding_lookup(embed.W, X)
# Cast so the embedded input matches the float32 dtype used by the RNN below.
after.embed <- tf$cast(after.embed, tf$float32)
# Count, per sequence, how many time steps are actually used.
#
# A step counts as used when the largest absolute value along axis 2 is
# non-zero (sign() maps it to 1, otherwise 0); the flags are then summed along
# axis 1 and returned as an int32 tensor.
# Technique: https://danijar.com/variable-sequence-lengths-in-tensorflow/
# NOTE(review): assumes `input` is (batch, time, features) and that padding
# steps are all-zero vectors - confirm against how `embedding_m` is built.
get.length <- function(input) {
    step.peak <- tf$reduce_max(tf$abs(input), axis = 2L)
    is.used <- tf$sign(step.peak)
    tf$cast(tf$reduce_sum(is.used, axis = 1L), tf$int32)
}
# Recurrent layer: one LSTM cell with `hidden.dim` units, unrolled over the
# embedded input. `sequence_length` is taken from get.length() so each example
# is processed only up to its own length.
lstm <- tf$contrib$rnn$BasicLSTMCell(num_units = hidden.dim)
result <- tf$nn$dynamic_rnn(
    cell = lstm,
    inputs = after.embed,
    sequence_length = get.length(after.embed),
    dtype = tf$float32
)

# dynamic_rnn returns a pair: per-step outputs first, final state second.
output <- result[[1]]
state <- result[[2]]

# Select, for every sequence in the batch, the RNN output at its last used
# time step.
#
# output: RNN outputs; reshaped here into rows of `hidden.dim` values
#         (presumably (batch, time, hidden.dim) - confirm).
# length: int32 tensor with the number of used steps per sequence, e.g. the
#         result of get.length().
# Returns the gathered rows, one per sequence.
last.relevant <- function(output, length) {
    # NOTE(review): `[0]` / `[1]` rely on the tensorflow R package's 0-based
    # tensor extraction - confirm for the installed version.
    batch_size <- tf$shape(output)[0]
    max_length <- tf$shape(output)[1]
    out_size <- hidden.dim

    # Row at which each sequence starts once `output` is flattened.
    row_start <- tf$multiply(tf$cast(tf$range(0, batch_size), tf$int32), max_length)
    # Offset of the last used step. A zero-length sequence would yield -1,
    # which points at the previous sequence's last row (or is out of range for
    # the first sequence), so clamp the offset to step 0.
    last_step <- tf$maximum(tf$subtract(length, 1L), 0L)
    index <- row_start + last_step

    flat <- tf$reshape(output, c(-1L, out_size))
    tf$gather(flat, index)
}

# Classification head ----
# Start from the RNN output at each sequence's last used step
# (alternative that was tried: state$h).
interest <- last.relevant(output, get.length(after.embed))

# fully connected (ReLU) -> dropout -> linear projection to 10 classes.
# Inner calls run first, so the layers are created in that order.
output <- tf$contrib$layers$linear(
    tf$contrib$layers$dropout(
        tf$contrib$layers$fully_connected(
            interest, hidden.dim,
            activation_fn = tf$nn$relu
        ),
        keep_prob = dropout.p
    ),
    10L
)

# Loss and optimiser ----
# Mean sparse softmax cross-entropy between the logits and the integer labels.
loss <- tf$reduce_mean(
    tf$nn$sparse_softmax_cross_entropy_with_logits(logits = output, labels = y)
)
train.op <- tf$train$AdamOptimizer()$minimize(loss)

# Metrics ----
# Predicted class is the arg-max over the logits; accuracy is the share of
# predictions equal to the label.
y_pred <- tf$argmax(output, axis = 1L)
equal <- tf$equal(y_pred, y)
acc <- tf$reduce_mean(tf$cast(equal, tf$float32))

# Session ----
sess <- tf$InteractiveSession()
init <- tf$global_variables_initializer()
sess$run(init)


# Draw one mini-batch without replacement, starting a new shuffled pass over
# the rows of `X` whenever fewer than `batch_size` indices are left.
#
# X:          matrix (or data frame) with one example per row.
# y:          labels, indexed by the row numbers of `X`.
# batch_size: number of rows in the batch.
# batch_idx:  row indices not yet served in the current pass; feed back the
#             `batch_idx` element of the previous result (empty to start).
# keep_prob:  value stored in the `dropout.p` element; defaults to the
#             global `dropout`, which is what was always used before.
# Returns a list with elements `X`, `y`, `batch_idx` (indices still unused)
# and `dropout.p`.
batch.generator <- function(X, y, batch_size = 32, batch_idx = c(),
                            keep_prob = dropout) {
    if (length(batch_idx) < batch_size) {
        # Any leftover smaller than one batch is dropped and a new pass begins.
        n <- nrow(X)
        if (n < batch_size) {
            stop("batch_size (", batch_size, ") exceeds the number of rows (",
                 n, ")", call. = FALSE)
        }
        batch_idx <- sample.int(n)
    }

    # Sample positions, not values: sample(x, ...) interprets a length-one
    # numeric `x` as 1:x and would return rows that are not in the pool.
    pick <- sample.int(length(batch_idx), size = batch_size)
    idx <- batch_idx[pick]
    next_idx <- batch_idx[setdiff(seq_along(batch_idx), pick)]

    list(
        # drop = FALSE keeps a one-row batch two-dimensional.
        X = X[idx, , drop = FALSE],
        y = y[idx],
        batch_idx = next_idx,
        dropout.p = keep_prob
    )
}

# Training loop: one mini-batch per step, carrying the unused row indices
# from step to step so each pass over the data is sampled without replacement.
batch.idx <- c()
# seq_len() yields an empty loop when max.step is 0 (1:0 would run twice).
for (i in seq_len(max.step)) {
    batch <- batch.generator(train.tensor, train.y, batch.size, batch.idx)
    batch.idx <- batch[["batch_idx"]]
    # Run one optimiser step and fetch loss and accuracy for the same batch.
    result <- sess$run(
        c(train.op, loss, acc),
        feed_dict = dict(
            X = batch[["X"]],
            y = batch[["y"]],
            dropout.p = batch[["dropout.p"]]
        )
    )
    # Log on the first step, the last step and every `print.every` steps.
    if (i %% print.every == 0 || i == max.step || i == 1) {
        cat(sprintf("\n[Step: %5d] Loss: %f Acc: %f", i, result[[2]], result[[3]]))
    }
}
