1. 개요

2. 영화평 데이터 설명

Ignoring unknown parameters: binwidth, bins, pad

3. Pre-trained Vectors 설명

4. 결과

      test.y
y.pred   0   1   2   3   4   5   6   7   8   9
     0 682 377 326 253 210 107  95  92  82 143
     1 249 294 309 241 200 145  81  67  63  52
     2 169 266 248 231 189 151  88  53  39  43
     3  94 211 192 252 241 168 126  65  42  27
     4  55  89 121 114 126 146  93  52  32  25
     5 101 155 214 250 278 351 314 187  85  41
     6  38  67  84 127 144 241 258 212 141  70
     7  44  66  83 106 133 226 343 444 330 177
     8  75  71 101 101 122 153 257 382 489 370
     9 407 340 262 288 302 244 296 432 587 936
# Hand-written sample reviews used as a quick sanity check of the trained model.
test.sentence <- c("잼나요흥미있고 즐거운시간이었어요", 
                   "재미없다",
                   "쓰레기 영화",
                   "볼만함",
                   "스토리 구성이 억지스럽다." )
# `multiple.input.pipeline` is defined outside this excerpt; presumably it turns
# the raw sentences into index rows of width `max.seq` using the word
# dictionary `dict` -- TODO confirm.
tmp <- multiple.input.pipeline(test.sentence, dict, max.seq)
# `dict` is passed as a value on the line above and called as a function below;
# if a non-function object named `dict` exists, R still resolves the function in
# call position. dropout.p = 1.0 is fed as keep_prob, so no units are dropped
# while predicting.
sess$run(y_pred, feed_dict=dict(X=tmp, dropout.p=1.0))
[1] 9 0 0 9 2

실제로 임의의 영화평을 만들어 넣어 보니, 학습이 어느 정도 되고 있는 것 같다.

5. 결론

  1. 어느 정도 인터넷 용어들이 들어가 있긴 하지만, 여전히 많은 단어들이 누락(Missing)되어 있다
  2. 현재로서는 다른 선택지가 없으니, 한동안 한국어 NLP 처리 시에는 그냥 Pretrained Vectors를 사용하게 될 것 같다

6. Appendix

library(tensorflow)
# Drop any stale RNG state so R initialises a fresh seed on the next random draw.
# Guarded with exists(): rm() warns when `.Random.seed` is absent, which is the
# case in a fresh session where no random numbers have been drawn yet.
if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
    rm(list = ".Random.seed", envir = globalenv())
}

# Training hyperparameters (the L suffix keeps the counts as integers).
batch.size <- 128L   # examples drawn per training step
max.step <- 500L     # number of training steps
print.every <- 10L   # log loss/accuracy every N steps
hidden.dim <- 64L    # width of the LSTM cell and of the dense layer
dropout <- 0.5       # fed as keep_prob to the dropout layer while training

# Clear the default graph before defining the model.
tf$reset_default_graph()
# Inputs: X is int64 with shape (NULL, max.seq) and y is int64 with shape (NULL);
# presumably X holds word-index rows and y the rating labels -- TODO confirm
# (`max.seq` is defined outside this excerpt).
X <- tf$placeholder(tf$int64, shape(NULL, max.seq))
y <- tf$placeholder(tf$int64, shape(NULL))
# Fed as keep_prob to the dropout layer further down.
dropout.p <- tf$placeholder(tf$float32)

# `embedding_m` is built outside this excerpt.
embed.W <- tf$Variable(embedding_m) # pretrained vectors
after.embed <- tf$nn$embedding_lookup(embed.W, X)
# Cast so the embedded input matches the float32 dtype requested from
# dynamic_rnn; presumably embedding_m arrives in double precision -- TODO confirm.
after.embed <- tf$cast(after.embed, tf$float32)
# Per-example sequence length for a padded batch, returned as an int32 tensor.
#
# A time step counts as used when the largest absolute value along axis 2 is
# non-zero; the used steps are then summed along axis 1.
# Assumes padding steps are all-zero vectors in `input` -- TODO confirm that the
# padding index maps to a zero row of the embedding matrix.
# Technique: https://danijar.com/variable-sequence-lengths-in-tensorflow/
get.length <- function(input) {
    step.magnitude <- tf$reduce_max(tf$abs(input), axis = 2L)
    step.used <- tf$sign(step.magnitude)
    tf$cast(tf$reduce_sum(step.used, axis = 1L), tf$int32)
}
# Single LSTM cell with hidden.dim units, unrolled by dynamic_rnn over the
# embedded input. sequence_length is derived from that same input by
# get.length().
lstm <- tf$contrib$rnn$BasicLSTMCell(hidden.dim)
result <- tf$nn$dynamic_rnn(lstm, after.embed, 
                            dtype = tf$float32, 
                            sequence_length = get.length(after.embed))
# dynamic_rnn returns a pair: the per-step outputs first, the final state second.
output <- result[[1]]
state <- result[[2]]

# Pick, for every example, the RNN output at its last step (position length - 1).
#
# output: RNN outputs; its first two dimensions are read as batch size and
#         maximum sequence length.
# length: per-example sequence lengths.
# Returns the rows gathered from `output` after flattening it to
# (-1, hidden.dim); the feature width is read from the global `hidden.dim`.
#
# NOTE(review): `[0]` / `[1]` rely on 0-based tensor extraction; newer releases
# of the R tensorflow package may default to 1-based -- confirm against the
# installed version.
# NOTE(review): for an example whose length is 0 the computed index is one less
# than its row offset, i.e. it points into the previous example -- confirm that
# empty sequences cannot occur.
last.relevant <- function(output, length) {
    n.examples <- tf$shape(output)[0]
    n.steps <- tf$shape(output)[1]
    # Offset of each example's first step in the flattened output ...
    row.start <- tf$multiply(tf$cast(tf$range(0, n.examples), tf$int32), n.steps)
    # ... plus the position of its last step.
    flat.index <- row.start + tf$subtract(length, 1L)
    flat.output <- tf$reshape(output, c(-1L, hidden.dim))
    tf$gather(flat.output, flat.index)
}

# Feature vector per example: the RNN output selected by last.relevant().
# The trailing `#state$h` on the next line is a commented-out alternative that
# would use the final LSTM state instead.
interest <- last.relevant(output, get.length(after.embed))
output <- interest#state$h
# Classifier head: ReLU dense layer of width hidden.dim, dropout driven by the
# dropout.p placeholder, then a linear layer producing 10 logits.
output <- tf$contrib$layers$fully_connected(output, hidden.dim, activation_fn=tf$nn$relu)
output <- tf$contrib$layers$dropout(output, keep_prob=dropout.p)
output <- tf$contrib$layers$linear(output, 10L)

# Mean sparse softmax cross-entropy between the logits and the integer labels y.
loss <- tf$nn$sparse_softmax_cross_entropy_with_logits(logits=output, labels=y)
loss <- tf$reduce_mean(loss)

# Adam optimiser constructed with no arguments, minimising the mean loss.
train.op <- tf$train$AdamOptimizer()$minimize(loss)

# Predicted class is the argmax over axis 1 of the logits; accuracy is the mean
# of exact matches against y.
y_pred <- tf$argmax(output, axis=1L)
equal <- tf$equal(y_pred, y)
acc <- tf$reduce_mean(tf$cast(equal, tf$float32))

# Open a session and run the global variable initialiser.
sess = tf$InteractiveSession()
init = tf$global_variables_initializer()
sess$run(init)


# Draw one mini-batch without replacement and report which rows are still unused.
#
# X:          matrix-like object with one example per row.
# y:          labels, indexed in parallel with the rows of X.
# batch_size: number of rows per batch.
# batch_idx:  row indices not yet used in the current pass. When fewer than
#             batch_size remain, a fresh permutation of all rows is started
#             (the leftover indices are discarded).
#
# Returns a list with
#   X         the selected rows (kept two-dimensional even for a single row),
#   y         the matching labels,
#   batch_idx the indices still unused, to be passed to the next call,
#   dropout.p the value of the global `dropout`.
batch.generator <- function(X, y, batch_size=32, batch_idx=c()) {
    if (length(batch_idx) < batch_size) {
        n.rows <- nrow(X)
        if (batch_size > n.rows) {
            stop("batch_size (", batch_size, ") exceeds the number of rows in X (",
                 n.rows, ")", call. = FALSE)
        }
        batch_idx <- sample.int(n.rows)
    }
    # Index through sample.int(): sample(x, ...) treats a single number x as 1:x,
    # which would draw the wrong rows when only one index is left.
    idx <- batch_idx[sample.int(length(batch_idx), size = batch_size, replace = FALSE)]

    list(
        X = X[idx, , drop = FALSE],
        y = y[idx],
        batch_idx = setdiff(batch_idx, idx),
        dropout.p = dropout
    )
}

# Training loop: one optimiser step per iteration on a newly drawn mini-batch.
# batch.idx carries the not-yet-used row indices between batch.generator() calls.
batch.idx <- c()
# seq_len() yields an empty sequence when max.step is 0 (1:0 would iterate twice).
for (i in seq_len(max.step)) {
    batch <- batch.generator(train.tensor, train.y, batch.size, batch.idx)
    batch.idx <- batch[['batch_idx']]
    # Run the update and fetch loss and accuracy for the same batch; result[[2]]
    # is the loss and result[[3]] the accuracy.
    result <- sess$run(c(train.op, loss, acc),
                       feed_dict=dict(X=batch[['X']], y=batch[['y']], dropout.p=batch[['dropout.p']]))
    # Log the first step, every print.every-th step and the final step.
    # `||` is the scalar, short-circuiting operator intended for `if` conditions.
    if (i %% print.every == 0 || i == max.step || i == 1) {
        cat(sprintf("\n[Step: %5d] Loss: %f Acc: %f", i, result[[2]], result[[3]]))
    }
}
LS0tCnRpdGxlOiAiRmFjZWJvb2sgUHJldHJhaW5lZCBXb3JkIFZlY3RvcnMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KYGBge3IgaW5jbHVkZV9saWJyYXJ5LCBpbmNsdWRlPUZBTFNFfQpsaWJyYXJ5KERUKQpsaWJyYXJ5KGdncGxvdDIpCmBgYAoKIyMgMS4g6rCc7JqUCi0gRmFjZWJvb2vsl5DshJwg6rO16rCc7ZWcIFtwcmV0cmFpbmVkIHdvcmQgdmVjdG9yXShodHRwczovL2dpdGh1Yi5jb20vZmFjZWJvb2tyZXNlYXJjaC9mYXN0VGV4dCnsnYQg7IKs7Jqp7ZW067O06riwCi0g7JiB7ZmU7Y+JIOyVvSAxMDAsMDAwIOqxtOyXkCDrjIDtlZwgQ2xhc3NpZmljYXRpb24g66eM65Ok7Ja067O06riwCi0g7Iuk7KCcIOyduO2EsOuEt+yXkOyEnCDrrLjrspXsl5Ag66ee7KeAIOyViuydgCDri6jslrTrk6Tsl5Ag64yA7ZW0IOuwlOuhnCDsgqzsmqnsnbQg6rCA64ql7ZWc7KeAIO2ZleyduO2VtOuzuOuLpAoKIyMgMi4g7JiB7ZmU7Y+JIOuNsOydtO2EsCDshKTrqoUKLSDsoITsspjrpqzripQg7ZWc6riA66eMIOuCqOq4sOuKlCDsqr3snLzroZwg7LWc7IaM7ZmUCiAgICAtIOudhOyWtOyTsOq4sCDsoJXqt5ztmZQKICAgIC0g7Iir7J6QIOygnOqxsAogICAgLSDslYztjIzrsrMg7KCc6rGwCi0g7ZWc6riA7JiB7ZmU7Y+JIDEw66eM6rCc66W8IO2BrOuhpOunge2VmOyXrCDspIDruYQKICAgIC0gVHJhaW4g6rO8IFRlc3Qg7IWL7J2AIDg6MuuhnCDrgpjriITslrQg7IKs7JqpCmBgYHtyLCBlY2hvPUZ9CmlkeCA8LSBzYW1wbGUoMTpucm93KG1vdmllLmNvbW1lbnRzKSwgc2l6ZT0xMDAsIHJlcGxhY2U9RikKZGF0YXRhYmxlKG1vdmllLmNvbW1lbnRzW2lkeCwgYygnY29tbWVudCcsICdyYXRpbmcnLCAnbW92aWUnKV0pCmBgYAotIOyYge2ZlCDtj4nsoJAg67aE7Y+sICgxIH4gMTDsoJApCiAgICAtIOqwgeqwgeydmCDsiqTsvZTslrTrs4TroZwg64yA6561IDEwLDAwMOyXrOqwnOyUqSDsobTsnqwKICAgIC0g7Iuk7KCcIO2biOugqOyLnOyXkOuKlCAtMeydhCDrubzspJjshJwgMCB+IDnsoJDsnLzroZwg66eM65Ok7Ja07IScIOyCrOyaqQogICAgICAgIC0gUuyXkOyEnCBUZW5zb3JmbG9364+M66Ck64+EIFB5dGhvbiDsnITsl5DshJwg64+M7JWE6rCA6riwIOuVjOusuOyXkCBQeXRob24gSW5kZXjroZwg66ee7Law7KSY7JW8IO2VmOq4sCDrlYzrrLgKICAgICAgICAtIOydtO2bhOuhnOuKlCAxIH4gMTDsoJAg7Iqk7LyA7J28IOuMgOyLoCAwIH4gOeygkCDsiqTsvIDsnbzroZwg7IKs7JqpCmBgYHtyLCBlY2hvPUYsIG1lc3NhZ2U9Riwgd2FybmluZz1GfQpnIDwtIGdncGxvdChkYXRhPW1vdmllLmNvbW1lbnRzLCBhZXMoeD1yYXRpbmcpKQpnIDwtIGcgKyBnZW9tX2hpc3RvZ3JhbShzdGF0ID0gImNvdW50IikgCmcgPC0gZyArIHhsYWIoIlJhdGluZyIpICsgeWxhYigiIikKZyA8LSBnICsgc2NhbGVfeV9jb250aW51b3VzKGxhYmVscyA9IHNjYWxlczo6Y29tbWEpCmcKYGBgCgoKIyMgMy4gUHJlLXRyYWluZWQgVmVjdG9ycyDshKTrqoUKLSBb
ZmFzdFRleHQgR2l0aHViXShodHRwczovL2dpdGh1Yi5jb20vZmFjZWJvb2tyZXNlYXJjaC9mYXN0VGV4dC9ibG9iL21hc3Rlci9wcmV0cmFpbmVkLXZlY3RvcnMubWQp7JeQ7IScIEtvcmVhbiB0ZXh066W8IOuLpOyatOuhnOuTnAotIOyggOyepeuQmOyWtCDsnojripQg64uo7Ja0IOyImDogYHIgbnJvdyh3b3JkLnZlYylgCi0g64uo7Ja065Ok7J2EIOyCtO2OtOuztOuptCDtlZzquIDrv5Drp4wg7JWE64uI6528IOyYgeyWtCDtlZzsnpAg7J207IOB7ZWcIOq4gOyekOuTpOuPhCDtj6ztlajrkJjslrQg7J6I7J2MCi0gYGRhdGEudGFibGVgIO2MqO2CpOyngCBgZnJlYWRg66GcIOu2iOufrOyYpOuptCDslYTrnpjsmYAg6rCZ7J2AIO2Yle2DnOuhnCDsl7TrprwKYGBge3IsIGVjaG89Rn0KaWR4IDwtIHNhbXBsZSgxOm5yb3cod29yZC52ZWMpLCBzaXplPTUwMCwgcmVwbGFjZSA9IEYpCndvcmQudmVjW2lkeCwgXQpgYGAKCiMjIDQuIOqysOqzvAotIFLsl5DshJwg6riw67O4IExTVE0g66qo642466GcIOyVvSA167aEIOygleuPhCDtlZnsirUKLSBUcmFpbiDsoJXtmZXrj4Q6IGByIHNlc3MkcnVuKGFjYywgZmVlZF9kaWN0PWRpY3QoWD10cmFpbi50ZW5zb3IsIHk9dHJhaW4ueSwgZHJvcG91dC5wPTEuMCkpIGAKLSBUZXN0IOygle2ZleuPhDogYHIgc2VzcyRydW4oYWNjLCBmZWVkX2RpY3Q9ZGljdChYPXRlc3QudGVuc29yLCB5PXRlc3QueSwgZHJvcG91dC5wPTEuMCkpYAotIEJhc2VsaW5lIOygle2ZleuPhDogMTAlICgxLzEwIOuenOuNpCDshKDtg50pCi0g7JiI7Lih7ZW07JW8IO2VmOuKlCDtgbTrnpjsiqTqsIAgMTDqsJzsnbTri6Qg67O064uIIOygle2ZleuPhOqwgCDrgq7qs6Ag7Jik67KE7ZWPIOuQmOyXiOydjAotIDDsoJDqs7wgOeygkOydmCDsoJXtmZXrj4TripQg6re465+t7KCA65+tIOq0nOywruydgOuNsCDspJHqsIQg6rWs7Jet7JeQIOuMgO2VnCDsmIjsuKHsnbQg7Ja066Ck7JuMIOygle2ZleuPhOqwgCDrgq7snYwKYGBge3IsIGV2YWw9VCwgZWNobz1GfQp5LnByZWQgPC0gc2VzcyRydW4oeV9wcmVkLCBmZWVkX2RpY3Q9ZGljdChYPXRlc3QudGVuc29yLCBkcm9wb3V0LnA9MS4wKSkKdGFibGUoeS5wcmVkLCB0ZXN0LnkpCmBgYAoKYGBge3IsIGV2YWw9VCwgZWNobz1UfQp0ZXN0LnNlbnRlbmNlIDwtIGMoIuyevOuCmOyalO2dpeuvuOyeiOqzoCDsppDqsbDsmrTsi5zqsITsnbTsl4jslrTsmpQiLCAKICAgICAgICAgICAgICAgICAgICLsnqzrr7jsl4bri6QiLAogICAgICAgICAgICAgICAgICAgIuyTsOugiOq4sCDsmIHtmZQiLAogICAgICAgICAgICAgICAgICAgIuuzvOunjO2VqCIsCiAgICAgICAgICAgICAgICAgICAi7Iqk7Yag66asIOq1rOyEseydtCDslrXsp4DsiqTrn73ri6QuIiApCnRtcCA8LSBtdWx0aXBsZS5pbnB1dC5waXBlbGluZSh0ZXN0LnNlbnRlbmNlLCBkaWN0LCBtYXguc2VxKQpzZXNzJHJ1bih5X3ByZWQsIGZlZWRfZGljdD1kaWN0KFg9dG1wLCBkcm9wb3V0LnA9MS4wKSkKYGBgCuyLpOygnOuhnCDs
lYTrrLQg7JiB7ZmU7Y+J7J2EIOunjOuTpOyWtOyEnCDsp5HslrTrhKPslrTrs7Tri4gg7ZWZ7Iq17J20IOuQmOuKlOykkeydtOq4tCDtlZwg6rKDIOqwmeuLpC4KCiMjIDUuIOqysOuhoAoxLiDslrTripDsoJXrj4Qg7J247YSw64S3IOyaqeyWtOuTpOydtCDrk6TslrTqsIDsnojquLQg7ZWc642wIOq3uOuemOuPhCDrp47snYAg64uo7Ja065Ok7J20IOyXrOyghO2eiCBNaXNzaW5nCjIuIO2VnOuPmeyViCDtlZzqta3slrQgTkxQIOyymOumrOyLnOyXkOuKlCDtmITsnqzroZzshJzripQg7ISg7YOd7KeA6rCAIOyXhuycvOuLiCDqt7jrg6UgUHJldHJhaW5lZCBWZWN0b3Jz66W8IOyCrOyaqe2VmOqyjCDrkKAg6rKDIOqwmeuLpAoKIyMgNi4gQXBwZW5kaXgKLSBUZW5zb3JmbG93IOy9lOuTnApgYGB7ciBldmFsPUZBTFNFfQpsaWJyYXJ5KHRlbnNvcmZsb3cpCnJtKGxpc3Q9Ii5SYW5kb20uc2VlZCIsIGVudmlyPWdsb2JhbGVudigpKSAKYmF0Y2guc2l6ZSA9IDEyOEwKbWF4LnN0ZXAgPSA1MDBMCnByaW50LmV2ZXJ5ID0gMTBMCmhpZGRlbi5kaW0gPSA2NEwKZHJvcG91dCA9IDAuNQoKdGYkcmVzZXRfZGVmYXVsdF9ncmFwaCgpClggPC0gdGYkcGxhY2Vob2xkZXIodGYkaW50NjQsIHNoYXBlKE5VTEwsIG1heC5zZXEpKQp5IDwtIHRmJHBsYWNlaG9sZGVyKHRmJGludDY0LCBzaGFwZShOVUxMKSkKZHJvcG91dC5wIDwtIHRmJHBsYWNlaG9sZGVyKHRmJGZsb2F0MzIpCgplbWJlZC5XIDwtIHRmJFZhcmlhYmxlKGVtYmVkZGluZ19tKSAjIHByZXRyYWluZWQgdmVjdG9ycwphZnRlci5lbWJlZCA8LSB0ZiRubiRlbWJlZGRpbmdfbG9va3VwKGVtYmVkLlcsIFgpCmFmdGVyLmVtYmVkIDwtIHRmJGNhc3QoYWZ0ZXIuZW1iZWQsIHRmJGZsb2F0MzIpCmdldC5sZW5ndGggPC0gZnVuY3Rpb24oaW5wdXQpIHsKICAgICMgaHR0cHM6Ly9kYW5pamFyLmNvbS92YXJpYWJsZS1zZXF1ZW5jZS1sZW5ndGhzLWluLXRlbnNvcmZsb3cvCiAgICB1c2VkIDwtIHRmJHNpZ24odGYkcmVkdWNlX21heCh0ZiRhYnMoaW5wdXQpLCBheGlzID0gMkwpKQogICAgbGVuZ3RoIDwtIHRmJHJlZHVjZV9zdW0odXNlZCwgYXhpcyA9IDFMKQogICAgbGVuZ3RoIDwtIHRmJGNhc3QobGVuZ3RoLCB0ZiRpbnQzMikKICAgIGxlbmd0aAp9CmxzdG0gPC0gdGYkY29udHJpYiRybm4kQmFzaWNMU1RNQ2VsbChoaWRkZW4uZGltKQpyZXN1bHQgPC0gdGYkbm4kZHluYW1pY19ybm4obHN0bSwgYWZ0ZXIuZW1iZWQsIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgZHR5cGUgPSB0ZiRmbG9hdDMyLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlcXVlbmNlX2xlbmd0aCA9IGdldC5sZW5ndGgoYWZ0ZXIuZW1iZWQpKQpvdXRwdXQgPC0gcmVzdWx0W1sxXV0Kc3RhdGUgPC0gcmVzdWx0W1syXV0KCmxhc3QucmVsZXZhbnQgPC0gZnVuY3Rpb24ob3V0cHV0LCBsZW5ndGgpIHsKICAgIGJhdGNoX3NpemUgPSB0ZiRzaGFwZShvdXRwdXQpWzBdCiAgICBtYXhfbGVuZ3RoID0gdGYkc2hhcGUo
b3V0cHV0KVsxXQogICAgb3V0X3NpemUgPSBoaWRkZW4uZGltCiAgICBpbmRleCA9IHRmJG11bHRpcGx5KHRmJGNhc3QodGYkcmFuZ2UoMCwgYmF0Y2hfc2l6ZSksIHRmJGludDMyKSwgbWF4X2xlbmd0aCkgKyB0ZiRzdWJ0cmFjdChsZW5ndGgsIDFMKQogICAgZmxhdCA9IHRmJHJlc2hhcGUob3V0cHV0LCBjKC0xTCwgb3V0X3NpemUpKQogICAgdGYkZ2F0aGVyKGZsYXQsIGluZGV4KQp9CgppbnRlcmVzdCA8LSBsYXN0LnJlbGV2YW50KG91dHB1dCwgZ2V0Lmxlbmd0aChhZnRlci5lbWJlZCkpCm91dHB1dCA8LSBpbnRlcmVzdCNzdGF0ZSRoCm91dHB1dCA8LSB0ZiRjb250cmliJGxheWVycyRmdWxseV9jb25uZWN0ZWQob3V0cHV0LCBoaWRkZW4uZGltLCBhY3RpdmF0aW9uX2ZuPXRmJG5uJHJlbHUpCm91dHB1dCA8LSB0ZiRjb250cmliJGxheWVycyRkcm9wb3V0KG91dHB1dCwga2VlcF9wcm9iPWRyb3BvdXQucCkKb3V0cHV0IDwtIHRmJGNvbnRyaWIkbGF5ZXJzJGxpbmVhcihvdXRwdXQsIDEwTCkKCmxvc3MgPC0gdGYkbm4kc3BhcnNlX3NvZnRtYXhfY3Jvc3NfZW50cm9weV93aXRoX2xvZ2l0cyhsb2dpdHM9b3V0cHV0LCBsYWJlbHM9eSkKbG9zcyA8LSB0ZiRyZWR1Y2VfbWVhbihsb3NzKQoKdHJhaW4ub3AgPC0gdGYkdHJhaW4kQWRhbU9wdGltaXplcigpJG1pbmltaXplKGxvc3MpCgp5X3ByZWQgPC0gdGYkYXJnbWF4KG91dHB1dCwgYXhpcz0xTCkKZXF1YWwgPC0gdGYkZXF1YWwoeV9wcmVkLCB5KQphY2MgPC0gdGYkcmVkdWNlX21lYW4odGYkY2FzdChlcXVhbCwgdGYkZmxvYXQzMikpCgpzZXNzID0gdGYkSW50ZXJhY3RpdmVTZXNzaW9uKCkKaW5pdCA9IHRmJGdsb2JhbF92YXJpYWJsZXNfaW5pdGlhbGl6ZXIoKQpzZXNzJHJ1bihpbml0KQoKCmJhdGNoLmdlbmVyYXRvciA8LSBmdW5jdGlvbihYLCB5LCBiYXRjaF9zaXplPTMyLCBiYXRjaF9pZHg9YygpKSB7CiAgICBpZiAobGVuZ3RoKGJhdGNoX2lkeCkgPCBiYXRjaF9zaXplKSB7CiAgICAgICAgYmF0Y2hfaWR4IDwtIHNhbXBsZSgxOm5yb3coWCkpCiAgICB9IAogICAgaWR4IDwtIHNhbXBsZShiYXRjaF9pZHgsIHNpemUgPSBiYXRjaF9zaXplLCByZXBsYWNlID0gRikKICAgIG5leHRfaWR4IDwtIHNldGRpZmYoYmF0Y2hfaWR4LCBpZHgpCiAgICAKICAgIHJlc3VsdCA8LSBsaXN0KCkKICAgIHJlc3VsdFtbJ1gnXV0gPC0gWFtpZHgsIF0KICAgIHJlc3VsdFtbJ3knXV0gPC0geVtpZHhdCiAgICByZXN1bHRbWydiYXRjaF9pZHgnXV0gPC0gbmV4dF9pZHgKICAgIHJlc3VsdFtbJ2Ryb3BvdXQucCddXSA8LSBkcm9wb3V0CiAgICByZXN1bHQKfQoKYmF0Y2guaWR4IDwtIGMoKQpmb3IoaSBpbiAxOm1heC5zdGVwKSB7CiAgICBiYXRjaCA8LSBiYXRjaC5nZW5lcmF0b3IodHJhaW4udGVuc29yLCB0cmFpbi55LCBiYXRjaC5zaXplLCBiYXRjaC5pZHgpCiAgICBiYXRjaC5pZHggPC0gYmF0Y2hbWydiYXRjaF9pZHgnXV0KICAgIHJlc3VsdCA8LSBzZXNzJHJ1bihjKHRyYWluLm9wLCBsb3Nz
LCBhY2MpLCAKICAgICAgICAgICAgICAgICAgICAgICBmZWVkX2RpY3Q9ZGljdChYPWJhdGNoW1snWCddXSwgeT1iYXRjaFtbJ3knXV0sIGRyb3BvdXQucD1iYXRjaFtbJ2Ryb3BvdXQucCddXSkpCiAgICBpZiAoaSAlJSBwcmludC5ldmVyeSA9PSAwIHwgaSA9PSBtYXguc3RlcCB8IGkgPT0gMSkgewogICAgICAgIGNhdChzcHJpbnRmKCJcbltTdGVwOiAlNWRdIExvc3M6ICVmIEFjYzogJWYiLCBpLCByZXN1bHRbWzJdXSwgcmVzdWx0W1szXV0pKQogICAgfQp9CmBgYA==