# Params
# Path to the downsampled kid-book corpus, built with here().
KID_CORPUS_PATH <- here("data/processed/other/downsampled_kidbook.txt")

# Hyperparameter grids, four levels each. They are positional inputs to
# train_model(): minimum word count, vector size, window size, epochs.
MIN_WORD_COUNT <- seq(5, 20, by = 5)
VECTOR_SIZE <- seq(200, 500, by = 100)
WINDOW_SIZE <- seq(5, 20, by = 5)
EPOCHS <- seq(5, 20, by = 5)
# Train a fastText skip-gram model for one hyperparameter combination.
#
# The corpus is written to a temporary file and handed to execute(). Every
# file created by this call is deleted again afterwards, so the trained model
# itself is not kept; only what execute() prints while training survives.
#
# params: list-like of four values, read by position:
#   [[1]] minimum word count (-minCount)
#   [[2]] vector size (-dim)
#   [[3]] window size (-ws)
#   [[4]] number of epochs (-epoch)
# corpus: character vector written to the training file, one element per line.
#
# Returns the paths created by this call that could not be deleted, i.e.
# character(0) when cleanup succeeds.
train_model <- function(params, corpus) {
  min_word_count <- params[[1]]
  vector_size <- params[[2]]
  window_size <- params[[3]]
  epoch_size <- params[[4]]

  tmp_file_txt <- tempfile()
  tmp_file_model <- tempfile()

  # Files owned by this call: the corpus file plus anything written with
  # tmp_file_model as its name prefix (the model path is passed as "-output").
  # Only these are removed; unrelated files in the session temp directory are
  # left untouched.
  own_files <- function() {
    in_dir <- list.files(dirname(tmp_file_model), full.names = TRUE)
    from_model <- in_dir[startsWith(basename(in_dir), basename(tmp_file_model))]
    found <- unique(c(tmp_file_txt, from_model))
    found[file.exists(found)]
  }
  # Guarantee cleanup even when writing the corpus or training fails.
  on.exit(unlink(own_files()), add = TRUE)

  writeLines(text = corpus, con = tmp_file_txt)

  # Numeric settings are coerced to character by c() alongside the flag names.
  execute(commands = c("skipgram",
                       # set the input and output files
                       "-input", tmp_file_txt,
                       "-output", tmp_file_model,
                       # set the window size if needed, default is 5
                       "-ws", window_size,
                       # min length of char ngram, default is 3
                       "-minn", 1,
                       # max length of char ngram, default is 6; minn and maxn
                       # are both set to 1 with the intent of not using ngrams
                       # NOTE(review): fastText appears to disable subword
                       # ngrams with maxn = 0; 1/1 may still use
                       # single-character ngrams -- confirm intent
                       "-maxn", 1,
                       # minimal number of word occurrences, default is 5,
                       # can set to 100 instead
                       "-minCount", min_word_count,
                       # max length of word ngram, default is 1
                       # "-wordNgrams", 1,
                       # number of epochs, default is 5
                       "-epoch", epoch_size,
                       # set the number of dimensions, default is 100
                       "-dim", vector_size,
                       "-verbose", 1))

  # Remove this call's files now so the return value can report leftovers;
  # the on.exit handler above then has nothing left to do.
  unlink(own_files())
  own_files()
}
# Read the corpus file line by line, split every line on single spaces and
# flatten the pieces into one character vector of tokens.
corpus <- unlist(str_split(read_lines(KID_CORPUS_PATH), " "))
print(MIN_WORD_COUNT)
## [1] 5 10 15 20
# Sweep the minimum word count while the other three hyperparameters stay at
# the first value of their grids; train_model() runs once per combination.
map(
  cross(list(
    MIN_WORD_COUNT,
    VECTOR_SIZE[1],
    WINDOW_SIZE[1],
    EPOCHS[1]
  )),
  train_model,
  corpus
)
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71134 lr: 0.000000 loss: 2.972194 ETA: 0h 0m
##
## Read 0M words
## Number of words: 1771
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71346 lr: 0.000000 loss: 1.808053 ETA: 0h 0m
##
## Read 0M words
## Number of words: 1294
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71568 lr: 0.000000 loss: 2.608432 ETA: 0h 0m
##
## Read 0M words
## Number of words: 991
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 70858 lr: 0.000000 loss: 3.175025 ETA: 0h 0m
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)
print(VECTOR_SIZE)
## [1] 200 300 400 500
# Sweep the vector size while the other three hyperparameters stay at the
# first value of their grids; train_model() runs once per combination.
map(
  cross(list(
    MIN_WORD_COUNT[1],
    VECTOR_SIZE,
    WINDOW_SIZE[1],
    EPOCHS[1]
  )),
  train_model,
  corpus
)
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71311 lr: 0.000000 loss: 3.160637 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 70930 lr: 0.000000 loss: 2.481266 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71255 lr: 0.000000 loss: 2.826568 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 70764 lr: 0.000000 loss: 2.099673 ETA: 0h 0m
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)
print(WINDOW_SIZE)
## [1] 5 10 15 20
# Sweep the window size while the other three hyperparameters stay at the
# first value of their grids; train_model() runs once per combination.
map(
  cross(list(
    MIN_WORD_COUNT[1],
    VECTOR_SIZE[1],
    WINDOW_SIZE,
    EPOCHS[1]
  )),
  train_model,
  corpus
)
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71320 lr: 0.000000 loss: 3.372495 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 70974 lr: 0.000000 loss: 2.403857 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71221 lr: 0.000000 loss: 3.012125 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71241 lr: 0.000000 loss: 2.567020 ETA: 0h 0m
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)
print(EPOCHS)
## [1] 5 10 15 20
# Sweep the number of epochs while the other three hyperparameters stay at
# the first value of their grids; train_model() runs once per combination.
map(
  cross(list(
    MIN_WORD_COUNT[1],
    VECTOR_SIZE[1],
    WINDOW_SIZE[1],
    EPOCHS
  )),
  train_model,
  corpus
)
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 71463 lr: 0.000000 loss: 3.042435 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 140860 lr: 0.000000 loss: 2.129227 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 207707 lr: 0.000000 loss: 1.515688 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 275790 lr: 0.000000 loss: 1.433119 ETA: 0h 0m
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)