# Params
# Location of the down-sampled kid-book corpus, resolved with here().
KID_CORPUS_PATH <- here("data/processed/other/downsampled_kidbook.txt")

# Hyper-parameter grids explored by the sweeps further down.
MIN_WORD_COUNT <- seq(5, 20, by = 5)       # 5, 10, 15, 20
VECTOR_SIZE <- seq(200, 500, by = 100)     # 200, 300, 400, 500
WINDOW_SIZE <- seq(5, 20, by = 5)          # 5, 10, 15, 20
EPOCHS <- c(1, seq(5, 20, by = 5), 50)     # 1, 5, 10, 15, 20, 50
#' Train a fastText skip-gram model for one hyper-parameter combination
#'
#' Writes `corpus` to a temporary text file, runs the fastText `skipgram`
#' command on it through `execute()`, and then removes the temporary files
#' created by this call. Cleanup is registered with `on.exit()`, so it also
#' runs when training fails.
#'
#' @param params List (or vector) read positionally: `[[1]]` minimum word
#'   count, `[[2]]` vector size, `[[3]]` window size, `[[4]]` number of epochs.
#' @param corpus Character vector; each element is written as one line of the
#'   training file.
#' @return `character(0)`, invisibly. The previous implementation returned the
#'   (already emptied) temp-directory listing; the empty vector is kept so
#'   callers that collect the result see the same shape.
train_model <- function(params, corpus) {
  min_word_count <- params[[1]]
  vector_size <- params[[2]]
  window_size <- params[[3]]
  epoch_size <- params[[4]]

  tmp_file_txt <- tempfile()
  tmp_file_model <- tempfile()

  # Remove only what this call created: the corpus file plus every file whose
  # name starts with the model output prefix. Other files in the session temp
  # directory are left untouched.
  on.exit({
    tmp_dir_files <- list.files(dirname(tmp_file_model), full.names = TRUE)
    is_model_file <- startsWith(
      basename(tmp_dir_files),
      basename(tmp_file_model)
    )
    unlink(c(tmp_file_txt, tmp_dir_files[is_model_file]))
  }, add = TRUE)

  # Each element of `corpus` becomes one line of the training file.
  writeLines(text = corpus, con = tmp_file_txt)

  execute(commands = c(
    "skipgram",
    # set the input and output files
    "-input", tmp_file_txt,
    "-output", tmp_file_model,
    # set the window size if needed, default is 5
    "-ws", window_size,
    # min length of char ngram, default is 3
    "-minn", 1,
    # max length of char ngram, default is 6; minn and maxn both as 1 if
    # char ngrams should not be used
    "-maxn", 1,
    # minimal number of word occurrences, default is 5, can set to 100 instead
    "-minCount", min_word_count,
    # max length of word ngram, default is 1
    # "-wordNgrams", 1,
    # learning rate
    "-lr", .01,
    # number of epochs, default is 5
    "-epoch", epoch_size,
    # set the number of dimensions, default is 100
    "-dim", vector_size,
    "-verbose", 1
  ))

  invisible(character(0))
}
# Read the corpus file and flatten it into a single character vector of
# space-separated tokens.
# NOTE(review): train_model() writes this vector with writeLines(), i.e. one
# token per line -- confirm that this is the intended training input format.
corpus <- unlist(str_split(read_lines(KID_CORPUS_PATH), " "))

# Sweep 1: vary the minimum word count, hold the rest at their first value.
print(MIN_WORD_COUNT)
map(
  cross(list(MIN_WORD_COUNT, VECTOR_SIZE[1], WINDOW_SIZE[1], EPOCHS[1])),
  train_model,
  corpus
)

# Sweep 2: vary the vector size.
print(VECTOR_SIZE)
map(
  cross(list(MIN_WORD_COUNT[1], VECTOR_SIZE, WINDOW_SIZE[1], EPOCHS[1])),
  train_model,
  corpus
)

# Sweep 3: vary the window size.
print(WINDOW_SIZE)
map(
  cross(list(MIN_WORD_COUNT[1], VECTOR_SIZE[1], WINDOW_SIZE, EPOCHS[1])),
  train_model,
  corpus
)

# Sweep 4: vary the number of epochs.
print(EPOCHS)
## [1] 1 5 10 15 20 50
map(
  cross(list(MIN_WORD_COUNT[1], VECTOR_SIZE[1], WINDOW_SIZE[1], EPOCHS)),
  train_model,
  corpus
)
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 14338 lr: 0.000000 loss: 4.135611 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 66358 lr: 0.000000 loss: 3.837582 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 139513 lr: 0.000000 loss: 3.655409 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 229531 lr: 0.000000 loss: 3.529545 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 323230 lr: 0.000000 loss: 2.799038 ETA: 0h 0m
##
## Read 0M words
## Number of words: 2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 641987 lr: 0.000000 loss: 2.023787 ETA: 0h 0m
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)