Params
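The chunks in this section assume a few packages are attached. If they are not already loaded earlier in the document, the setup would look roughly like this (the execute() call further down matches the fastrtext command-line wrapper for fastText, which is the assumed backend here):

library(here)       # here(): project-relative paths
library(readr)      # read_lines()
library(stringr)    # str_split()
library(purrr)      # cross(), map(), walk()
library(fastrtext)  # execute(): command-line style interface to fastText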

KID_CORPUS_PATH <- here("data/processed/other/downsampled_kidbook.txt")

MIN_WORD_COUNT <- c(5, 10, 15, 20)
VECTOR_SIZE <- c(200, 300, 400, 500)
WINDOW_SIZE <- c(5, 10, 15, 20)
EPOCHS <- c(5, 10, 15, 20)
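Each sweep below pins three of these grids at their first value and varies the fourth. purrr::cross() enumerates every combination of the supplied values, and each combination is handed to train_model() (defined next) as a positional list. A quick check of the shape one such sweep produces:

# one sweep: vary MIN_WORD_COUNT, pin the other grids at their first value
params <- cross(list(MIN_WORD_COUNT, VECTOR_SIZE[1], WINDOW_SIZE[1], EPOCHS[1]))
length(params)    # 4 parameter combinations
str(params[[1]])  # a list of (min_word_count, vector_size, window_size, epochs)
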
train_model <- function(params, corpus){
  min_word_count <- params[[1]]
  vector_size <- params[[2]]
  window_size <- params[[3]]
  epoch_size <- params[[4]]

  # write the corpus to a temporary file and train into a temporary model file
  tmp_file_txt <- tempfile()
  tmp_file_model <- tempfile()

  writeLines(text = corpus, con = tmp_file_txt)
  execute(commands = c("skipgram",
                       #set the input and output files
                       "-input", tmp_file_txt,
                       "-output", tmp_file_model,
                       #set the window size if needed, default is 5
                       "-ws", window_size,
                       #min length of char ngram, default is 3
                       "-minn", 1,
                       #max length of char ngram, default is 6; set minn and maxn both to 1 to disable character ngrams
                       "-maxn", 1,
                       #minimal number of word occurrences, default is 5 (could be set as high as 100)
                       "-minCount", min_word_count,
                       #max length of word ngram, default is 1
                       #"-wordNgrams", 1,
                       #number of epochs, default is 5
                       "-epoch", epoch_size,
                       #set the number of dimensions, default is 100
                       "-dim", vector_size,
                       "-verbose", 1))

  # clean up the temporary files; silent = TRUE is an argument to try(), not file.remove()
  walk(list.files(dirname(tmp_file_model), full.names = TRUE),
       ~ try(file.remove(.x), silent = TRUE))

  walk(list.files(dirname(tmp_file_txt), full.names = TRUE),
       ~ try(file.remove(.x), silent = TRUE))
}
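As written, train_model() only surfaces fastText's console output (including the final loss) and then deletes the temporary files, so no fitted model is kept. If a model were needed for closer inspection, a variant could skip the cleanup and read the .bin file back in; the sketch below assumes the fastrtext helpers load_model() and get_dictionary() and is not part of the original pipeline:

# hypothetical helper: load a kept model file and peek at its vocabulary
inspect_model <- function(tmp_file_model) {
  model <- load_model(paste0(tmp_file_model, ".bin"))  # fastText writes <output>.bin
  vocab <- get_dictionary(model)
  head(vocab)
}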

# get corpus
corpus <- read_lines(KID_CORPUS_PATH) %>%
  str_split(" ") %>%
  unlist()
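Because train_model() writes this vector back out with writeLines(), each token ends up on its own line of the temporary input file. A quick sanity check of what was read:

length(corpus)  # total number of tokens in the downsampled corpus
head(corpus)    # first few tokens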

Min word count

print(MIN_WORD_COUNT)
## [1]  5 10 15 20
cross(list(MIN_WORD_COUNT,
           VECTOR_SIZE[1],
           WINDOW_SIZE[1],
           EPOCHS[1])) %>%
  map(train_model, corpus)
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71134 lr:  0.000000 loss:  2.972194 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  1771
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71346 lr:  0.000000 loss:  1.808053 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  1294
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71568 lr:  0.000000 loss:  2.608432 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  991
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   70858 lr:  0.000000 loss:  3.175025 ETA:   0h 0m
## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)

Vector size

print(VECTOR_SIZE)
## [1] 200 300 400 500
cross(list(MIN_WORD_COUNT[1],
           VECTOR_SIZE,
           WINDOW_SIZE[1],
           EPOCHS[1])) %>%
  map(train_model, corpus)
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71311 lr:  0.000000 loss:  3.160637 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   70930 lr:  0.000000 loss:  2.481266 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71255 lr:  0.000000 loss:  2.826568 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   70764 lr:  0.000000 loss:  2.099673 ETA:   0h 0m
## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)

Window size

print(WINDOW_SIZE)
## [1]  5 10 15 20
cross(list(MIN_WORD_COUNT[1],
           VECTOR_SIZE[1],
           WINDOW_SIZE,
           EPOCHS[1])) %>%
  map(train_model, corpus)
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71320 lr:  0.000000 loss:  3.372495 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   70974 lr:  0.000000 loss:  2.403857 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71221 lr:  0.000000 loss:  3.012125 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71241 lr:  0.000000 loss:  2.567020 ETA:   0h 0m
## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)

Epochs

print(EPOCHS)
## [1]  5 10 15 20
cross(list(MIN_WORD_COUNT[1],
           VECTOR_SIZE[1],
           WINDOW_SIZE[1],
           EPOCHS)) %>%
  map(train_model, corpus)
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   71463 lr:  0.000000 loss:  3.042435 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:  140860 lr:  0.000000 loss:  2.129227 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:  207707 lr:  0.000000 loss:  1.515688 ETA:   0h 0m
## 
## Read 0M words
## Number of words:  2876
## Number of labels: 0
## Progress: 100.0% words/sec/thread:  275790 lr:  0.000000 loss:  1.433119 ETA:   0h 0m
## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)