# Load all required packages
library(tidyverse) # Attaches the core tidyverse packages, including dplyr and ggplot2
library(quanteda) # For NLP
library(wordcloud) # To generate wordclouds
This paper is my attempt to find out whether the chosen seed has any influence on the resulting word distribution. As before, the en_US locale is used.
The seed values were typed on the keyboard at random rather than generated programmatically, so they are not themselves the output of a pseudo-random number generator.
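For context, set.seed() fixes the state of R's pseudo-random number generator, so the same seed always reproduces the same sample, while different seeds generally produce different samples; the question here is whether those different samples lead to noticeably different word frequencies. A minimal illustration of the mechanism (not part of the pipeline below):
set.seed(358);  sample(letters, 5)  # some draw of five letters
set.seed(358);  sample(letters, 5)  # identical to the previous draw
set.seed(3485); sample(letters, 5)  # generally a different draw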
construct_seed <- function(seed) {
  # Read the three English corpora
  blogs   <- corpus(readLines('./final/en_US/en_US.blogs.txt', encoding = "UTF-8"))
  news    <- corpus(readLines('./final/en_US/en_US.news.txt', encoding = "UTF-8"))
  twitter <- corpus(readLines('./final/en_US/en_US.twitter.txt', encoding = "UTF-8"))

  # Draw a reproducible sample of 5000 documents from each source
  set.seed(seed = seed)
  blogs   <- corpus_sample(blogs, size = 5000)
  news    <- corpus_sample(news, size = 5000)
  twitter <- corpus_sample(twitter, size = 5000)

  # Give the documents unique names so the corpora can be combined
  docnames(blogs)   <- 1:5000
  docnames(news)    <- 5001:10000
  docnames(twitter) <- 10001:15000
  corp <- blogs + news + twitter
  rm(blogs, news, twitter)

  # Tokenise, removing numbers and punctuation
  tokens <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE)
  # Remove stopwords
  tokens <- tokens_select(tokens, pattern = stopwords('en'), selection = 'remove')

  # Build the document-feature matrix and count word frequencies
  matrx <- dfm(tokens)
  matrx <- as.matrix(matrx)
  words <- sort(colSums(matrx), decreasing = TRUE)
  df <- data.frame(word = names(words), freq = words)

  # Draw the word cloud
  wc <- wordcloud(words = df$word, freq = df$freq,
                  min.freq = 1,
                  max.words = 300,
                  random.order = FALSE,
                  rot.per = 0.35,
                  colors = brewer.pal(8, 'Dark2'))

  arg_name <- deparse(substitute(seed))        # Get the seed value as text
  var_name <- paste("wc", arg_name, sep = "_") # Construct the variable name, e.g. wc_358
  assign(var_name, wc, envir = .GlobalEnv)     # The variable is created in .GlobalEnv
}
I will compare the output of my function for different seeds visually, by inspecting the word clouds.
construct_seed(seed = 358)
construct_seed(seed = 3485)
construct_seed(seed = 57890)
construct_seed(seed = 23568)
The word clouds for the four seeds look essentially the same, so the choice of seed appears to have no meaningful effect on the word distribution and any random seed can be used when constructing the final model.
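A purely visual comparison can miss small shifts in frequency, so a simple numeric cross-check would complement the word clouds. As a minimal sketch, assuming construct_seed() were extended to also save its frequency table df to the global environment (as, say, df_358 and df_3485, which the current function does not do), the share of top-n words that two seeds have in common could be computed like this:
# Hypothetical cross-check: fraction of the n most frequent words shared by two runs.
# Assumes df_<seed> data frames sorted by decreasing frequency, as built inside construct_seed().
top_overlap <- function(df_a, df_b, n = 50) {
  top_a <- head(df_a$word, n)
  top_b <- head(df_b$word, n)
  length(intersect(top_a, top_b)) / n
}
# Example usage (hypothetical objects):
# top_overlap(df_358, df_3485, n = 50)
A value close to 1 would confirm numerically what the word clouds suggest visually.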