# Load all required packages
library(tidyverse) # Attaches the core tidyverse packages, including dplyr and ggplot2
library(quanteda) # For NLP
library(wordcloud) # To generate wordclouds
This paper is my attempt to find out whether the chosen seed has any influence on the resulting word distribution. As before, the en_US locale is used.
The seed values were typed on the keyboard at random rather than generated programmatically, so they are not themselves the output of a pseudo-random number generator.
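For context, set.seed() fixes the state of R's pseudo-random number generator, so the same seed always reproduces the same sample, while different seeds generally produce different samples; the question here is whether those different samples lead to noticeably different word frequencies. A minimal illustration of the mechanism (not part of the pipeline below):
set.seed(358);  sample(letters, 5)  # some draw of five letters
set.seed(358);  sample(letters, 5)  # identical to the previous draw
set.seed(3485); sample(letters, 5)  # generally a different draw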
construct_seed <- function(seed) {
  # Read the three English corpora
  blogs   <- corpus(readLines('./final/en_US/en_US.blogs.txt', encoding = "UTF-8"))
  news    <- corpus(readLines('./final/en_US/en_US.news.txt', encoding = "UTF-8"))
  twitter <- corpus(readLines('./final/en_US/en_US.twitter.txt', encoding = "UTF-8"))

  # Draw a reproducible sample of 5000 documents from each source
  set.seed(seed = seed)
  blogs   <- corpus_sample(blogs, size = 5000)
  news    <- corpus_sample(news, size = 5000)
  twitter <- corpus_sample(twitter, size = 5000)

  # Give the documents unique names so the corpora can be combined
  docnames(blogs)   <- 1:5000
  docnames(news)    <- 5001:10000
  docnames(twitter) <- 10001:15000
  corp <- blogs + news + twitter
  rm(blogs, news, twitter)

  # Tokenise, removing numbers and punctuation
  tokens <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE)
  # Remove stopwords
  tokens <- tokens_select(tokens, pattern = stopwords('en'), selection = 'remove')

  # Build the document-feature matrix and count word frequencies
  matrx <- dfm(tokens)
  matrx <- as.matrix(matrx)
  words <- sort(colSums(matrx), decreasing = TRUE)
  df <- data.frame(word = names(words), freq = words)

  # Draw the word cloud
  wc <- wordcloud(words = df$word, freq = df$freq,
                  min.freq = 1,
                  max.words = 300,
                  random.order = FALSE,
                  rot.per = 0.35,
                  colors = brewer.pal(8, 'Dark2'))

  arg_name <- deparse(substitute(seed))        # Get the seed value as text
  var_name <- paste("wc", arg_name, sep = "_") # Construct the variable name, e.g. wc_358
  assign(var_name, wc, envir = .GlobalEnv)     # The variable is created in .GlobalEnv
}
I will compare the output of my function for different seeds visually, by inspecting the word clouds.
construct_seed(seed = 358)
construct_seed(seed = 3485)
construct_seed(seed = 57890)
construct_seed(seed = 23568)
The word clouds for the four seeds look essentially the same, so the choice of seed appears to have no meaningful effect on the word distribution and any random seed can be used when constructing the final model.
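A purely visual comparison can miss small shifts in frequency, so a simple numeric cross-check would complement the word clouds. As a minimal sketch, assuming construct_seed() were extended to also save its frequency table df to the global environment (as, say, df_358 and df_3485, which the current function does not do), the share of top-n words that two seeds have in common could be computed like this:
# Hypothetical cross-check: fraction of the n most frequent words shared by two runs.
# Assumes df_<seed> data frames sorted by decreasing frequency, as built inside construct_seed().
top_overlap <- function(df_a, df_b, n = 50) {
  top_a <- head(df_a$word, n)
  top_b <- head(df_b$word, n)
  length(intersect(top_a, top_b)) / n
}
# Example usage (hypothetical objects):
# top_overlap(df_358, df_3485, n = 50)
A value close to 1 would confirm numerically what the word clouds suggest visually.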