Our data comes from HC Corpora and can be downloaded directly from here.
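A minimal sketch of how the archive might be fetched and unpacked (the URL and archive name below are placeholders, not the actual link):
data_url <- "<corpus download link>"   # placeholder, replace with the actual download link
if (!dir.exists("./final")) {
  download.file(data_url, destfile = "corpus.zip", mode = "wb")
  unzip("corpus.zip")                  # expected to create the ./final/ directory used below
}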
First, we will read all our files:
US_blogs <- readLines("./final/en_US/en_US.blogs.txt")
US_news <- readLines("./final/en_US/en_US.news.txt")
US_twitter <- readLines("./final/en_US/en_US.twitter.txt")
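The summary below could be produced along these lines (a sketch; using object.size() for size and stringi::stri_count_words() for word counts is an assumption about how the original numbers were computed):
library(stringi)
# build a small summary table: in-memory size, line count and word count per source
corpus_summary <- data.frame(
  File      = c("US Blogs", "US News", "US Twitter"),
  FileSize  = sapply(list(US_blogs, US_news, US_twitter),
                     function(x) format(object.size(x), units = "Mb")),
  LineCount = sapply(list(US_blogs, US_news, US_twitter), length),
  WordCount = sapply(list(US_blogs, US_news, US_twitter),
                     function(x) sum(stri_count_words(x)))
)
corpus_summary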
##         File FileSize LineCount WordCount
## 1   US Blogs 255.4 Mb    899288  38154238
## 2    US News  19.8 Mb     77259   2693898
## 3 US Twitter   319 Mb   2360148  30218125
library(qdap)
set.seed(09092020)
# take a 1,000-line sample from each source (US/English data shown)
sampleUS_blogs <- sample(US_blogs, 1000, replace = FALSE)
sampleUS_news <- sample(US_news, 1000, replace = FALSE)
sampleUS_twitter <- sample(US_twitter, 1000, replace = FALSE)
# combine the sampled data into one English set
en_SampledData <- c(sampleUS_blogs, sampleUS_news, sampleUS_twitter)
# detect sentences in combined & individual sampled data
us_sentences <- sent_detect(en_SampledData, language = "en", model = NULL)
blogs_sentences <- sent_detect(sampleUS_blogs, language = "en", model = NULL)
news_sentences <- sent_detect(sampleUS_news, language = "en", model = NULL)
twitter_sentences <- sent_detect(sampleUS_twitter, language = "en", model = NULL)
# remove unneeded variables
remove(sampleUS_blogs, sampleUS_news, sampleUS_twitter, en_SampledData,
       US_blogs, US_news, US_twitter)
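As a quick sanity check (not part of the original write-up), one can compare how many sentences were detected in each set:
# number of detected sentences in the combined and per-source samples
sapply(list(combined = us_sentences, blogs = blogs_sentences,
            news = news_sentences, twitter = twitter_sentences), length)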
Here we clean the sentences collected from the samples above, for both the combined and the individual data sets; only the code for the combined set is shown.
library(tm)
# vectorize my ENGLISH data for further cleaning
us_sentences <- VCorpus(VectorSource(us_sentences))
# cleaning of the vectorized data is done in several steps:
us_sentences <- tm_map(us_sentences, removeNumbers) # remove numbers
us_sentences <- tm_map(us_sentences, stripWhitespace) # collapse extra whitespace
us_sentences <- tm_map(us_sentences, content_transformer(tolower)) # convert to lowercase
us_sentences <- tm_map(us_sentences, removePunctuation) # remove punctuation and special characters
us_sentences <- tm_map(us_sentences, removeWords, stopwords("english")) # remove English stop words
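To verify that the transformations behaved as expected, it can help to inspect a few cleaned entries (a quick check added here, not part of the original code):
# look at the first few cleaned sentences
lapply(us_sentences[1:3], as.character)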
Here we structure the cleaned data into a data frame and tokenize it to extract 1-, 2- and 3-grams; the code is shown for the combined data, and the same is done for the individual data sets.
library(RWeka)
# structure into a data frame .. entire ENGLISH set
us_sentences <- data.frame(text = sapply(us_sentences, as.character),
                           stringsAsFactors = FALSE)
# get 1, 2 & 3-grams from the ENGLISH structured data
us_UniGram <- NGramTokenizer(us_sentences$text, Weka_control(min = 1, max = 1, delimiters = " \\r\\n\\t.,;:\"()?!"))
us_BiGram <- NGramTokenizer(us_sentences$text, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
us_TriGram <- NGramTokenizer(us_sentences$text, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
Here we visually explore word frequencies for the 1-grams, for the combined and individual samples.
We then plot the most frequent 2- and 3-grams in our combined English data sample.
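A minimal sketch of how such frequency plots could be generated (the helper functions, the ggplot2 approach and the choice of the top 15 n-grams are assumptions, not the original plotting code):
library(ggplot2)
# count the occurrences of each n-gram and keep the most frequent ones
top_ngrams <- function(tokens, n = 15) {
  freq <- sort(table(tokens), decreasing = TRUE)[1:n]
  data.frame(ngram = names(freq), count = as.integer(freq))
}
# horizontal bar chart of the most frequent n-grams
plot_ngrams <- function(tokens, title, n = 15) {
  df <- top_ngrams(tokens, n)
  ggplot(df, aes(x = reorder(ngram, count), y = count)) +
    geom_col() +
    coord_flip() +
    labs(x = NULL, y = "Frequency", title = title)
}
plot_ngrams(us_UniGram, "Most frequent 1-grams (combined US sample)")
plot_ngrams(us_BiGram, "Most frequent 2-grams (combined US sample)")
plot_ngrams(us_TriGram, "Most frequent 3-grams (combined US sample)")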