I will use the quanteda package because it is more efficient, supports multithreading, and stores all matrices in sparse format by default.
library(tm)
library(readtext)
library(spacyr)
library(stopwords)
library(quanteda)
library(ggplot2)
library(quanteda.textplots)
library(quanteda.textstats)
library(quanteda.dictionaries)
# Let quanteda use up to 10 threads.
quanteda_options(threads = 10)
First, we inspect the size of the three files in megabytes (MB):
# Size of each raw data file in MB (bytes / 1024^2), rounded to one decimal.
corpus_paths <- c(
  Blogs = "en_US.blogs.txt",
  News = "en_US.news.txt",
  Twitter = "en_US.twitter.txt"
)
file_sizes <- round(file.size(corpus_paths) / 1024^2, 1)
names(file_sizes) <- names(corpus_paths)

# Keep the per-file values available under their individual names as well.
blogs_size <- file_sizes[["Blogs"]]
news_size <- file_sizes[["News"]]
twitter_size <- file_sizes[["Twitter"]]

file_sizes
## Blogs News Twitter
## 200.4 196.3 159.4
Line counts:
# Read every line of the three source files as UTF-8, skipping embedded NULs.
# The path is passed directly so that readLines() opens and closes its own
# connection; an explicitly opened file() connection would be left open.
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

# Number of lines per source.
line_counts <- c(
  Blogs = length(blogs),
  News = length(news),
  Twitter = length(twitter)
)
line_counts
## Blogs News Twitter
## 899288 1010242 2360148
Character counts:
# Total number of characters per source (sum of the per-line lengths).
char_counts <- vapply(
  list(Blogs = blogs, News = news, Twitter = twitter),
  function(lines) sum(nchar(lines)),
  integer(1)
)
char_counts
## Blogs News Twitter
## 206824505 203223159 162096241
Average characters per line:
# Mean line length in characters per source, rounded to a whole number.
char_average <- vapply(
  list(Blogs = blogs, News = news, Twitter = twitter),
  function(lines) round(mean(nchar(lines)), 0),
  numeric(1)
)
char_average
## Blogs News Twitter
## 230 201 69
Word counts:
# Per-line word counts, where a "word" is any field obtained by splitting the
# line on a single space character.
count_space_fields <- function(lines) {
  lengths(strsplit(lines, " "))
}
word_blogs <- count_space_fields(blogs)
word_news <- count_space_fields(news)
word_twitter <- count_space_fields(twitter)

# Total number of words per source.
word_counts <- vapply(
  list(Blogs = word_blogs, News = word_news, Twitter = word_twitter),
  sum,
  integer(1)
)
word_counts
## Blogs News Twitter
## 37334131 34372530 30373583
Average words per line:
# Mean number of words per line for each source, rounded to a whole number.
word_average <- vapply(
  list(Blogs = word_blogs, News = word_news, Twitter = word_twitter),
  function(per_line) round(mean(per_line), 0),
  numeric(1)
)
word_average
## Blogs News Twitter
## 42 34 13
To tokenize and analyze the data further, we sample 8,000 lines from each set (roughly 0.3–0.9% of the lines in each file, given the line counts above). The summary of the reduced corpus is shown below (Types is the number of unique tokens):
# Draw a reproducible random sample of 8000 lines from each source. The seed
# is reset before every draw so each sample can be regenerated on its own.
set.seed(3333)
blogs_sample <- sample(blogs, size = 8000, replace = FALSE)
set.seed(3333)
news_sample <- sample(news, size = 8000, replace = FALSE)
set.seed(3333)
twitter_sample <- sample(twitter, size = 8000, replace = FALSE)

# Make sure the output folder exists before writing into it.
dir.create("Sampling", showWarnings = FALSE)

# Store one sampled line per file line. writeLines() writes the text as-is;
# write.table() would add a header, row numbers and quote characters, which
# would then be read back as part of the corpus text.
writeLines(blogs_sample, file.path("Sampling", "blogs_sample.txt"))
writeLines(news_sample, file.path("Sampling", "news_sample.txt"))
writeLines(twitter_sample, file.path("Sampling", "twitter_sample.txt"))
# Generate a corpus with the sampled data. DirSource() takes a regular
# expression (not a shell glob), so the ".txt" extension is matched explicitly.
tm_corpus <- VCorpus(DirSource(directory = "Sampling", pattern = "\\.txt$"))
q_corpus <- corpus(tm_corpus)

# Label the documents.
# NOTE(review): this assumes the folder holds exactly the three sample files
# and that they are listed in the order blogs, news, twitter -- confirm if
# other .txt files are ever added to "Sampling".
docvars(q_corpus, field = "TextId") <- c("Blogs", "News", "Twitter")

# Per-document overview restricted to the columns of interest.
summary_columns <- c("TextId", "Types", "Tokens", "Sentences", "language")
summary(q_corpus)[, summary_columns]
## TextId Types Tokens Sentences language
## 1 Blogs 41346 416396 18258 en
## 2 News 41021 361075 14824 en
## 3 Twitter 25204 165669 8918 en
Next, we perform an exploratory analysis to understand which words occur and how frequently. First, we generate a list of the top terms after removing punctuation, symbols, numbers, URLs and stop words. Stop words are articles, conjunctions, prepositions, pronouns, and common verbs that carry little meaning on their own.
# Tokenise the corpus, dropping punctuation, symbols, numbers, URLs and
# separators.
topTerms_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Lower-case the tokens, then discard English stop words.
topTerms <- tokens_remove(
  tokens_tolower(topTerms_tokens),
  pattern = stopwords("english")
)
topTerms_DFM <- dfm(topTerms)

# The 20 most frequent terms as a data frame (row names are the terms).
topTerms_freq <- topfeatures(topTerms_DFM, n = 20)
topTerms_Top20 <- data.frame(
  Frequency = topTerms_freq,
  Words = names(topTerms_freq)
)

# Order the factor levels by frequency so the bars are drawn sorted.
topTerms_Top20$Words <- reorder(
  factor(topTerms_Top20$Words),
  topTerms_Top20$Frequency
)

topTerms_Top20Plot <- ggplot(topTerms_Top20, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 most frequent words")
topTerms_Top20Plot
We can also visualize the top terms with a word cloud:
# Word cloud of the top-terms matrix, limited to terms counted at least 100
# times and coloured with a 20-colour topo palette.
cloud_palette <- topo.colors(20)
textplot_wordcloud(topTerms_DFM, min_count = 100, color = cloud_palette)
To prepare for predicting the next word, we have to generate n-grams, which are sequences of words that appear together in our texts. The stop words are not removed, since they are essential for predicting the next word.
The two-grams and the 20 most frequently found:
# Tokenise the corpus (stop words are kept), dropping punctuation, symbols,
# numbers, URLs and separators.
twoGram_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Build lower-cased 2-grams whose words are joined by a single space.
twoGram <- tokens_ngrams(
  tokens_tolower(twoGram_tokens),
  n = 2,
  concatenator = " "
)
twoGram_DFM <- dfm(twoGram)

# The 20 most frequent 2-grams as a data frame (row names are the 2-grams).
twoGram_freq <- topfeatures(twoGram_DFM, n = 20)
twoGram_Top20 <- data.frame(
  Frequency = twoGram_freq,
  Words = names(twoGram_freq)
)

# Order the factor levels by frequency so the bars are drawn sorted.
twoGram_Top20$Words <- reorder(
  factor(twoGram_Top20$Words),
  twoGram_Top20$Frequency
)

twoGram_Top20Plot <- ggplot(twoGram_Top20, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 most frequent 2-Grams")
twoGram_Top20Plot
The three-grams and the 20 most frequently found:
# Tokenise the corpus (stop words are kept), dropping punctuation, symbols,
# numbers, URLs and separators.
threeGram_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Build lower-cased 3-grams whose words are joined by a single space.
threeGram <- tokens_ngrams(
  tokens_tolower(threeGram_tokens),
  n = 3,
  concatenator = " "
)
threeGram_DFM <- dfm(threeGram)

# The 20 most frequent 3-grams as a data frame (row names are the 3-grams).
threeGram_freq <- topfeatures(threeGram_DFM, n = 20)
threeGram_Top20 <- data.frame(
  Frequency = threeGram_freq,
  Words = names(threeGram_freq)
)

# Order the factor levels by frequency so the bars are drawn sorted.
threeGram_Top20$Words <- reorder(
  factor(threeGram_Top20$Words),
  threeGram_Top20$Frequency
)

threeGram_Top20Plot <- ggplot(threeGram_Top20, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 most frequent 3-Grams")
threeGram_Top20Plot
The four-grams and the 20 most frequently found:
# Tokenise the corpus (stop words are kept), dropping punctuation, symbols,
# numbers, URLs and separators.
fourGram_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Build lower-cased 4-grams whose words are joined by a single space.
fourGram <- tokens_ngrams(
  tokens_tolower(fourGram_tokens),
  n = 4,
  concatenator = " "
)
fourGram_DFM <- dfm(fourGram)

# The 20 most frequent 4-grams as a data frame (row names are the 4-grams).
fourGram_freq <- topfeatures(fourGram_DFM, n = 20)
fourGram_Top20 <- data.frame(
  Frequency = fourGram_freq,
  Words = names(fourGram_freq)
)

# Order the factor levels by frequency so the bars are drawn sorted.
fourGram_Top20$Words <- reorder(
  factor(fourGram_Top20$Words),
  fourGram_Top20$Frequency
)

fourGram_Top20Plot <- ggplot(fourGram_Top20, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 most frequent 4-Grams")
fourGram_Top20Plot
The data provided is too big to analyze on a typical computer. After examining the type and size of the data, I randomly sampled 8,000 lines from each file (less than 1% of each) for further analysis. The sampled data was tokenized and analyzed to find the most frequent words. Afterwards, lists of n-grams were generated for later use in the prediction algorithm. For visualization, bar charts of the 20 most frequent words, 2-grams, 3-grams and 4-grams were shown. The next step is to build the prediction algorithm and the Shiny app.