This document describes the initial exploratory analysis of the English-language Twitter, News, and Blogs data sets. This markdown document walks through loading and tokenizing the data, explores summary statistics for all three files, and presents histograms and frequency tables used to estimate the effective dictionary size. One thing that becomes apparent when viewing these histograms and frequency tables is that they contain a great many stopwords (words like “the”, “and”, and “to” that add little semantic value). A conscious decision was made to keep these stopwords, since word prediction without them tends to become cryptic and the results can be hard to understand. A dual approach may eventually be warranted, pairing a model that includes stopwords with one in which they are removed; in any case, eliminating them entirely seems like a bad idea.
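For reference, if a stopword-free companion model is built later, the removal step could look something like the sketch below. This is illustrative only and is not run as part of this analysis; it assumes the tm package is attached and reuses its removeWords transformer and built-in English stopword list.
# Illustrative sketch (not run): strip English stopwords from an existing corpus
remove_stopwords <- function(x) {
    tm_map(x, content_transformer(removeWords), stopwords("english"))
}
# e.g. ct_twitter_nostop <- remove_stopwords(ct_twitter)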
Readers more interested in the frequency tables and histograms may skip past the development code to the end of the report.
The raw data are read in from the text files and converted to the ISO-8859-1 encoding, which makes downstream tasks such as tokenizing and frequency analysis easier.
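The code below assumes that the packages providing these functions were attached in a setup chunk that is not shown in this report. The list here is an educated guess based on the functions used, not a verbatim copy of that chunk.
# Assumed setup chunk (not shown in the original report)
library(tm)     # VCorpus, tm_map, content_transformer, TermDocumentMatrix
library(NLP)    # words() and ngrams() used by the n-gram tokenizers
library(dplyr)  # arrange(), filter(), mutate(), select() and the %>% pipe
# SnowballC must also be installed for stemDocument() to work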
timestamp()
## ##------ Wed Jan 22 16:17:45 2020 ------##
message("Reading in twitter data file...")
## Reading in twitter data file...
lines_twitter <- suppressWarnings(readLines("en_US.twitter.txt"))
lines_twitter <- iconv(lines_twitter, from = "Windows-1254", to = "ISO-8859-1")
message(sprintf(" %d lines read in.", length(lines_twitter)))
## 2360148 lines read in.
timestamp()
## ##------ Wed Jan 22 16:17:54 2020 ------##
message("Reading in news data file...")
## Reading in news data file...
lines_news <- suppressWarnings(readLines("en_US.news.txt"))
lines_news <- iconv(lines_news, from = "Windows-1254", to = "ISO-8859-1")
message(sprintf(" %d lines read in.", length(lines_news)))
## 1010242 lines read in.
timestamp()
## ##------ Wed Jan 22 16:18:00 2020 ------##
message("Reading in blogs data file...")
## Reading in blogs data file...
lines_blogs <- suppressWarnings(readLines("en_US.blogs.txt"))
lines_blogs <- iconv(lines_blogs, from = "Windows-1254", to = "ISO-8859-1")
message(sprintf(" %d lines read in.", length(lines_blogs)))
## 899288 lines read in.
message(" Done.")
## Done.
timestamp()
## ##------ Wed Jan 22 16:18:07 2020 ------##
Next, a few helper functions are defined: transformations that strip whitespace, punctuation, and numbers (and lower-case and stem the text), plus functions that tokenize and tabulate n-grams.
# Creating tokenizer functions
ngramTokenizer <- function(x, n) unlist(lapply(ngrams(words(x), n), paste, collapse = " "), use.names = FALSE)
BigramTokenizer <- function(x) ngramTokenizer(x, 2)
TrigramTokenizer <- function(x) ngramTokenizer(x, 3)
# The transform_corpus() function takes a corpus as input and returns a standardized corpus
# (whitespace-stripped, lower-cased, punctuation- and number-free, stemmed) ready for tokenizing.
transform_corpus <- function(x) {
out <- tm_map(x, content_transformer(stripWhitespace)) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(content_transformer(removePunctuation)) %>%
tm_map(content_transformer(stemDocument), language = "english") %>%
tm_map(content_transformer(removeNumbers))
return(out)
}
# The following functions build frequency tables from a term-document matrix.
# Note: table(x$i) counts the nonzero (term, document) entries, i.e. the number of lines in
# which each term appears; it lines up with dimnames$Terms because every term index occurs at least once.
make_1gram <- function(x) {
terms <- x$dimnames$Terms
terms <- data.frame(terms, stringsAsFactors = FALSE)
colnames(terms) <- c("lastTerm")
freqs <- data.frame(table(x$i))
terms$IDX <- freqs$Var1
terms$frequency <- freqs$Freq
terms <- arrange(terms, desc(frequency))
return(terms)
}
make_2gram <- function(x, trimlow = 0) {
terms <- x$dimnames$Terms
terms <- strsplit(terms, " ", fixed = TRUE)
terms <- data.frame(matrix(unlist(terms), nrow = length(unlist(terms))/2, byrow = TRUE), stringsAsFactors = FALSE)
colnames(terms) <- c("firstTerms","lastTerm")
freqs <- data.frame(table(x$i))
terms$IDX <- freqs$Var1
terms$frequency <- freqs$Freq
terms <- arrange(terms, desc(frequency))
terms <- filter(terms, frequency > trimlow)
return(terms)
}
make_3gram <- function(x, trimlow = 0) {
terms <- x$dimnames$Terms
terms <- strsplit(terms, " ", fixed = TRUE)
terms <- data.frame(matrix(unlist(terms), nrow = length(unlist(terms))/3, byrow = TRUE), stringsAsFactors = FALSE)
terms <- mutate(terms, firstTerms = paste(X1,X2,sep = "_"), firstTerms2 = X2, lastTerm = X3)
terms <- select(terms, firstTerms, firstTerms2, lastTerm)
freqs <- data.frame(table(x$i))
terms$IDX <- freqs$Var1
terms$frequency <- freqs$Freq
terms <- arrange(terms, desc(frequency))
terms <- filter(terms, frequency > trimlow)
return(terms)
}
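As a quick sanity check of the tokenizer logic (illustrative only; the toy vector and the use of strsplit() in place of words() are assumptions, not part of the pipeline):
# Toy example of the n-gram construction used by ngramTokenizer()
toy <- strsplit("one of the best", " ")[[1]]
unlist(lapply(ngrams(toy, 2), paste, collapse = " "), use.names = FALSE)
## [1] "one of"   "of the"   "the best"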
The data read in so far is just free-form text. The next step converts it into corpora and tokenizes it into single words, bigrams, and trigrams.
timestamp()
## ##------ Wed Jan 22 16:18:07 2020 ------##
vs_twitter <- VectorSource(lines_twitter)
vs_news <- VectorSource(lines_news)
vs_blogs <- VectorSource(lines_blogs)
ct_twitter_0 <- VCorpus(vs_twitter, readerControl = list(reader = readPlain, language = "en", load = TRUE))
ct_news_0 <- VCorpus(vs_news, readerControl = list(reader = readPlain, language = "en", load = TRUE))
ct_blogs_0 <- VCorpus(vs_blogs, readerControl = list(reader = readPlain, language = "en", load = TRUE))
ct_twitter <- transform_corpus(ct_twitter_0)
ct_news <- transform_corpus(ct_news_0)
ct_blogs <- transform_corpus(ct_blogs_0)
timestamp()
## ##------ Wed Jan 22 16:35:29 2020 ------##
# parsing 1-grams (single words)
tdm_twitter_1 <- TermDocumentMatrix(ct_twitter)
tdm_news_1 <- TermDocumentMatrix(ct_news)
tdm_blogs_1 <- TermDocumentMatrix(ct_blogs)
timestamp()
## ##------ Wed Jan 22 16:48:28 2020 ------##
# parsing bigrams
tdm_twitter_2 <- TermDocumentMatrix(ct_twitter, control = list(tokenize = BigramTokenizer))
tdm_news_2 <- TermDocumentMatrix(ct_news, control = list(tokenize = BigramTokenizer))
tdm_blogs_2 <- TermDocumentMatrix(ct_blogs, control = list(tokenize = BigramTokenizer))
timestamp()
## ##------ Wed Jan 22 17:13:57 2020 ------##
# parsing trigrams
tdm_twitter_3 <- TermDocumentMatrix(ct_twitter, control = list(tokenize = TrigramTokenizer))
tdm_news_3 <- TermDocumentMatrix(ct_news, control = list(tokenize = TrigramTokenizer))
tdm_blogs_3 <- TermDocumentMatrix(ct_blogs, control = list(tokenize = TrigramTokenizer))
timestamp()
## ##------ Wed Jan 22 17:48:45 2020 ------##
Next, the parsed data can be examined as single words in isolation (1-grams), bigrams, or trigrams. This also gives insight into how big the dictionary needs to be: in the simplest case, a word, bigram, or trigram with a frequency of one is unlikely to be useful as a predictor, so it can probably be discarded immediately.
timestamp()
## ##------ Wed Jan 22 17:48:45 2020 ------##
# creating 1-gram frequency tables
freq_twitter_1 <- make_1gram(tdm_twitter_1)
freq_news_1 <- make_1gram(tdm_news_1)
freq_blogs_1 <- make_1gram(tdm_blogs_1)
# showing the 20 most frequent 1-grams for each file
table_1 <- cbind(freq_twitter_1[1:20,], freq_news_1[1:20,], freq_blogs_1[1:20,])
colnames(table_1) <- c("Twitter", "Index", "Freq", "News", "Index", "Freq", "Blogs", "Index", "Freq")
print(table_1[,c(1,3,4,6,7,9)])
## Twitter Freq News Freq.1 Blogs Freq.2
## 1 the 704520 the 649843 the 354758
## 2 you 425145 and 458362 and 300686
## 3 and 376379 for 238726 that 161585
## 4 for 342123 that 237492 for 155095
## 5 that 245465 said 191214 with 127926
## 6 your 181841 with 179898 this 118706
## 7 have 168942 was 149771 you 107246
## 8 with 159768 but 118810 have 105713
## 9 this 151511 from 114477 was 98345
## 10 are 144110 have 108981 but 94516
## 11 just 141328 are 100175 are 84785
## 12 get 134899 his 98995 not 75977
## 13 thank 124490 has 90377 from 72795
## 14 what 119276 this 90015 all 72019
## 15 like 118671 not 84033 one 63946
## 16 but 117721 who 83117 out 56418
## 17 not 112785 year 82312 about 54154
## 18 all 111382 will 78388 they 53733
## 19 love 109909 they 75669 what 53396
## 20 out 108227 about 68621 like 53299
message(sprintf("For 1-grams, note that a total of all frequencies is essentially a word count: %d",
sum(freq_twitter_1$frequency)+sum(freq_news_1$frequency)+sum(freq_blogs_1$frequency)))
## For 1-grams, note that a total of all frequencies is essentially a word count: 54227667
message(sprintf("Total size before pruning the elements with a frequency of one: %d",
dim(freq_twitter_1)[1]+dim(freq_news_1)[1]+dim(freq_blogs_1)[1]))
## Total size before pruning the elements with a frequency of one: 848144
# pruning
freq_twitter_1 <- freq_twitter_1[freq_twitter_1$frequency > 1,]
freq_news_1 <- freq_news_1[freq_news_1$frequency > 1,]
freq_blogs_1 <- freq_blogs_1[freq_blogs_1$frequency > 1,]
message(sprintf("Total size after pruning the elements with a frequency of one : %d",
dim(freq_twitter_1)[1]+dim(freq_news_1)[1]+dim(freq_blogs_1)[1]))
## Total size after pruning the elements with a frequency of one : 311683
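For reference, individual entries can be pulled out of these tables with an ordinary dplyr filter. The example below is illustrative and was not part of the original run; the expected frequency comes from the Twitter column of the table printed above.
# Illustrative lookup of a single term in the Twitter unigram table
filter(freq_twitter_1, lastTerm == "love")
# should return one row with frequency 109909 (see the table above)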
# creating bigram frequency tables
freq_twitter_2 <- make_2gram(tdm_twitter_2)
freq_news_2 <- make_2gram(tdm_news_2)
freq_blogs_2 <- make_2gram(tdm_blogs_2)
# showing the 20 most frequent bigrams for each file
table_2 <- cbind(freq_twitter_2[1:20,], freq_news_2[1:20,], freq_blogs_2[1:20,])
colnames(table_2) <- c("Twt-1", "Twt-2", "Index", "Freq", "News-1", "News-2", "Index", "Freq",
"Blogs-1", "Blogs-2", "Index", "Freq")
print(table_2[,c(1,2,4,5,6,8,9,10,12)])
## Twt-1 Twt-2 Freq News-1 News-2 Freq.1 Blogs-1 Blogs-2 Freq.2
## 1 in the 73615 of the 136756 of the 81085
## 2 for the 71221 in the 133242 in the 72485
## 3 of the 53205 to the 66733 to the 43087
## 4 on the 46149 on the 57870 on the 39536
## 5 go to 45321 for the 55678 to be 34083
## 6 to be 44348 at the 46995 for the 31490
## 7 thank for 43193 and the 42148 and the 30571
## 8 to the 41330 in a 41598 and i 28511
## 9 have a 37209 to be 36909 i have 26883
## 10 at the 35598 with the 35386 at the 25230
## 11 i love 34006 from the 30499 it was 24895
## 12 thank you 31923 with a 28290 in a 24573
## 13 want to 31422 of a 27011 is a 24317
## 14 if you 30786 he said 26880 i was 24256
## 15 i have 29700 as a 25372 with the 23884
## 16 for a 28333 for a 25100 it is 23716
## 17 i am 27309 that the 24759 i am 23412
## 18 i dont 27175 is a 23254 from the 20847
## 19 to see 27105 by the 23175 that i 20364
## 20 to get 26238 will be 22047 want to 20317
message(sprintf("Total size before pruning the elements with a frequency of one: %d",
dim(freq_twitter_2)[1]+dim(freq_news_2)[1]+dim(freq_blogs_2)[1]))
## Total size before pruning the elements with a frequency of one: 12390006
# pruning
freq_twitter_2 <- freq_twitter_2[freq_twitter_2$frequency > 1,]
freq_news_2 <- freq_news_2[freq_news_2$frequency > 1,]
freq_blogs_2 <- freq_blogs_2[freq_blogs_2$frequency > 1,]
message(sprintf("Total size after pruning the elements with a frequency of one : %d",
dim(freq_twitter_2)[1]+dim(freq_news_2)[1]+dim(freq_blogs_2)[1]))
## Total size after pruning the elements with a frequency of one : 3789035
# creating trigram frequency tables
freq_twitter_3 <- make_3gram(tdm_twitter_3)
freq_news_3 <- make_3gram(tdm_news_3)
freq_blogs_3 <- make_3gram(tdm_blogs_3)
# showing the 20 most frequent trigrams for each file
table_3 <- cbind(freq_twitter_3[1:20,], freq_news_3[1:20,], freq_blogs_3[1:20,])
colnames(table_3) <- c("Twt-1/2", "Twt-2", "Twt-3", "Index", "Freq", "News-1/2", "News-2", "News-3", "Index", "Freq",
"Blogs-1/2", "Blogs-2", "Blogs-3", "Index", "Freq")
print(table_3[,c(1,3,5,6,8,10,11,13,15)])
## Twt-1/2 Twt-3 Freq News-1/2 News-3 Freq.1 Blogs-1/2 Blogs-3 Freq.2
## 1 thank_for the 23460 one_of the 11901 one_of the 7739
## 2 look_forward to 11048 a_lot of 8573 a_lot of 6608
## 3 thank_you for 8410 part_of the 5615 i_want to 5936
## 4 cant_wait to 7967 as_well as 5282 it_was a 4178
## 5 for_the follow 7955 the_end of 4768 be_abl to 4068
## 6 i_want to 7688 accord_to the 4758 some_of the 3972
## 7 i_love you 7421 out_of the 4706 a_coupl of 3845
## 8 go_to be 7139 some_of the 4606 as_well as 3800
## 9 have_a great 6022 to_be a 4442 to_be a 3713
## 10 a_lot of 5998 in_the first 4416 out_of the 3705
## 11 to_be a 5827 the_unit state 3840 the_end of 3698
## 12 i_need to 5792 go_to be 3708 i_have been 3437
## 13 to_see you 5744 be_abl to 3703 i_have a 3321
## 14 i_have a 5490 the_first time 3496 this_is a 3181
## 15 im_go to 5414 it_was a 3467 i_have to 3100
## 16 one_of the 5340 member_of the 3352 the_rest of 3081
## 17 i_have to 4764 end_of the 3133 part_of the 3042
## 18 is_go to 4497 said_in a 2962 one_of my 3032
## 19 i_dont know 4438 for_the first 2911 i_had to 2916
## 20 let_me know 4314 of_the year 2891 there_is a 2787
message(sprintf("Total size before pruning the elements with a frequency of one: %d",
dim(freq_twitter_3)[1]+dim(freq_news_3)[1]+dim(freq_blogs_3)[1]))
## Total size before pruning the elements with a frequency of one: 37952155
# pruning
freq_twitter_3 <- freq_twitter_3[freq_twitter_3$frequency > 1,]
freq_news_3 <- freq_news_3[freq_news_3$frequency > 1,]
freq_blogs_3 <- freq_blogs_3[freq_blogs_3$frequency > 1,]
message(sprintf("Total size after pruning the elements with a frequency of one : %d",
dim(freq_twitter_3)[1]+dim(freq_news_3)[1]+dim(freq_blogs_3)[1]))
## Total size after pruning the elements with a frequency of one : 6249408
Pruning single-occurrence entries was especially important for the trigrams: the final table is roughly one-sixth (about 16%) of its initial size.
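The retention ratios after pruning can be read straight off the totals printed above (a small illustrative calculation, not part of the processing pipeline):
# Fraction of unique n-grams kept after dropping frequency-one entries
# (totals taken from the messages printed above)
c(unigrams = 311683 / 848144,      # ~0.37
  bigrams  = 3789035 / 12390006,   # ~0.31
  trigrams = 6249408 / 37952155)   # ~0.16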
The following histograms show graphically how often each of the most frequent words, bigrams, and trigrams appears in the files. Only the top 25 are shown here, but it is clear that the top few elements of each have significantly higher frequency than the rest.
# Twitter 1-grams
freq_twitter_1a <- freq_twitter_1[1:25,]
freq_twitter_1a <- arrange(freq_twitter_1a, frequency)
par(mai=c(1,2,1,1))
barplot(freq_twitter_1a$frequency, names.arg = freq_twitter_1a$lastTerm, horiz = TRUE, las=1,
main = "Twitter 1-grams", xlab = "Frequency")
# Twitter bigrams
freq_twitter_2a <- freq_twitter_2[1:25,]
freq_twitter_2a <- arrange(freq_twitter_2a, frequency)
freq_twitter_2a$label <- paste(freq_twitter_2a$firstTerms, freq_twitter_2a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_twitter_2a$frequency, names.arg = freq_twitter_2a$label, horiz = TRUE, las=1,
main = "Twitter Bigrams", xlab = "Frequency")
# Twitter trigrams
freq_twitter_3a <- freq_twitter_3[1:25,]
freq_twitter_3a <- arrange(freq_twitter_3a, frequency)
freq_twitter_3a$label <- paste(freq_twitter_3a$firstTerms, freq_twitter_3a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_twitter_3a$frequency, names.arg = freq_twitter_3a$label, horiz = TRUE, las=1,
main = "Twitter Trigrams", xlab = "Frequency")
# News 1-grams
freq_news_1a <- freq_news_1[1:25,]
freq_news_1a <- arrange(freq_news_1a, frequency)
par(mai=c(1,2,1,1))
barplot(freq_news_1a$frequency, names.arg = freq_news_1a$lastTerm, horiz = TRUE, las=1,
main = "News 1-grams", xlab = "Frequency")
# News bigrams
freq_news_2a <- freq_news_2[1:25,]
freq_news_2a <- arrange(freq_news_2a, frequency)
freq_news_2a$label <- paste(freq_news_2a$firstTerms, freq_news_2a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_news_2a$frequency, names.arg = freq_news_2a$label, horiz = TRUE, las=1,
main = "News Bigrams", xlab = "Frequency")
# News trigrams
freq_news_3a <- freq_news_3[1:25,]
freq_news_3a <- arrange(freq_news_3a, frequency)
freq_news_3a$label <- paste(freq_news_3a$firstTerms, freq_news_3a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_news_3a$frequency, names.arg = freq_news_3a$label, horiz = TRUE, las=1,
main = "News Trigrams", xlab = "Frequency")
# Blogs 1-grams
freq_blogs_1a <- freq_blogs_1[1:25,]
freq_blogs_1a <- arrange(freq_blogs_1a, frequency)
par(mai=c(1,2,1,1))
barplot(freq_blogs_1a$frequency, names.arg = freq_blogs_1a$lastTerm, horiz = TRUE, las=1,
main = "Blogs 1-grams", xlab = "Frequency")
# Blogs bigrams
freq_blogs_2a <- freq_blogs_2[1:25,]
freq_blogs_2a <- arrange(freq_blogs_2a, frequency)
freq_blogs_2a$label <- paste(freq_blogs_2a$firstTerms, freq_blogs_2a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_blogs_2a$frequency, names.arg = freq_blogs_2a$label, horiz = TRUE, las=1,
main = "Blogs Bigrams", xlab = "Frequency")
# Blogs trigrams
freq_blogs_3a <- freq_blogs_3[1:25,]
freq_blogs_3a <- arrange(freq_blogs_3a, frequency)
freq_blogs_3a$label <- paste(freq_blogs_3a$firstTerms, freq_blogs_3a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_blogs_3a$frequency, names.arg = freq_blogs_3a$label, horiz = TRUE, las=1,
main = "Blogs Trigrams", xlab = "Frequency")