Exploring the data

This report explores the data in the en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt datasets. The number of lines, the number of words, and the length (in characters) of the longest line in each dataset are examined below.

Set up required libraries

library(stringi)
library(tokenizers)

Loading in data

con_blogs = file("final/en_US/en_US.blogs.txt")
blogs = readLines(con_blogs,n=1000000000L)
con_news = file("final/en_US/en_US.news.txt")
news = readLines(con_news,n=1000000000L)
con_twitter = file("final/en_US/en_US.twitter.txt")
twitter = readLines(con_twitter,n=1000000000L)

Summary of data

# Returns the character count of the longest string in a vector
longest = function(strings){
  longest_string <- strings[which.max(nchar(strings))]
  return(nchar(longest_string))
}
words = list()
words$blogs = stri_count_words(blogs)
words$news = stri_count_words(news)
words$twitter = stri_count_words(twitter)

lengths = list()
lengths$blogs = length(blogs)
lengths$news = length(news)
lengths$twitter = length(twitter)

longest_entry = list()
longest_entry$blogs = longest(blogs)
longest_entry$news = longest(news)
longest_entry$twitter = longest(twitter)

data.frame(total_words = unlist(lapply(words,sum)),
           min_words = unlist(lapply(words,min)),
           max_words = unlist(lapply(words,max)),
           mean_words = unlist(lapply(words,mean)),
           median_words = unlist(lapply(words,median)),
           line_counts = unlist(lengths),
           longest_entry_char = unlist(longest_entry)
           )
##         total_words min_words max_words mean_words median_words line_counts
## blogs      37546806         0      6726   41.75170           28      899288
## news        2674561         1      1126   34.61812           32       77259
## twitter    30096649         1        47   12.75202           12     2360148
##         longest_entry_char
## blogs                40833
## news                  5760
## twitter                144

Distribution of number of words

# Lines with zero words give log2(0) = -Inf and are silently dropped by hist()
hist(log(words$blogs,2),col = rgb(1,0,0,0.5),ylim = c(0,300000),main = "Histogram of log(word count,2)")
hist(log(words$news,2),add=TRUE,col = rgb(0,1,0,1))
hist(log(words$twitter,2),add=TRUE,col = rgb(0,0,1,0.5))
legend("topleft",legend = c("blogs","news","twitter"),col = c(rgb(1,0,0,0.5),rgb(0,1,0,1),rgb(0,0,1,0.5)),pch=16)

N-gram analysis

set.seed(02052025)
par(mfrow=c(1,3))
# Sample 20% of each dataset, then plot its 20 most frequent tri-, bi- and unigrams
sampleblogs = sample(blogs,lengths$blogs/5)
blogs3 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(blogs3),horiz=TRUE,las=1,cex.names =0.8)
blogs2 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(blogs2),horiz=TRUE,las=1,cex.names =0.8)
blogs1 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(blogs1),horiz=TRUE,las=1,cex.names =0.8)

samplenews = sample(news,lengths$news/5)
news3 = sort(table(unlist(tokenize_ngrams(samplenews,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(news3),horiz=TRUE,las=1,cex.names =0.8)
news2 = sort(table(unlist(tokenize_ngrams(samplenews,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(news2),horiz=TRUE,las=1,cex.names =0.8)
news1 = sort(table(unlist(tokenize_ngrams(samplenews,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(news1),horiz=TRUE,las=1,cex.names =0.8)

sampletwitter = sample(twitter,lengths$twitter/5)
twitter3 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(twitter3),horiz=TRUE,las=1,cex.names =0.8)
twitter2 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(twitter2),horiz=TRUE,las=1,cex.names =0.8)
twitter1 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(twitter1),horiz=TRUE,las=1,cex.names =0.8)

The top 20 n-grams of each dataset share many collocations across the three sources:

intersect(intersect(names(blogs3),names(news3)),names(twitter3))
## [1] "one of the"  "a lot of"    "to be a"     "going to be"
intersect(intersect(names(blogs2),names(news2)),names(twitter2))
## [1] "of the"  "in the"  "to the"  "on the"  "to be"   "for the" "at the" 
## [8] "is a"
intersect(intersect(names(blogs1),names(news1)),names(twitter1))
##  [1] "the"  "and"  "to"   "a"    "of"   "i"    "in"   "that" "is"   "it"  
## [11] "for"  "with" "on"

Final thoughts

From the distributions, Twitter's word counts are concentrated towards the upper end of their range before cutting off abruptly, which is likely a result of Twitter's character limit. Word counts for blogs seem the most varied, with the widest range (0 to 6726) and a noticeable gap between the mean and median (41.75 vs 28), implying a right-skewed distribution. This may reflect the creative freedom of personal blogs, where no style is prescribed and free-form, stream-of-consciousness writing is common.

Regardless of dataset, certain words frequently follow certain other words (collocation). These relations can be used to predict the next word, e.g. using trigrams to predict the word that follows an input bigram. A Markov chain can be used, suggesting the next words with the highest conditional probabilities. To account for new collocations, we can compute probabilities from the given datasets, compute probabilities from the user's own input treated as a separate dataset, and then aggregate the two with slightly more weight given to the user's input (to handle collocations the user employs that do not appear in the datasets), so that the model "learns" and applies new collocations. A minimal sketch of this idea is given below.
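To make the last point concrete, below is a minimal sketch of the prediction and weighting idea, reusing sampleblogs from the n-gram analysis above. The function names (trigram_counts, predict_next, blend_counts), the 0.6 user weight, and the example user sentence are illustrative assumptions rather than part of the analysis; a real predictor would need more careful cleaning and tuning.

library(tokenizers)

# Count trigrams in a character vector, returned as a named integer vector
trigram_counts = function(text){
  tab = table(unlist(tokenize_ngrams(text, n = 3L)))
  setNames(as.integer(tab), names(tab))
}

# Markov-chain style lookup: keep trigrams that start with the input bigram,
# rank them by frequency, and return the most likely third words
predict_next = function(bigram, counts, top = 3L){
  prefix = paste0(bigram, " ")
  candidates = counts[startsWith(names(counts), prefix)]
  candidates = sort(candidates, decreasing = TRUE)
  # drop the leading bigram, keeping only the predicted next word
  head(substring(names(candidates), nchar(prefix) + 1), top)
}

# Blend corpus probabilities with probabilities from the user's own input,
# giving slightly more weight to the user (0.6 is an illustrative choice)
blend_counts = function(corpus_counts, user_counts, user_weight = 0.6){
  all_ngrams = union(names(corpus_counts), names(user_counts))
  corpus_prob = corpus_counts[all_ngrams] / sum(corpus_counts)
  user_prob = user_counts[all_ngrams] / sum(user_counts)
  corpus_prob[is.na(corpus_prob)] = 0
  user_prob[is.na(user_prob)] = 0
  blended = (1 - user_weight) * corpus_prob + user_weight * user_prob
  names(blended) = all_ngrams
  sort(blended, decreasing = TRUE)
}

corpus_tri = trigram_counts(sampleblogs)
predict_next("one of", corpus_tri)

# Hypothetical user history containing collocations that may be rare in the corpus
user_tri = trigram_counts("i love data science and data science loves me")
blended = blend_counts(corpus_tri, user_tri)
predict_next("data science", blended)

Giving the user's counts slightly more weight reflects the idea above of letting newly observed collocations surface quickly; the exact weight would have to be tuned.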