This markdown explores the data in the en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt datasets. The number of lines, the number of words, and the line with the most characters in each dataset are explored below.
Set up required libraries
library(stringi)
library(tokenizers)
Loading in data
# read each file in full, then close its connection
con_blogs = file("final/en_US/en_US.blogs.txt")
blogs = readLines(con_blogs)
close(con_blogs)
con_news = file("final/en_US/en_US.news.txt")
news = readLines(con_news)
close(con_news)
con_twitter = file("final/en_US/en_US.twitter.txt")
twitter = readLines(con_twitter)
close(con_twitter)
Summary of data
# number of characters in the longest line of a character vector
longest = function(strings){
  longest_string <- strings[which.max(nchar(strings))]
  return(nchar(longest_string))
}
# word count of every line in each dataset
words = list()
words$blogs = stri_count_words(blogs)
words$news = stri_count_words(news)
words$twitter = stri_count_words(twitter)
# number of lines in each dataset
lengths = list()
lengths$blogs = length(blogs)
lengths$news = length(news)
lengths$twitter = length(twitter)
# character count of the longest line in each dataset
longest_entry = list()
longest_entry$blogs = longest(blogs)
longest_entry$news = longest(news)
longest_entry$twitter = longest(twitter)
data.frame(total_words = unlist(lapply(words,sum)),
min_words = unlist(lapply(words,min)),
max_words = unlist(lapply(words,max)),
mean_words = unlist(lapply(words,mean)),
median_words = unlist(lapply(words,median)),
line_counts = unlist(lengths),
longest_entry_char = unlist(longest_entry)
)
## total_words min_words max_words mean_words median_words line_counts
## blogs 37546806 0 6726 41.75170 28 899288
## news 2674561 1 1126 34.61812 32 77259
## twitter 30096649 1 47 12.75202 12 2360148
## longest_entry_char
## blogs 40833
## news 5760
## twitter 144
Distribution of number of words
# overlaid histograms of log2(word count) for the three datasets
hist(log(words$blogs,2),col = rgb(1,0,0,0.5),ylim = c(0,300000),main = "Histogram of log(word count,2)")
hist(log(words$news,2),add=TRUE,col = rgb(0,1,0,1))
hist(log(words$twitter,2),add=TRUE,col = rgb(0,0,1,0.5))
legend("topleft",legend = c("blogs","news","twitter"),col = c(rgb(1,0,0,0.5),rgb(0,1,0,1),rgb(0,0,1,0.5)),pch=16)
N-gram analysis
# sample 20% of each corpus for speed, then plot the top 20 tri-, bi- and unigrams
set.seed(02052025)
par(mfrow=c(1,3))
sampleblogs = sample(blogs,lengths$blogs/5)
blogs3 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(blogs3),horiz=TRUE,las=1,cex.names =0.8)
blogs2 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(blogs2),horiz=TRUE,las=1,cex.names =0.8)
blogs1 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(blogs1),horiz=TRUE,las=1,cex.names =0.8)
samplenews = sample(news,lengths$news/5)
news3 = sort(table(unlist(tokenize_ngrams(samplenews,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(news3),horiz=TRUE,las=1,cex.names =0.8)
news2 = sort(table(unlist(tokenize_ngrams(samplenews,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(news2),horiz=TRUE,las=1,cex.names =0.8)
news1 = sort(table(unlist(tokenize_ngrams(samplenews,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(news1),horiz=TRUE,las=1,cex.names =0.8)
sampletwitter = sample(twitter,lengths$twitter/5)
twitter3 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(twitter3),horiz=TRUE,las=1,cex.names =0.8)
twitter2 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(twitter2),horiz=TRUE,las=1,cex.names =0.8)
twitter1 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(twitter1),horiz=TRUE,las=1,cex.names =0.8)
The top 20 n-grams of the three datasets share many collocations:
intersect(intersect(names(blogs3),names(news3)),names(twitter3))
## [1] "one of the" "a lot of" "to be a" "going to be"
intersect(intersect(names(blogs2),names(news2)),names(twitter2))
## [1] "of the" "in the" "to the" "on the" "to be" "for the" "at the"
## [8] "is a"
intersect(intersect(names(blogs1),names(news1)),names(twitter1))
## [1] "the" "and" "to" "a" "of" "i" "in" "that" "is" "it"
## [11] "for" "with" "on"
From the distributions, word counts in the twitter dataset cluster towards the upper end of their range before cutting off abruptly, which is likely a result of Twitter's character limit. Word counts for blogs appear the most varied, with the widest range (0 to 6726) and a marked gap between mean and median (41.75 vs 28), implying a skewed distribution. This may be due to the creative freedom of personal blogs, where no style is prescribed and free-form, stream-of-consciousness writing is possible.

Regardless of dataset, certain words frequently follow certain other words (collocation). These relations can be used to predict the next word, e.g. using trigrams to predict the word that follows an input bigram. A Markov chain can be used, suggesting the candidates with the highest probability of being the next word. To account for new collocations, we can compute probabilities from the given datasets, compute probabilities from user input treated as another dataset, and then aggregate the two with slightly more weight on the user input (to handle collocations the user employs that do not appear in the corpora), so that the model "learns" and applies new collocations.
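As a rough illustration of that approach (not a settled design), the sketch below builds trigram counts from the blogs sample used earlier, looks up the most frequent completions of the last two words of an input phrase, and combines corpus counts with hypothetical user-history counts. The names predict_next_word and combine_counts, the user-history table and the 1.5 weight are illustrative assumptions, not functions from stringi or tokenizers.

# minimal sketch of the trigram "Markov chain" predictor described above
trigram_counts = table(unlist(tokenize_ngrams(sampleblogs,n = 3L)))

predict_next_word = function(input,counts,k = 3){
  # the chain's state is the last two words of the (lower-cased) input
  tokens = unlist(strsplit(tolower(input),"\\s+"))
  state = paste(tail(tokens,2),collapse = " ")
  # candidate trigrams are those that start with the state bigram
  candidates = counts[startsWith(names(counts),paste0(state," "))]
  if(length(candidates) == 0) return(character(0))
  candidates = sort(candidates,decreasing = TRUE)[seq_len(min(k,length(candidates)))]
  # return the third word of each top candidate
  sub("^\\S+\\s+\\S+\\s+","",names(candidates))
}

# combine corpus counts with (hypothetical) user-history counts,
# giving the user history slightly more weight; w = 1.5 is an assumed, untuned value
combine_counts = function(corpus,user,w = 1.5){
  ngrams = union(names(corpus),names(user))
  out = setNames(numeric(length(ngrams)),ngrams)
  out[names(corpus)] = out[names(corpus)] + as.numeric(corpus)
  out[names(user)] = out[names(user)] + w*as.numeric(user)
  sort(out,decreasing = TRUE)
}

predict_next_word("one of the",trigram_counts)

A fuller version would pool all three corpora and fall back to the bigram and unigram tables when a trigram has not been seen, but the lookup would have the same shape.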