This markdown explores the data in the en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt datasets. The number of lines, the number of words, and the line with the most characters in each dataset are explored below.
Set up required libraries
library(stringi)
library(tokenizers)
Loading in data
# read each file in full, then close its connection
con_blogs = file("final/en_US/en_US.blogs.txt")
blogs = readLines(con_blogs)
close(con_blogs)
con_news = file("final/en_US/en_US.news.txt")
news = readLines(con_news)
close(con_news)
con_twitter = file("final/en_US/en_US.twitter.txt")
twitter = readLines(con_twitter)
close(con_twitter)
Summary of data
# number of characters in the longest line of a character vector
longest = function(strings){
  longest_string <- strings[which.max(nchar(strings))]
  return(nchar(longest_string))
}
# word count of every line in each dataset
words = list()
words$blogs = stri_count_words(blogs)
words$news = stri_count_words(news)
words$twitter = stri_count_words(twitter)
# number of lines in each dataset
lengths = list()
lengths$blogs = length(blogs)
lengths$news = length(news)
lengths$twitter = length(twitter)
# character count of the longest line in each dataset
longest_entry = list()
longest_entry$blogs = longest(blogs)
longest_entry$news = longest(news)
longest_entry$twitter = longest(twitter)
data.frame(total_words = unlist(lapply(words,sum)),
min_words = unlist(lapply(words,min)),
max_words = unlist(lapply(words,max)),
mean_words = unlist(lapply(words,mean)),
median_words = unlist(lapply(words,median)),
line_counts = unlist(lengths),
longest_entry_char = unlist(longest_entry)
)
## total_words min_words max_words mean_words median_words line_counts
## blogs 37546806 0 6726 41.75170 28 899288
## news 2674561 1 1126 34.61812 32 77259
## twitter 30096649 1 47 12.75202 12 2360148
## longest_entry_char
## blogs 40833
## news 5760
## twitter 144
Distribution of number of words
# overlaid histograms of log2(word count) for the three datasets
hist(log(words$blogs,2),col = rgb(1,0,0,0.5),ylim = c(0,300000),main = "Histogram of log(word count,2)")
hist(log(words$news,2),add=TRUE,col = rgb(0,1,0,1))
hist(log(words$twitter,2),add=TRUE,col = rgb(0,0,1,0.5))
legend("topleft",legend = c("blogs","news","twitter"),col = c(rgb(1,0,0,0.5),rgb(0,1,0,1),rgb(0,0,1,0.5)),pch=16)
N-gram analysis
# sample 20% of each corpus for speed, then plot the top 20 tri-, bi- and unigrams
set.seed(02052025)
par(mfrow=c(1,3))
sampleblogs = sample(blogs,lengths$blogs/5)
blogs3 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(blogs3),horiz=TRUE,las=1,cex.names =0.8)
blogs2 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(blogs2),horiz=TRUE,las=1,cex.names =0.8)
blogs1 = sort(table(unlist(tokenize_ngrams(sampleblogs,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(blogs1),horiz=TRUE,las=1,cex.names =0.8)
samplenews = sample(news,lengths$news/5)
news3 = sort(table(unlist(tokenize_ngrams(samplenews,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(news3),horiz=TRUE,las=1,cex.names =0.8)
news2 = sort(table(unlist(tokenize_ngrams(samplenews,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(news2),horiz=TRUE,las=1,cex.names =0.8)
news1 = sort(table(unlist(tokenize_ngrams(samplenews,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(news1),horiz=TRUE,las=1,cex.names =0.8)
sampletwitter = sample(twitter,lengths$twitter/5)
twitter3 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 3L))),decreasing = TRUE)[1:20]
barplot(rev(twitter3),horiz=TRUE,las=1,cex.names =0.8)
twitter2 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 2L))),decreasing = TRUE)[1:20]
barplot(rev(twitter2),horiz=TRUE,las=1,cex.names =0.8)
twitter1 = sort(table(unlist(tokenize_ngrams(sampletwitter,n = 1L))),decreasing = TRUE)[1:20]
barplot(rev(twitter1),horiz=TRUE,las=1,cex.names =0.8)
The top 20 n-grams of the three datasets share many collocations:
intersect(intersect(names(blogs3),names(news3)),names(twitter3))
## [1] "one of the" "a lot of" "to be a" "going to be"
intersect(intersect(names(blogs2),names(news2)),names(twitter2))
## [1] "of the" "in the" "to the" "on the" "to be" "for the" "at the"
## [8] "is a"
intersect(intersect(names(blogs1),names(news1)),names(twitter1))
## [1] "the" "and" "to" "a" "of" "i" "in" "that" "is" "it"
## [11] "for" "with" "on"
From the distributions, word counts in the twitter dataset cluster towards the upper end of their range before cutting off abruptly, which is likely a result of Twitter's character limit. Word counts for blogs appear the most varied, with the widest range (0 to 6726) and a marked gap between mean and median (41.75 vs 28), implying a skewed distribution. This may be due to the creative freedom of personal blogs, where no style is prescribed and free-form, stream-of-consciousness writing is possible.

Regardless of dataset, certain words frequently follow certain other words (collocation). These relations can be used to predict the next word, e.g. using trigrams to predict the word that follows an input bigram. A Markov chain can be used, suggesting the candidates with the highest probability of being the next word. To account for new collocations, we can compute probabilities from the given datasets, compute probabilities from user input treated as another dataset, and then aggregate the two with slightly more weight on the user input (to handle collocations the user employs that do not appear in the corpora), so that the model "learns" and applies new collocations.
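As a rough illustration of that approach (not a settled design), the sketch below builds trigram counts from the blogs sample used earlier, looks up the most frequent completions of the last two words of an input phrase, and combines corpus counts with hypothetical user-history counts. The names predict_next_word and combine_counts, the user-history table and the 1.5 weight are illustrative assumptions, not functions from stringi or tokenizers.

# minimal sketch of the trigram "Markov chain" predictor described above
trigram_counts = table(unlist(tokenize_ngrams(sampleblogs,n = 3L)))

predict_next_word = function(input,counts,k = 3){
  # the chain's state is the last two words of the (lower-cased) input
  tokens = unlist(strsplit(tolower(input),"\\s+"))
  state = paste(tail(tokens,2),collapse = " ")
  # candidate trigrams are those that start with the state bigram
  candidates = counts[startsWith(names(counts),paste0(state," "))]
  if(length(candidates) == 0) return(character(0))
  candidates = sort(candidates,decreasing = TRUE)[seq_len(min(k,length(candidates)))]
  # return the third word of each top candidate
  sub("^\\S+\\s+\\S+\\s+","",names(candidates))
}

# combine corpus counts with (hypothetical) user-history counts,
# giving the user history slightly more weight; w = 1.5 is an assumed, untuned value
combine_counts = function(corpus,user,w = 1.5){
  ngrams = union(names(corpus),names(user))
  out = setNames(numeric(length(ngrams)),ngrams)
  out[names(corpus)] = out[names(corpus)] + as.numeric(corpus)
  out[names(user)] = out[names(user)] + w*as.numeric(user)
  sort(out,decreasing = TRUE)
}

predict_next_word("one of the",trigram_counts)

A fuller version would pool all three corpora and fall back to the bigram and unigram tables when a trigram has not been seen, but the lookup would have the same shape.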