Milestone report

Rasmus Klitgaard

January 2025

Goal of report

The goal of this report is to do some exploratory analysis on the data. We will only look at the en_US data.

Loading data

# Read every line of a UTF-8 text file into a character vector.
#
# path: path to the text file.
# Returns a character vector with one element per line.
#
# The connection is opened in binary mode ("rb") so that no platform-specific
# text-mode translation is applied to the bytes; readLines() still accepts
# LF, CRLF and CR line endings on such a connection. on.exit() guarantees the
# connection is closed even if readLines() fails.
read_corpus <- function(path) {
  connection <- file(path, open = "rb")
  on.exit(close(connection), add = TRUE)
  readLines(connection, encoding = "UTF-8", skipNul = TRUE)
}

blogs_path <- "final/en_US/en_US.blogs.txt"
blogs <- read_corpus(blogs_path)

news_path <- "final/en_US/en_US.news.txt"
news <- read_corpus(news_path)
# NOTE(review): with the previous text-mode read (open = "r") this step warned
# "incomplete final line found on 'final/en_US/en_US.news.txt'" and the news
# line count recorded in this report is much smaller than for the other
# sources. That looks like a text-mode read stopping early at an embedded
# control character (seen on Windows) - re-knit and confirm the counts.

twitter_path <- "final/en_US/en_US.twitter.txt"
twitter <- read_corpus(twitter_path)

The number of lines in each part of the data

# Number of lines read from each source (one vector element per line).
n_lines_blogs <- length(blogs)
print(n_lines_blogs)
## [1] 899288

n_lines_news <- length(news)
print(n_lines_news)
## [1] 77259

n_lines_twitter <- length(twitter)
print(n_lines_twitter)
## [1] 2360148

The number of words in each part of the data

# Total number of whitespace-separated words in a character vector of lines.
#
# strsplit() returns a list with one element per input line, so taking
# length() of that list only gives the number of lines. lengths() gives the
# number of tokens on each line; their sum is the word count. The "\\s+"
# separator matches the tokenisation used for the words-per-line boxplot.
count_words <- function(lines) {
  sum(lengths(strsplit(lines, "\\s+")))
}

print(count_words(blogs))
print(count_words(news))
print(count_words(twitter))
# NOTE(review): the output previously recorded here (899288, 77259, 2360148)
# was identical to the line counts because length(strsplit(x, " ")) counts
# lines, not words. Re-knit the report to record the real word counts.

Now we can plot boxplots of the number of words per line. From this point on we will only use the first 5000 lines of each source.

# Keep only the first 5000 lines of each source; everything after this point
# works on these samples. head() is used instead of [1:5000] so that a source
# with fewer than 5000 lines is not padded with NA entries.
sample_size <- 5000
blogs <- head(blogs, sample_size)
news <- head(news, sample_size)
twitter <- head(twitter, sample_size)

# Words per line: split each line on runs of whitespace and count the pieces.
# strsplit() is vectorised over the lines and lengths() returns a plain
# integer vector, whereas sapply() over a character vector would also attach
# every full line of text as an element name.
words_per_line_blogs <- lengths(strsplit(blogs, "\\s+"))
words_per_line_news <- lengths(strsplit(news, "\\s+"))
words_per_line_twitter <- lengths(strsplit(twitter, "\\s+"))

# One box per source, in the same order as the `names` labels.
boxplot(
  words_per_line_blogs, words_per_line_news, words_per_line_twitter,
  main = "Words per line",
  xlab = "Source",
  ylab = "Words per line",
  names = c("Blogs", "News", "Twitter")
)

The most used words (in the first 5000 lines of each source) were

# For each source: join the sampled lines into one string, lower-case it,
# split on runs of non-word characters, tabulate the tokens and keep the
# five most frequent ones.

# Blogs
all_text_blogs <- paste(blogs, collapse = " ")
words_blogs <- strsplit(tolower(all_text_blogs), "\\W+")[[1]]
blog_word_counts <- table(words_blogs)
top_blog_words <- head(sort(blog_word_counts, decreasing = TRUE), n = 5)

# News
all_text_news <- paste(news, collapse = " ")
words_news <- strsplit(tolower(all_text_news), "\\W+")[[1]]
news_word_counts <- table(words_news)
top_news_words <- head(sort(news_word_counts, decreasing = TRUE), n = 5)

# Twitter
all_text_twitter <- paste(twitter, collapse = " ")
words_twitter <- strsplit(tolower(all_text_twitter), "\\W+")[[1]]
twitter_word_counts <- table(words_twitter)
top_twitter_words <- head(sort(twitter_word_counts, decreasing = TRUE), n = 5)

print(top_blog_words)
## words_blogs
##   the   and    to     i     a 
## 10282  6035  5818  5146  4987
print(top_news_words)
## words_news
##  the   to    a  and   of 
## 9658 4504 4418 4377 3918
print(top_twitter_words)
## words_twitter
##  the    i   to    a  you 
## 1968 1893 1608 1303 1246