The goal of this report is to perform an exploratory analysis of the text data. Only the en_US files (blogs, news, and Twitter) are examined.
# Read every line of a text file, marking strings as UTF-8 and skipping
# embedded NUL bytes.
#
# The connection is opened in binary mode ("rb") instead of text mode ("r").
# The earlier text-mode read of the news file warned about an "incomplete
# final line" and yielded far fewer lines than the other two files, which is
# the usual symptom of a text-mode read stopping at an embedded control
# character (0x1A is treated as end-of-file in text mode on Windows).
# NOTE(review): re-run and confirm that the news line count changes.
#
# path: path to the text file.
# Returns a character vector with one element per line.
read_corpus <- function(path) {
  connection <- file(path, open = "rb")
  # Close the connection even if readLines() fails part-way through.
  on.exit(close(connection), add = TRUE)
  readLines(connection, encoding = "UTF-8", skipNul = TRUE)
}

blogs_path <- "final/en_US/en_US.blogs.txt"
blogs <- read_corpus(blogs_path)
news_path <- "final/en_US/en_US.news.txt"
news <- read_corpus(news_path)
twitter_path <- "final/en_US/en_US.twitter.txt"
twitter <- read_corpus(twitter_path)
The number of lines in each of the three files (blogs, news, and Twitter):
# Print how many lines were read from each file, in the order
# blogs, news, twitter. invisible() hides the list that lapply() returns so
# only the three print() calls produce output.
invisible(lapply(
  list(blogs, news, twitter),
  function(corpus) print(length(corpus))
))
## [1] 899288
## [1] 77259
## [1] 2360148
The total number of words in each of the three files:
# Count the whitespace-separated tokens across all elements of a character
# vector.
#
# strsplit() returns a list with one element per input line, so taking
# length() of that list gives the number of lines, not the number of words.
# lengths() gives the token count of each line and sum() adds them up.
# The "\\s+" separator matches the per-line word counts used for the boxplot.
#
# lines: character vector, one element per line of text.
# Returns the total number of tokens as a single integer.
count_words <- function(lines) {
  sum(lengths(strsplit(lines, "\\s+")))
}

print(count_words(blogs))
print(count_words(news))
print(count_words(twitter))
## NOTE: the output previously recorded here (899288, 77259, 2360148) was the
## number of lines in each file, not the number of words; re-run this chunk
## to regenerate the word counts.
Next, we draw boxplots of the number of words per line. For this part we use only the first 5000 lines of each file.
# Number of leading lines kept from each file for the rest of the analysis.
sample_size <- 5000L

# head() returns at most sample_size lines; unlike x[1:5000] it does not pad
# the result with NA when a file has fewer lines than that.
blogs <- head(blogs, sample_size)
news <- head(news, sample_size)
twitter <- head(twitter, sample_size)

# Number of whitespace-separated tokens in each line.
# strsplit() is vectorised over lines and lengths() counts the tokens of each
# list element, so no per-line sapply() is needed (sapply() would also name
# every result with the full text of its line).
#
# lines: character vector, one element per line of text.
# Returns an integer vector the same length as `lines`.
count_words_per_line <- function(lines) {
  lengths(strsplit(lines, "\\s+"))
}

words_per_line_blogs <- count_words_per_line(blogs)
words_per_line_news <- count_words_per_line(news)
words_per_line_twitter <- count_words_per_line(twitter)

# One box per source, labelled in the same order as the data arguments.
boxplot(words_per_line_blogs, words_per_line_news, words_per_line_twitter,
        main = "Words per line", xlab = "Source", ylab = "Words per line",
        names = c("Blogs", "News", "Twitter"))
The most frequently used words (in the first 5000 lines of each file) were:
# Step 1: join the lines of each source into one long string.
all_text_blogs <- paste(blogs, collapse = " ")
all_text_news <- paste(news, collapse = " ")
all_text_twitter <- paste(twitter, collapse = " ")

# Step 2: lower-case the text and split it on runs of non-word characters.
# Each input is a single string, so the word vector is the first (and only)
# element of the list strsplit() returns.
words_blogs <- strsplit(tolower(all_text_blogs), split = "\\W+")[[1]]
words_news <- strsplit(tolower(all_text_news), split = "\\W+")[[1]]
words_twitter <- strsplit(tolower(all_text_twitter), split = "\\W+")[[1]]

# Step 3: tabulate word frequencies and keep the five most frequent words.
# The table is built from the named variable so that the printed header
# shows that variable's name.
top_blog_words <- head(sort(table(words_blogs), decreasing = TRUE), n = 5)
top_news_words <- head(sort(table(words_news), decreasing = TRUE), n = 5)
top_twitter_words <- head(sort(table(words_twitter), decreasing = TRUE), n = 5)

print(top_blog_words)
## words_blogs
## the and to i a
## 10282 6035 5818 5146 4987
print(top_news_words)
## words_news
## the to a and of
## 9658 4504 4418 4377 3918
print(top_twitter_words)
## words_twitter
## the i to a you
## 1968 1893 1608 1303 1246