Load the data into memory and compute some summary statistics on the three datasets

# Each corpus is handled start to finish in turn: create the connection,
# open it, read every line into a character vector, then close it.
# skipNul = TRUE makes readLines() skip embedded NUL bytes.

# Twitter corpus
twitter_conn <- file('data/en_US/en_US.twitter.txt')
open(twitter_conn)
twitter_data <- readLines(con = twitter_conn, skipNul = TRUE)
close(twitter_conn)

# Blogs corpus
blogs_conn <- file('data/en_US/en_US.blogs.txt')
open(blogs_conn)
blogs_data <- readLines(con = blogs_conn, skipNul = TRUE)
close(blogs_conn)

# News corpus
news_conn <- file('data/en_US/en_US.news.txt')
open(news_conn)
news_data <- readLines(con = news_conn, skipNul = TRUE)
close(news_conn)

# Line count and longest-line length (in characters) for each corpus.
# nchar() is vectorised, so it is applied to the whole character vector at
# once rather than element by element through sapply().
length(twitter_data)
## [1] 2360148
max(nchar(twitter_data))
## [1] 140
length(blogs_data)
## [1] 899288
max(nchar(blogs_data))
## [1] 40833
length(news_data)
## [1] 1010242
max(nchar(news_data))
## [1] 11384
# Ratio of tweets containing 'love' to tweets containing 'hate'
# (case-sensitive substring match on each line).
sum(grepl('love', twitter_data)) / sum(grepl('hate', twitter_data))
## [1] 4.108592
# First few tweets that mention 'biostats'.
head(twitter_data[grepl('biostats', twitter_data)])
## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"
# Number of tweets containing this exact sentence.
sum(grepl('A computer once beat me at chess, but it was no match for me at kickboxing', twitter_data))
## [1] 3

Normalize the text in the datasets

The first step is to normalize the words in the datasets:
* Transform words to lowercase
* Remove all punctuation (the code keeps only the letters a-z and spaces, so digits are removed as well)
* Treat spaces as the only word separator

#' Normalize raw text lines into vectors of lowercase word tokens.
#'
#' Every element of `paragraph` is lower-cased, stripped of each character
#' that is not one of the ASCII letters a-z or a space, and then split on
#' spaces. Empty tokens (produced by consecutive, leading or trailing
#' spaces) are discarded.
#'
#' @param paragraph A character vector with one sentence/line per element.
#' @return An unnamed list holding one character vector of words per element
#'   of `paragraph`. An `NA` element yields an empty word vector. Input that
#'   is not a character vector yields an empty list.
normalize_paragraph <- function(paragraph) {
    # Non-character input never contributed any tokens; keep that contract.
    if (!is.character(paragraph)) {
        return(list())
    }

    lowered <- tolower(paragraph)

    # A single regex pass replaces the per-character filter. Comparing
    # strings with >= / <= follows the locale's collation order, so which
    # characters fall between 'a' and 'z' could vary between machines; with
    # perl = TRUE the a-z range is expected to be matched by code point.
    letters_only <- gsub('[^a-z ]', '', lowered, perl = TRUE)

    # unname() keeps the result unnamed even when `paragraph` carries names.
    tokens <- strsplit(unname(letters_only), ' ', fixed = TRUE)

    # Drop empty strings, and NA so a missing line cannot become a word.
    lapply(tokens, function(words) words[!is.na(words) & nzchar(words)])
}
# Replace each raw corpus with its normalized form, i.e. the list returned by
# normalize_paragraph() for that corpus.
twitter_data <- normalize_paragraph(paragraph = twitter_data)
blogs_data <- normalize_paragraph(paragraph = blogs_data)
news_data <- normalize_paragraph(paragraph = news_data)

One more optional step: I consider the text from these three datasets to play the same role, so I will merge the datasets into one.

# Concatenate the three normalized corpora in the order blogs, news, twitter.
combined_paragraph <- c(blogs_data, news_data, twitter_data)
# rm(blogs_data, news_data, twitter_data)

Exploratory analysis

# Number of lines in the merged corpus.
length(combined_paragraph)
## [1] 4269678
# Total number of words. lengths() gives the per-line word counts as an
# integer vector whatever the input size, whereas sapply(x, length) returns
# a list for empty input.
sum(lengths(combined_paragraph))
## [1] 99639414
# Average number of words per line.
mean(lengths(combined_paragraph))
## [1] 23.33652

Count single-word occurrences

library(hash)
## hash-2.2.6.1 provided by Decision Patterns
# Global word -> occurrence-count table that increase_count() updates.
dict <- hash()

# Add one to the count stored in `dict` under `word`, creating the entry with
# a count of 1 the first time the key is seen.
#
# word: a single word, or a character vector with several words; the latter
#       is joined into one space-separated key so that the key lookup below
#       always receives exactly one string.
#
# Called for its side effect on the global `dict`.
increase_count <- function(word) {
    if (length(word) > 1) {
        # collapse = ' ' merges the elements into ONE string. Without
        # collapse, paste() returns a vector as long as `word`, and the
        # if () below would be handed a multi-element condition.
        word <- paste(word, collapse = ' ')
    }
    if (has.key(word, dict)) {
        dict[[word]] <<- dict[[word]] + 1
    } else {
        dict[[word]] <<- 1
    }
}
# Tally every word of every line of the merged corpus into `dict`.
for (sentence in combined_paragraph) {
    for (word in sentence) increase_count(word)
}

# Flatten the hash into a named vector of counts, most frequent word first,
# and show the first six entries.
one_gram_distribution <- sort(unlist(as.list(dict)), decreasing = TRUE)
head(one_gram_distribution, n = 6L)
##     the      to     and       a      of      in 
## 4761104 2753596 2404075 2381066 2005576 1644687
library(ggplot2)
# Take the 20 most frequent words (one_gram_distribution is already sorted in
# decreasing order). head() returns fewer entries when fewer exist, whereas
# indexing with [1:20] would pad the result with NA.
first_20_words <- head(one_gram_distribution, 20)

# The factor levels follow the frequency order so the bars are drawn from the
# most to the least frequent word instead of alphabetically.
first_20_words <- data.frame(
    word = factor(names(first_20_words), levels = names(first_20_words)),
    occurrences = unname(first_20_words)
)

# Bar chart of the raw counts, one bar (and fill colour) per word.
ggplot(first_20_words, aes(x = word, y = occurrences, fill = word)) +
    geom_bar(stat = "identity")

Some inferences and conclusions