Checking The Document Details

File sizes in MB

file.info("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")$size / (1024)^2
## [1] 159.3641
file.info("Coursera-SwiftKey/final/en_US/en_US.news.txt")$size / (1024)^2
## [1] 196.2775
file.info("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")$size / (1024)^2
## [1] 200.4242
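
file.info() is vectorised, so if preferred the three checks above can be collapsed into a single call; a minimal sketch using the same paths:

files <- file.path("Coursera-SwiftKey/final/en_US",
                   c("en_US.twitter.txt", "en_US.news.txt", "en_US.blogs.txt"))
# sizes in MB, one value per file
file.info(files)$size / (1024)^2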

Files’ line counts

suppressPackageStartupMessages(library(R.utils))
countLines("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")[1]
## [1] 2360148
countLines("Coursera-SwiftKey/final/en_US/en_US.news.txt")[1]
## [1] 1010242
countLines("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")[1]
## [1] 899288

Files’ longest line lengths

Note that wc -L reports the length of the longest line in each file, not a word count; the Twitter maximum of 140 characters matches the old tweet limit.

system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.twitter.txt", stdout=TRUE)
## [1] "140 Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.news.txt", stdout=TRUE)
## [1] "11384 Coursera-SwiftKey/final/en_US/en_US.news.txt"
system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.blogs.txt", stdout=TRUE)
## [1] "40832 Coursera-SwiftKey/final/en_US/en_US.blogs.txt"

Sampling of The Data Set

This is an important step: the full data set is very large, so working with a random sample makes processing considerably faster while keeping the results roughly representative of the whole.

# read the full files; skipNul = TRUE avoids warnings from embedded nul characters
twitter_data <- readLines('Coursera-SwiftKey/final/en_US/en_US.twitter.txt', encoding = 'UTF-8', skipNul = TRUE)
news_data <- readLines('Coursera-SwiftKey/final/en_US/en_US.news.txt', encoding = 'UTF-8', skipNul = TRUE)
blogs_data <- readLines('Coursera-SwiftKey/final/en_US/en_US.blogs.txt', encoding = 'UTF-8', skipNul = TRUE)
set.seed(7777)
sTwitter <- twitter_data[sample(1:length(twitter_data),10000)]
sNews <- news_data[sample(1:length(news_data),10000)]
sBlogs <- blogs_data[sample(1:length(blogs_data),10000)]
sData <- c(sTwitter,sNews,sBlogs)
if (!dir.exists("sData")){
    dir.create("sData")
}
writeLines(sData, "./sData/sData.txt")
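
An alternative is a line-wise coin flip, which keeps roughly a chosen fraction of each file instead of a fixed count; a sketch, assuming a 1% keep rate:

# keep each line independently with probability p
p <- 0.01
keep <- as.logical(rbinom(length(twitter_data), size = 1, prob = p))
sTwitter_alt <- twitter_data[keep]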

# remove temporary variables
rm(twitter_data,news_data,blogs_data,sTwitter,sNews,sBlogs,sData)

Cleaning the written sample data

To clean the data we use the tm package and the transformations below. (Use version 0.6-2 of the tm package; version 0.7 appears to have a bug and does not work properly here.)

suppressPackageStartupMessages(library(tm))
suppressPackageStartupMessages(library(SnowballC))
document <- Corpus(DirSource(file.path(".", "sData")))
# convert everything to lower case
document <- tm_map(document, content_transformer(tolower))
# replace "/", "@" and "|" with spaces
document <- tm_map(document, content_transformer(function(x, pattern) gsub(pattern, " ", x)), "/|@|\\|")
document <- tm_map(document, removePunctuation)
document <- tm_map(document, removeNumbers)
document <- tm_map(document, stripWhitespace)
document <- tm_map(document, removeWords, stopwords("english"))
# reduce words to their stems
document <- tm_map(document, stemDocument)
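
Depending on the source material, it can also help to strip URLs and Twitter handles; a sketch that could be added to the pipeline above (the regular expressions are illustrative, not exhaustive):

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# drop URLs and @handles (illustrative patterns)
document <- tm_map(document, toSpace, "http[^[:space:]]*")
document <- tm_map(document, toSpace, "@[[:alnum:]_]*")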

Producing N-Grams

suppressPackageStartupMessages(library(RWeka))
UnigramTokenizer <- function(x){
    NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
uni_tok <- TermDocumentMatrix(document, control = list(tokenize = UnigramTokenizer))
BigramTokenizer  <- function(x){
    NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
bi_tok <- TermDocumentMatrix(document, control = list(tokenize = BigramTokenizer))
TrigramTokenizer <- function(x){
    NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
tri_tok <- TermDocumentMatrix(document, control = list(tokenize = TrigramTokenizer))
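
Since the three tokenizers differ only in n, a small factory function could generate all of them; a sketch equivalent to the definitions above:

# returns a tokenizer for n-grams of a given order
make_tokenizer <- function(n) {
    function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}
# e.g. bi_tok is equivalent to:
# TermDocumentMatrix(document, control = list(tokenize = make_tokenizer(2)))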

Exploratory Data Analysis

Below we inspect the six most frequent terms for each n-gram order and plot the highest-frequency terms in each case using the ggplot2 package.

In the Unigrams

uni_freq <- sort(rowSums(as.matrix(uni_tok)), decreasing = TRUE)
uni_freq_word <- data.frame(word = names(uni_freq),freq = uni_freq)
head(uni_freq_word)
##      word freq
## will will 2978
## said said 2952
## one   one 2694
## like like 2356
## just just 2249
## get   get 2239
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(dplyr))
uni_freq_word %>% 
    filter(freq >= 1000) %>%
    ggplot(aes(word, freq)) +
    geom_bar(stat = "identity") +
    ggtitle("Unigrams with a frequency of more than and equal to 1000") +
    xlab("Unigrams") + ylab("Frequencies") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
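
tm also provides findFreqTerms() for a quick look at terms above a frequency threshold, without building the sorted data frame first; for example:

# unigrams appearing at least 1000 times
findFreqTerms(uni_tok, lowfreq = 1000)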

In The Bigrams

bi_freq <- sort(rowSums(as.matrix(bi_tok)), decreasing = TRUE)
bi_freq_word <- data.frame(words = names(bi_freq), freq = bi_freq)
head(bi_freq_word)
##               words freq
## last year last year  193
## new york   new york  186
## right now right now  172
## look like look like  168
## year ago   year ago  154
## last week last week  142
bi_freq_word %>% 
    filter(freq >= 100) %>%
    ggplot(aes(words, freq)) +
    geom_bar(stat = "identity") +
    ggtitle("Bigrams with a frequency of more than and equal to 100") +
    xlab("Unigrams") + ylab("Frequencies") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

In the Trigrams

tri_freq <- sort(rowSums(as.matrix(tri_tok)), decreasing = TRUE)
tri_freq_word <- data.frame(words = names(tri_freq), freq = tri_freq)
head(tri_freq_word)
##                                   words freq
## ha ha ha                       ha ha ha   42
## presid barack obama presid barack obama   25
## new york citi             new york citi   23
## cant wait see             cant wait see   19
## happi new year           happi new year   14
## new york time             new york time   14
tri_freq_word %>% 
    filter(freq >= 10) %>%
    ggplot(aes(words, freq)) +
    geom_bar(stat = "identity") +
    ggtitle("Trigrams with a frequency of more than and equal to 10") +
    xlab("Unigrams") + ylab("Frequencies") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

Word Clouds

Top 100 Unigrams

suppressPackageStartupMessages(library(wordcloud))
set.seed(7777)
wordcloud(uni_freq_word$word, uni_freq_word$freq, max.words = 100, random.order = FALSE, scale = c(5, 0.1), colors = brewer.pal(6, "Dark2"))

Top 100 Bigrams

set.seed(7777)
wordcloud(bi_freq_word$words, bi_freq_word$freq, max.words = 100, random.order = FALSE, scale = c(5, 0.1), colors = brewer.pal(6, "Dark2"))

Top 150 Trigrams

set.seed(7777)
wordcloud(tri_freq_word$words, tri_freq_word$freq, max.words = 150, random.order = FALSE, scale = c(5, 0.1), colors = brewer.pal(6, "Dark2"))