Loading all required packages

library(tm)
library(RWeka)
library(stringi)
library(stringr)
library(ggplot2)
library(R.utils)
library(knitr)
library(dplyr)
library(wordcloud)

Getting Data (English data set)

blogs.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.blogs.txt"
news.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.news.txt"
twitter.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.twitter.txt"

blogs.size <- file.info(blogs.file)$size / (1024*1024)
blogs.length <- countLines(blogs.file)
blogs.words <- sum(stri_count_words(readLines(blogs.file, encoding = "UTF-8", skipNul = TRUE)))

news.size <- file.info(news.file)$size / (1024*1024)
news.length <- countLines(news.file)
news.words <- sum(stri_count_words(readLines(news.file, encoding = "UTF-8", skipNul = TRUE)))

twitter.size <- file.info(twitter.file)$size / (1024*1024)
twitter.length <- countLines(twitter.file)
twitter.words <- sum(stri_count_words(readLines(twitter.file, encoding = "UTF-8", skipNul = TRUE)))
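
The three blocks above repeat the same steps per file. As a hedged refactoring sketch, they could be folded into one helper; `file_stats` is an illustrative name, not part of the original script, and `length()` on the read lines stands in for `countLines()` (the two can differ by one if the file lacks a final newline).

# Illustrative helper (not in the original script): one read per file,
# returning size in MB, line count, and word count.
file_stats <- function(path) {
    lines <- readLines(path, encoding = "UTF-8", skipNul = TRUE)
    c(size   = file.info(path)$size / (1024 * 1024),
      length = length(lines),
      words  = sum(stri_count_words(lines)))
}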

Below are summary statistics of the data by source (file size in MB).

df <- data.frame(source = c("blogs", "news", "twitter"),
           size = c(blogs.size, news.size, twitter.size),
           length = c(blogs.length, news.length, twitter.length),
           words = c(blogs.words, news.words, twitter.words)
          )

kable(x = df, col.names = c("Source", "Size (MB)", "Line Count", "Word Count"))

Source     Size (MB)   Line Count   Word Count
blogs       200.4242       899288     37546246
news        196.2775      1010242      2674536
twitter     159.3641      2360148     30093410

Data Sampling

As the summary above shows, the files are large, so sampling is a practical way to explore the data: it reduces processing time and the demands on the user's computing resources.

blogs <- readLines("E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

set.seed(1)
blogsSample <- sample(blogs, round(length(blogs) * 0.01))
newsSample <- sample(news, round(length(news) * 0.01))
twitterSample <- sample(twitter, round(length(twitter) * 0.01))
# drop non-ASCII characters (e.g. emoji), which are common in tweets
twitterSample <- sapply(twitterSample,
                        function(row) iconv(row, "latin1", "ASCII", sub = ""))

text_sample <- c(blogsSample, newsSample, twitterSample)
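
If holding all three full files in memory is a concern, an alternative sketch is to keep each line independently with probability 0.01 via rbinom(); `sample_lines` is an illustrative name, and the resulting sample size is about 1% of lines on average rather than exactly 1%.

# Illustrative alternative: Bernoulli sampling keeps each line with
# probability `rate`, so roughly 1% of lines survive on average.
sample_lines <- function(lines, rate = 0.01) {
    lines[rbinom(length(lines), size = 1, prob = rate) == 1]
}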

Cleaning the data

After sampling, the text is cleaned: separators such as slashes, pipes, and @ symbols are converted to spaces; the text is lower-cased; and numbers, punctuation, English stop words, and extra whitespace are removed.

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

preprocessCorpus <- function(corpus){
    corpus <- tm_map(corpus, toSpace, "/|@|\\|")            # replace /, @ and | with spaces
    corpus <- tm_map(corpus, content_transformer(tolower))  # lower-case everything
    corpus <- tm_map(corpus, removeNumbers)
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
    corpus <- tm_map(corpus, stripWhitespace)
    return(corpus)
}

# collapse a term-document matrix into a term-frequency table, sorted descending
freq_frame <- function(tdm){
    freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
    return(data.frame(word = names(freq), freq = freq))
}

# RWeka tokenizers for 2-, 3- and 4-grams
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4))

text_sample <- VCorpus(VectorSource(text_sample))
text_sample <- preprocessCorpus(text_sample)

tdm1a <- TermDocumentMatrix(text_sample)
tdm1 <- removeSparseTerms(tdm1a, 0.99)
freq1_frame <- freq_frame(tdm1)
freq1_top30 <- head(freq1_frame,30)

tdm2a <- TermDocumentMatrix(text_sample, control=list(tokenize=BigramTokenizer))
tdm2 <- removeSparseTerms(tdm2a, 0.999)
freq2_frame <- freq_frame(tdm2)
freq2_top30 <- head(freq2_frame,30)

tdm3a <- TermDocumentMatrix(text_sample, control=list(tokenize=TrigramTokenizer))
tdm3 <- removeSparseTerms(tdm3a, 0.9999)
freq3_frame <- freq_frame(tdm3)
freq3_top30 <- head(freq3_frame,30)

tdm4a <- TermDocumentMatrix(text_sample, control=list(tokenize=QuadgramTokenizer))
tdm4 <- removeSparseTerms(tdm4a, 0.9999)
freq4_frame <- freq_frame(tdm4)
freq4_top30 <- head(freq4_frame,30)

Exploratory Data Analysis

Plots are constructed to visualise the 30 most common terms under each n-gram model.

ggplot(freq1_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
    geom_bar(stat="identity") +
    theme_bw() +
    coord_flip() +
    theme(axis.title.y = element_blank()) +
    labs(y="Frequency", title="Top 30 Common Unigrams")

ggplot(freq2_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
    geom_bar(stat="identity") +
    theme_bw() +
    coord_flip() +
    theme(axis.title.y = element_blank()) +
    labs(y="Frequency", title="Top 30 Common Bigrams")

ggplot(freq3_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
    geom_bar(stat="identity") +
    theme_bw() +
    coord_flip() +
    theme(axis.title.y = element_blank()) +
    labs(y="Frequency", title="Top 30 Common Trigrams")

ggplot(freq4_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
    geom_bar(stat="identity") +
    theme_bw() +
    coord_flip() +
    theme(axis.title.y = element_blank()) +
    labs(y="Frequency", title="Top 30 Common Quadgrams")

In addition, word clouds show the 10 most common terms for each n-gram model.

Top 10 Unigrams

wordcloud(freq1_top30$word, freq1_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)

Top 10 Bigrams

wordcloud(freq2_top30$word, freq2_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)

Top 10 Trigrams

wordcloud(freq3_top30$word, freq3_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)

Top 10 Quadgrams

wordcloud(freq4_top30$word, freq4_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)

Creating a prediction algorithm and Shiny app

The predictive model will be based on the n-gram frequencies computed above. Further cleaning, such as removing profane words, will be considered for the final model; both ideas are sketched below.
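
For the profanity step, one option is to reuse removeWords inside preprocessCorpus with an external word list. This is only a sketch: "profanity.txt" is a placeholder file name, not a resource from this project.

# Hedged sketch: "profanity.txt" is a placeholder for an external word list.
profanity <- readLines("profanity.txt", encoding = "UTF-8", skipNul = TRUE)
corpus <- tm_map(corpus, removeWords, profanity)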
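
For the model itself, the sketch below illustrates one way to use the frequency frames built earlier: a simple "stupid backoff" lookup that tries the quadgram table first, falls back to trigrams and bigrams, and finally to the most frequent unigrams. predict_next() and its internals are illustrative names only, not the final model.

# Illustrative stupid-backoff sketch (not the final model).
# NOTE: for real use, the input phrase should get the same cleaning
# (lower-casing, stop-word removal, etc.) as the corpus did.
predict_next <- function(phrase, n = 3) {
    tokens <- unlist(strsplit(tolower(phrase), "\\s+"))
    tables <- list(freq4_frame, freq3_frame, freq2_frame)
    context_sizes <- c(3, 2, 1)   # words of context each table conditions on
    for (i in seq_along(tables)) {
        k <- context_sizes[i]
        if (length(tokens) < k) next
        context <- paste(tail(tokens, k), collapse = " ")
        grams <- as.character(tables[[i]]$word)
        hits <- grams[startsWith(grams, paste0(context, " "))]
        if (length(hits) > 0) {
            # rows are already sorted by frequency, so the first hits are best;
            # the last word of each matching n-gram is a candidate prediction
            return(head(unique(sub(".*\\s", "", hits)), n))
        }
    }
    head(as.character(freq1_frame$word), n)   # fall back to top unigrams
}

predict_next("thanks for the")   # returns up to 3 candidate next words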

For the Shiny app, the plan is to provide a text input box and an option for how many predicted next words the user wants returned.
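
A minimal sketch of that interface follows, assuming a predict_next() function like the one sketched above; all widget IDs and labels are placeholders.

# Minimal Shiny sketch: a text box plus a control for the number of
# suggestions, wired to the illustrative predict_next() above.
library(shiny)

ui <- fluidPage(
    titlePanel("Next-Word Prediction"),
    textInput("phrase", "Enter a phrase:"),
    numericInput("n", "How many next-word suggestions?", value = 3, min = 1, max = 10),
    verbatimTextOutput("predictions")
)

server <- function(input, output) {
    output$predictions <- renderPrint({
        req(input$phrase)                       # wait for non-empty input
        predict_next(input$phrase, n = input$n)
    })
}

shinyApp(ui, server)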