library(tm)
library(RWeka)
library(stringi)
library(stringr)
library(ggplot2)
library(R.utils)
library(knitr)
library(dplyr)
library(wordcloud)
blogs.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.blogs.txt"
news.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.news.txt"
twitter.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.twitter.txt"
# Basic statistics for each source: file size (MB), line count, word count
blogs.size <- file.info(blogs.file)$size / (1024*1024)
blogs.length <- countLines(blogs.file)
blogs.words <- sum(stri_count_words(readLines(blogs.file, encoding = "UTF-8", skipNul = TRUE)))
news.size <- file.info(news.file)$size / (1024*1024)
news.length <- countLines(news.file)
news.words <- sum(stri_count_words(readLines(news.file, encoding = "UTF-8", skipNul = TRUE)))
twitter.size <- file.info(twitter.file)$size / (1024*1024)
twitter.length <- countLines(twitter.file)
twitter.words <- sum(stri_count_words(readLines(twitter.file, encoding = "UTF-8", skipNul = TRUE)))
Below are summary statistics of the data by source.
df <- data.frame(source = c("blogs", "news", "twitter"),
size = c(blogs.size, news.size, twitter.size),
length = c(blogs.length, news.length, twitter.length),
words = c(blogs.words, news.words, twitter.words)
)
kable(x=df, col.names=c("Source","Size (MB)","Line Count","Word Count"))
Source | Size (MB) | Line Count | Word Count |
---|---|---|---|
blogs | 200.4242 | 899288 | 37546246 |
news | 196.2775 | 1010242 | 2674536 |
twitter | 159.3641 | 2360148 | 30093410 |
As seen above, the data files are large, so a sample is used for the exploratory analysis; this reduces processing time and the demand on the user's computing resources.
blogs <- readLines(blogs.file, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news.file, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter.file, encoding = "UTF-8", skipNul = TRUE)
# Take a reproducible 1% sample of each source
set.seed(1)
blogsSample <- sample(blogs, length(blogs)*0.01)
newsSample <- sample(news, length(news)*0.01)
twitterSample <- sample(twitter, length(twitter)*0.01)
# Strip non-ASCII characters (e.g. emoji) from the Twitter sample
twitterSample <- sapply(twitterSample,
                        function(row) iconv(row, "latin1", "ASCII", sub = ""))
# Combine the three samples into a single text vector
text_sample <- c(blogsSample, newsSample, twitterSample)
After sampling the data, it is cleaned up as follows.
# Transformer that replaces matches of a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Standard cleaning: strip separators, lower-case, and remove numbers,
# punctuation, English stop words, and extra whitespace
preprocessCorpus <- function(corpus){
  corpus <- tm_map(corpus, toSpace, "/|@|\\|")
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
# Build a term/frequency data frame from a term-document matrix,
# sorted by decreasing frequency
freq_frame <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  freq_frame <- data.frame(word = names(freq), freq = freq)
  return(freq_frame)
}
# RWeka tokenizers for bigrams, trigrams, and quadgrams
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
# Build the corpus from the combined sample and clean it
text_sample <- VCorpus(VectorSource(text_sample))
text_sample <- preprocessCorpus(text_sample)
# Unigrams
tdm1a <- TermDocumentMatrix(text_sample)
tdm1 <- removeSparseTerms(tdm1a, 0.99)
freq1_frame <- freq_frame(tdm1)
freq1_top30 <- head(freq1_frame,30)
# Bigrams
tdm2a <- TermDocumentMatrix(text_sample, control=list(tokenize=BigramTokenizer))
tdm2 <- removeSparseTerms(tdm2a, 0.999)
freq2_frame <- freq_frame(tdm2)
freq2_top30 <- head(freq2_frame,30)
# Trigrams
tdm3a <- TermDocumentMatrix(text_sample, control=list(tokenize=TrigramTokenizer))
tdm3 <- removeSparseTerms(tdm3a, 0.9999)
freq3_frame <- freq_frame(tdm3)
freq3_top30 <- head(freq3_frame,30)
# Quadgrams
tdm4a <- TermDocumentMatrix(text_sample, control=list(tokenize=QuadgramTokenizer))
tdm4 <- removeSparseTerms(tdm4a, 0.9999)
freq4_frame <- freq_frame(tdm4)
freq4_top30 <- head(freq4_frame,30)
Plots are constructed to visualise the top 30 most common terms for each n-gram model.
ggplot(freq1_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
geom_bar(stat="identity") +
theme_bw() +
coord_flip() +
theme(axis.title.y = element_blank()) +
labs(y="Frequency", title="Top 30 Common Unigrams")
ggplot(freq2_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
geom_bar(stat="identity") +
theme_bw() +
coord_flip() +
theme(axis.title.y = element_blank()) +
labs(y="Frequency", title="Top 30 Common Bigrams")
ggplot(freq3_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
geom_bar(stat="identity") +
theme_bw() +
coord_flip() +
theme(axis.title.y = element_blank()) +
labs(y="Frequency", title="Top 30 Common Trigrams")
ggplot(freq4_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
geom_bar(stat="identity") +
theme_bw() +
coord_flip() +
theme(axis.title.y = element_blank()) +
labs(y="Frequency", title="Top 30 Common Quadgrams")
In addition, word clouds are produced to show the top 10 most common terms for each n-gram model.
Top 10 Unigrams
wordcloud(freq1_top30$word, freq1_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)
Top 10 Bigrams
wordcloud(freq2_top30$word, freq2_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)
Top 10 Trigrams
wordcloud(freq3_top30$word, freq3_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)
Top 10 Quadgrams
wordcloud(freq4_top30$word, freq4_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)
The predictive model will be based on the n-gram model. Further cleaning, such as removing profane words, will be considered for the final model.
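As an illustration of the planned approach (a rough sketch, not the final model), the function below does a simple frequency-based lookup against the trigram and bigram tables built above, backing off from trigrams to bigrams, and filters candidates against a placeholder profanity list. The name predict_next and the profanity vector are illustrative assumptions. Note that because stop words were removed during the exploratory cleaning, lookups on phrases containing stop words will find few matches; the final model may keep stop words.
# Placeholder profanity list; a real list would be loaded from a reference file
profanity <- c("profane1", "profane2")
# Sketch: suggest up to n next words by matching the end of the phrase against
# the trigram table, backing off to the bigram table when nothing matches
predict_next <- function(phrase, n = 3) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  if (length(words) == 0) return(character(0))
  # Trigram lookup: trigrams whose first two words match the end of the phrase
  if (length(words) == 2) {
    tri  <- as.character(freq3_frame$word)   # already sorted by decreasing frequency
    hits <- tri[startsWith(tri, paste0(words[1], " ", words[2], " "))]
    cand <- setdiff(sapply(strsplit(hits, " "), `[`, 3), profanity)
    if (length(cand) > 0) return(head(cand, n))
  }
  # Back off to bigrams starting with the last word
  bi   <- as.character(freq2_frame$word)
  hits <- bi[startsWith(bi, paste0(tail(words, 1), " "))]
  if (length(hits) == 0) return(character(0))
  cand <- setdiff(sapply(strsplit(hits, " "), `[`, 2), profanity)
  head(cand, n)
}
predict_next("new york")   # candidate next words; results depend on the 1% sample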
For the Shiny app, my plan is to include a text input box and an option for how many predicted next words the user wants as output.
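To make this concrete, below is a minimal sketch of such an interface; it assumes the illustrative predict_next() function from the previous sketch, and the widget names and layout are placeholders rather than the final design.
library(shiny)
# Minimal sketch: a text box for the phrase and a control for how many
# suggestions to return, displayed as a small table
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  numericInput("n_words", "Number of suggested next words:", value = 3, min = 1, max = 10),
  tableOutput("predictions")
)
server <- function(input, output) {
  output$predictions <- renderTable({
    req(input$phrase)
    data.frame(suggestion = predict_next(input$phrase, input$n_words))
  })
}
shinyApp(ui = ui, server = server)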