Download and import the data set

First, load the necessary packages, and then download the data set.

library(stringi)
library(tm)
library(RWeka)
library(ggplot2)

fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
fileZIP <- "Coursera-SwiftKey.zip"

if(!file.exists(fileZIP)) {
        download.file(fileUrl, destfile = fileZIP)
        unzip(fileZIP)
}

blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

Basic summaries

Let’s make a simple summary of our data set: word, line, and character counts for each file.

##   File.Name Word.counts Line.counts Num.of.character
## 1   twitter    37570839     2360148        206824505
## 2     blogs     2651432      899288         15639408
## 3      news    30451170       77259        162096241
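
The code that produced this table is not shown above; a minimal sketch of how such a summary can be built with stringi (assuming the blogs, news, and twitter objects loaded earlier) looks like this:

# sketch: word, line, and character counts per file
corpora <- list(twitter = twitter, blogs = blogs, news = news)
data.frame(File.Name = names(corpora),
           Word.counts = sapply(corpora, function(x) sum(stri_count_words(x))),
           Line.counts = sapply(corpora, length),
           Num.of.character = sapply(corpora, function(x) sum(nchar(x))),
           row.names = NULL)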

Data Preprocessing

A clean data set is needed for further analysis, so we pre-process the data.

# remove all non-English (non-ASCII) characters, as they cause issues in tokenization
blogs <- iconv(blogs, "latin1", "ASCII", sub = "")
news <- iconv(news, "latin1", "ASCII", sub = "")
twitter <- iconv(twitter, "latin1", "ASCII", sub = "")

# sample 1% of each source; set a seed so the sample is reproducible
set.seed(1234)
sample_data <- c(sample(twitter, round(length(twitter) * 0.01)),
                 sample(blogs, round(length(blogs) * 0.01)),
                 sample(news, round(length(news) * 0.01)))

# create corpus and clean the data
corpus <- VCorpus(VectorSource(sample_data))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
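
As an optional sanity check (a small sketch, not part of the pipeline above), we can print one cleaned document to confirm the transformations behaved as expected:

# print the content of the first cleaned document
writeLines(as.character(corpus[[1]]))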

N-gram Tokenization into Unigrams, Bigrams and Trigrams

To find the most frequently used words and phrases, we tokenize the corpus into n-grams, specifically unigrams, bigrams, and trigrams.

unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

unigram_tab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
bigram_tab <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
trigram_tab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
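
As a quick optional check, the dimensions of each term-document matrix show how many distinct n-grams (rows) were extracted from the sampled documents (columns):

# number of distinct n-grams (rows) and documents (columns)
dim(unigram_tab)
dim(bigram_tab)
dim(trigram_tab)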

Exploratory Data Analysis

At this stage, we prepare the frequency data for plotting and visualize the most common n-grams in the data set.

unigram_corpus <- findFreqTerms(unigram_tab, lowfreq = 1000)
unigram_corpusnum <- rowSums(as.matrix(unigram_tab[unigram_corpus, ]))
unigram_corpustab <- data.frame(Word = names(unigram_corpusnum), frequency = unigram_corpusnum)
unigram_corpussort <- unigram_corpustab[order(-unigram_corpustab$frequency), ]

ggplot(unigram_corpussort, aes(x = reorder(Word, frequency), y = frequency)) +
        geom_bar(stat = "identity") +
        labs(title = "Unigrams", x = "Unigram", y = "Frequency") +
        coord_flip()

bigram_corpus <- findFreqTerms(bigram_tab, lowfreq = 80)
bigram_corpusnum <- rowSums(as.matrix(bigram_tab[bigram_corpus, ]))
bigram_corpustab <- data.frame(Word = names(bigram_corpusnum), frequency = bigram_corpusnum)
bigram_corpussort <- bigram_corpustab[order(-bigram_corpustab$frequency), ]

ggplot(bigram_corpussort, aes(x = reorder(Word, frequency), y = frequency)) +
        geom_bar(stat = "identity") +
        labs(title = "Bigrams", x = "Bigram", y = "Frequency") +
        coord_flip()

trigram_corpus <- findFreqTerms(trigram_tab, lowfreq = 10)
trigram_corpusnum <- rowSums(as.matrix(trigram_tab[trigram_corpus, ]))
trigram_corpustab <- data.frame(Word = names(trigram_corpusnum), frequency = trigram_corpusnum)
trigram_corpussort <- trigram_corpustab[order(-trigram_corpustab$frequency), ]

ggplot(trigram_corpussort, aes(x = reorder(Word, frequency), y = frequency)) +
        geom_bar(stat = "identity") +
        labs(title = "Trigrams", x = "Trigram", y = "Frequency") +
        coord_flip()