Set Working Directory, Download the Data, and Unzip

We also assign these files to a specific directory that we will reference later.

setwd("C:/Users/gamartin/R/Coursera Swiftkey Capstone")
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
              "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip", exdir = "final")

Load the Libraries

library(stringi)
library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Understand the Scope

blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding =
## "UTF-8", skipNul = TRUE): incomplete final line found on 'final/en_US/
## en_US.news.txt'
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

blogs_dsize <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 / 1024
news_dsize <- file.info("final/en_US/en_US.news.txt")$size / 1024 / 1024
twitter_dsize <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 / 1024

blogs_msize<-object.size(blogs) / 1024 / 1024
news_msize<-object.size(news) / 1024 / 1024
twitter_msize<-object.size(twitter) / 1024 / 1024

blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
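
The warning above about an incomplete final line on en_US.news.txt is likely caused by an embedded control character being treated as end-of-file on Windows, which truncates the read. A possible workaround, shown here as a sketch and not used for the counts below, is to open the file through a binary connection:

# Sketch: read the news file via a binary connection so an embedded
# control character is not treated as end-of-file (workaround only)
con <- file("final/en_US/en_US.news.txt", open = "rb")
news_full <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)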

Data Frame

The data frame shows that we are working with a very large number of lines and words, which could make processing slow.

data.frame(source = c("blogs", "news", "twitter"),
           files_MB = c(blogs_dsize, news_dsize, twitter_dsize),
           in_memory_MB = c(blogs_msize, news_msize, twitter_msize),
           lines = c(length(blogs), length(news), length(twitter)),
           words_num = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
           mean_words_num = c(mean(blogs_words), mean(news_words), mean(twitter_words)))
##    source files_MB in_memory_MB   lines words_num mean_words_num
## 1   blogs 200.4242    248.49350  899288  37546246       41.75108
## 2    news 196.2775     19.17972   77259   2674536       34.61779
## 3 twitter 159.3641    301.39694 2360148  30093410       12.75065

Limit Data

We are going to limit the data to the first 5,000 lines of the combined corpus for our exploratory analysis and N-gram work.

data<-c(blogs,news,twitter)
data2 <- head(data, 5000)
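
Note that head() keeps only the first 5,000 lines, which all come from the blogs file. If a more representative subset were wanted, a random sample could be drawn instead; a sketch, with an arbitrary seed for reproducibility:

# Sketch: draw a random 5,000-line sample from the combined data instead of
# taking the first 5,000 lines (seed chosen arbitrarily)
set.seed(1234)
data2 <- data[sample(length(data), 5000)]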

Create Corpus and Clean

From our 5,000-line subset we create and clean the corpus. A corpus is a collection of documents containing (natural language) text, and it is the structure the tm package works with for analysis.

vector_doc <- VectorSource(data2)
corpus <- VCorpus(vector_doc)


# Force everything to valid UTF-8, replacing bad bytes
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = 'UTF-8', sub = 'byte')), mc.cores = 1)

# Helper that replaces a regex match with a space (fixed = TRUE would treat
# the patterns below as literal strings, so it is dropped here)
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # remove URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      # remove Twitter handles
corpus <- tm_map(corpus, content_transformer(tolower))             # lower-case
corpus <- tm_map(corpus, stripWhitespace)                          # collapse whitespace
corpus <- tm_map(corpus, removePunctuation)                        # drop punctuation
corpus <- tm_map(corpus, removeNumbers)                            # drop digits
corpus <- tm_map(corpus, removeWords, stopwords('english'))        # drop English stop words
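
To spot-check the cleaning, one could look at a single cleaned document; a quick sanity check, not part of the original pipeline:

# Sketch: print the first cleaned document to verify the transformations
as.character(corpus[[1]])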

Tokenizers

We need to create sets of words for analysis: single words (unigrams), pairs of words (bigrams), and triples of words (trigrams). The n-gram tokenizer first breaks text into words whenever it encounters one of a set of delimiter characters, then emits n-grams of the specified length from those words.

BigramTokenizer <-
  function(x)
    unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)

TrigramTokenizer <-
  function(x)
    unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)

# Build a frequency table (word, freq) from a term-document matrix
freq_df <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  freq_df <- data.frame(word = names(freq), freq = freq)
  return(freq_df)
}
# Windows-only: report the maximum memory obtained so far and raise the limit
# before building the term-document matrices
memory.size(max = TRUE)
## [1] 819.44
memory.limit(size=36000)
## [1] 36000
unigram <- removeSparseTerms(TermDocumentMatrix(corpus), 0.9999)
unigram_freq <- freq_df(unigram)

bigram <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer)), 0.9999)
bigram_freq <- freq_df(bigram)

trigram <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer)), 0.9999)
trigram_freq <- freq_df(trigram)
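
As an illustration of what the bigram tokenizer produces, the underlying ngrams() call from the NLP package turns a word sequence into overlapping pairs, which are then pasted back into space-separated strings. A toy example, not part of the pipeline above:

# Sketch: ngrams() returns overlapping word pairs, which are pasted back
# into bigram strings
toy_words <- c("the", "quick", "brown", "fox")
unlist(lapply(ngrams(toy_words, 2), paste, collapse = " "))
# -> "the quick" "quick brown" "brown fox"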

Plotting Results

freq_plot <- function(data, title) {
  ggplot(data[1:25,], aes(reorder(word, -freq), freq)) +
    labs(x = "Words/Phrases", y = "Frequency") +
    ggtitle(title) +
    theme(axis.text.x = element_text(angle = 90, size = 12, hjust = 1)) +
    geom_bar(stat = "identity")
}

freq_plot(unigram_freq, "Top-25 Unigrams")

freq_plot(bigram_freq, "Top-25 Bigrams")

freq_plot(trigram_freq, "Top-25 Trigrams")
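
If the figures need to be kept outside the report, they could be written to disk with ggsave(); the file name below is an arbitrary choice:

# Sketch: save one of the frequency plots as a PNG (file name chosen arbitrarily)
ggsave("top25_trigrams.png", freq_plot(trigram_freq, "Top-25 Trigrams"),
       width = 8, height = 5)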