COURSERA CAPSTONE MILESTONE

OVERVIEW

This is the Milestone Report for the Coursera Capstone SwiftKey application. We created a corpus from data downloaded from the internet and performed the following exploratory analysis:

- Downloaded the text files (Twitter, Blogs and News) from the web.
- Obtained a random sample of about 31,000 lines combining the three sources.
- Created a comparison table showing the file size, the number of lines (length) and the number of words of each file, including the sample file.
- Created a corpus and normalized the text: converted it to lower case and removed punctuation, extra whitespace, numbers, URLs and stopwords.
- Extracted unigrams, bigrams, trigrams and fourgrams.
- Plotted the data to identify features and patterns.

Setting Environment

# Loading libraries
library(tm)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.3
library(RWeka)
library(SnowballC)
library(googleVis)
## 
## Welcome to googleVis version 0.5.8
## 
## Please read the Google API Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
## 
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
## 
## See the googleVis package vignettes for more details,
## or visit http://github.com/mages/googleVis.
## 
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))
library(RColorBrewer) # Generate palette of colours for plots.
library(ggplot2) 
## Warning: package 'ggplot2' was built under R version 3.1.3
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(magrittr)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.1.3
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.1.3
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(stringi) # for character string analysis
library(wordcloud)

setwd("~/Documents/COURSERA_CAPSTONE/final/en_US")

Reading lines from Blogs, News and Twitter

twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul=TRUE)
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul=TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul=TRUE)

Consolidate Files - Create a 1% random sample

sTwitter <- twitter[sample(1:length(twitter),20000)]
sNews <- news[sample(1:length(news), 1000)]
sBlogs <- blogs[sample(1:length(blogs), 10000)]
textSample <- c(sTwitter,sNews,sBlogs)
writeLines(textSample, "textSample.txt")
con <- file("textSample.txt", "r")
txtSample <- readLines(con)
close(con)

Determining size, number of lines and number of words for each of the files

# Determining number of lines for each file
twitterLength <- length(twitter)  #  2360148
blogsLength <- length(blogs)      #   899288
newsLength <- length(news)        #  1010242
sampleLength <- length(txtSample) # 31000


# Determining Word counts for each file
twitterWords <- sum(sapply(gregexpr("\\S+", twitter), length))
blogsWords <- sum(sapply(gregexpr("\\S+", blogs), length))
newsWords <- sum(sapply(gregexpr("\\S+", news), length))
sampleWords <- sum(sapply(gregexpr("\\S+", txtSample), length))

# Determining File size
twitterSize <- file.info("en_US.twitter.txt")$size / 1024.0 / 1024.0
blogsSize <- file.info("en_US.blogs.txt")$size / 1024.0 / 1024.0
newsSize <- file.info("en_US.news.txt")$size / 1024.0 / 1024.0
sampleSize <- file.info("textSample.txt")$size / 1024.0 / 1024.0

# Creating a table
FileNames <- c("Twitter","Blogs","News", "Sample")
FileSizes <- c(round(twitterSize, digits = 2), round(blogsSize,digits = 2), 
               round(newsSize, digits = 2), round(sampleSize, digits = 2))

fileTable <- data.frame(
  fileName = c("Twitter","Blogs","News", "Sample"),
  fileSize = FileSizes,
  lineCount = c(twitterLength, blogsLength, newsLength, sampleLength),
  wordCount = c(twitterWords, blogsWords, newsWords, sampleWords)
)

colnames(fileTable) <- c("File Name", "File Size (MB)", "Line Count", "Word Count")

saveRDS(fileTable, file = "fileTable.Rda")

fileTableDF <- readRDS("fileTable.Rda")

fileTable
##   File Name File Size (MB) Line Count Word Count
## 1   Twitter         159.36    2360148   30373603
## 2     Blogs         200.42     899288   37334147
## 3      News         196.28    1010242   34372530
## 4    Sample           3.73      31000     705079

Create a corpus

corpus <- Corpus(VectorSource(txtSample))

Clean corpus

corpus <- tm_map(corpus, content_transformer(tolower), lazy = TRUE)  # convert to lower case
corpus <- tm_map(corpus, content_transformer(removePunctuation))     # remove punctuation
corpus <- tm_map(corpus, content_transformer(removeNumbers))         # remove numbers
rmURL <- function(x) gsub("http[[:alnum:]]*", "", x)                 # helper to strip URLs
corpus <- tm_map(corpus, content_transformer(rmURL))                 # remove URLs
corpus <- tm_map(corpus, stripWhitespace)                            # collapse extra whitespace
corpus <- tm_map(corpus, removeWords, stopwords("english"))          # remove English stopwords
corpus <- tm_map(corpus, PlainTextDocument)

corpusDF <- data.frame(text=unlist(sapply(corpus,"[", "content")), stringsAsFactors = FALSE)

meta(corpus)
## data frame with 0 columns and 31000 rows

Create Document-Term Matrix

dtm <- DocumentTermMatrix(corpus)

PLOTTING

In each plot below we can observe the most frequent terms, and compare the common terms across different n-gram sizes and sample sizes.
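
As a quick check before plotting, the most frequent terms can be inspected directly (a minimal sketch using the dtm object built above; findFreqTerms() is the tm helper for listing terms above a frequency threshold):

# Inspect the most frequent terms in the cleaned sample
termFreq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(termFreq, 20)                  # 20 most frequent terms with their counts
findFreqTerms(dtm, lowfreq = 1000)  # all terms appearing at least 1000 times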

Create Word Cloud for the Document-Term Matrix

dtmCloud <- as.matrix(dtm) # Convert the document-term matrix to a plain matrix
frequency <- colSums(dtmCloud) # Create a vector with term frequencies
frequency <- sort(frequency, decreasing=TRUE)
words <- names(frequency)
wordcloud(words[1:100], frequency[1:100], colors=brewer.pal(8, "Dark2")) 

Create a term-frequency bar chart for the Document-Term Matrix

freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
wdf <- data.frame(word=names(freq), freq=freq)
subset(wdf, freq>1000)    %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat="identity", fill="slateblue", colour="navyblue") +
  theme(axis.text.x=element_text(angle=45, hjust=1))

Create plot for unigrams

unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max =1))
options(mc.cores=1)
dtm1 <- DocumentTermMatrix(corpus, control = list(tokenize = unigramTokenizer))
freq1 <- sort(colSums(as.matrix(dtm1)), decreasing=TRUE)
wcount1 <- data.frame(word=names(freq1), freq=freq1)

p1 <- ggplot(subset(wcount1, freq > 1000), aes(word, freq))
p1 <- p1 + geom_bar(stat="identity", fill="lightblue3", colour="navyblue")
p1 + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Unigram Frequency")

Consolidate Files - Create a Mini random sample

mTwitter <- twitter[sample(1:length(twitter),1000)]
mNews <- news[sample(1:length(news), 1000)]
mBlogs <- blogs[sample(1:length(blogs), 1000)]
miniSample <- c(mTwitter,mNews,mBlogs)
writeLines(miniSample, "miniSample.txt")
conMini <- file("miniSample.txt", "r")
miniSample <- readLines(conMini)
close(conMini)

Create mini-corpus

mcorpus <- Corpus(VectorSource(miniSample))

Clean mini-corpus

mcorpus <- tm_map(mcorpus, content_transformer(tolower), lazy = TRUE)
mcorpus <- tm_map(mcorpus, content_transformer(removePunctuation))
mcorpus <- tm_map(mcorpus, content_transformer(removeNumbers))
rmURL <- function(x) gsub("http[[:alnum:]]*", "", x) 
mcorpus <- tm_map(mcorpus, content_transformer(rmURL))
mcorpus <- tm_map(mcorpus, stripWhitespace)
mcorpus <- tm_map(mcorpus, removeWords, stopwords("english"))
mcorpus <- tm_map(mcorpus, PlainTextDocument)

Create mini Document-Term Matrix

mdtm <- DocumentTermMatrix(mcorpus)

Create plot for bigrams

bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
options(mc.cores=1)
dtm2 <- DocumentTermMatrix(mcorpus, control = list(tokenize = bigramTokenizer))
freq2 <- sort(colSums(as.matrix(dtm2)), decreasing=TRUE)
wcount2 <- data.frame(word=names(freq2), freq=freq2)


p2 <- ggplot(subset(wcount2, freq > 10), aes(word, freq))
p2 <- p2 + geom_bar(stat="identity", fill="lightblue3", colour="navyblue")
p2 + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Bigram Frequency")

Create plot for trigrams

trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max =3))
options(mc.cores=1)
dtm3 <- DocumentTermMatrix(mcorpus, control = list(tokenize = trigramTokenizer))
freq3 <- sort(colSums(as.matrix(dtm3)), decreasing=TRUE)
wcount3 <- data.frame(word=names(freq3), freq=freq3)

p3 <- ggplot(subset(wcount3, freq > 2), aes(word, freq))
p3 <- p3 + geom_bar(stat="identity", fill="lightblue3", colour="navyblue")
p3 + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Trigram Frequency")

Create plot for fourgrams

fourgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max =4))
options(mc.cores=1)
dtm4 <- DocumentTermMatrix(mcorpus, control = list(tokenize = fourgramTokenizer))
freq4 <- sort(colSums(as.matrix(dtm4)), decreasing=TRUE)
wcount4 <- data.frame(word=names(freq4), freq=freq4)

p4 <- ggplot(subset(wcount4, freq > 1), aes(word, freq))
p4 <- p4 + geom_bar(stat="identity", fill="lightblue3", colour="navyblue")
p4 + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Fourgram Frequency")

Cloud for bigrams

dtm2Cloud <- as.matrix(dtm2) 
frequency2 <- colSums(dtm2Cloud) 
frequency2 <- sort(frequency2, decreasing=TRUE)
words2 <- names(frequency2)
wordcloud(words2[1:50], frequency2[1:50]) # For the 50 top bigrams

Cloud for trigrams

dtm3Cloud <- as.matrix(dtm3) 
frequency3 <- colSums(dtm3Cloud) 
frequency3 <- sort(frequency3, decreasing=TRUE)
words3 <- names(frequency3)
wordcloud(words3[1:50], frequency3[1:50]) # For the 50 top trigrams

NEXT STEPS

To avoid further difficulties in handling such a large amount of data, I plan to use Python NLTK to download, clean and consolidate all the files, and then use R to build the Shiny app. I also created a basic algorithm that searches the unigram, bigram, trigram and fourgram tables to predict the next term (Quiz 2). Initially it reached 60% accuracy; I assumed the low accuracy was due to the small sample size, and accuracy did improve as I increased the sample, so I expect a significant further improvement once all the available data is ingested into the corpus.
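
As a rough illustration of that back-off search (a minimal sketch, not the exact quiz code; it assumes the n-gram frequency data frames wcount1 to wcount4 built above, each with a word and a freq column, and the helper name predictNext is hypothetical):

# Sketch of a simple back-off next-word predictor.
# It looks for the longest matching n-gram prefix, falling back from fourgrams
# to trigrams to bigrams, and finally to the most frequent unigram.
predictNext <- function(phrase) {
  tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
  lookup <- function(prefix, ngramDF) {
    hits <- ngramDF[grepl(paste0("^", prefix, " "), ngramDF$word), ]
    if (nrow(hits) == 0) return(NA_character_)
    top <- as.character(hits$word[which.max(hits$freq)])  # most frequent match
    tail(unlist(strsplit(top, " ")), 1)                    # return its last word
  }
  n <- length(tokens)
  if (n >= 3) {
    res <- lookup(paste(tail(tokens, 3), collapse = " "), wcount4)
    if (!is.na(res)) return(res)
  }
  if (n >= 2) {
    res <- lookup(paste(tail(tokens, 2), collapse = " "), wcount3)
    if (!is.na(res)) return(res)
  }
  if (n >= 1) {
    res <- lookup(tail(tokens, 1), wcount2)
    if (!is.na(res)) return(res)
  }
  as.character(wcount1$word[which.max(wcount1$freq)])      # overall fallback
}

# Example: predictNext("happy mothers day")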