##Coursera Data Science Capstone: Milestone Report
##Introduction
#The goal of the capstone project is to create a predictive text model using a large text corpus of documents
#as training data. Natural language processing techniques will be used to perform the analysis and build the
#predictive model.
#
#This milestone report describes the major features of the training data with our exploratory data analysis
#and summarizes our plans for creating the predictive model.
#
##Getting the Data
#
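#If the archive is not already present, it can be fetched first. This is a minimal sketch; the URL
#below is assumed to be the course-provided download link and should be verified before use.
if (!file.exists("Coursera-SwiftKey.zip")) {
  #download in binary mode so the zip archive is not corrupted on Windows
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
#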
if (file.exists("Coursera-SwiftKey.zip")) {unzip("Coursera-SwiftKey.zip")}
#
#The data sets consist of text from 3 different sources: 1) News, 2) Blogs and 3) Twitter feeds. The text
#data are provided in 4 different languages: 1) German, 2) English - United States, 3) Finnish and
#4) Russian. In this project, we will only focus on the English - United States data sets.
#
blogs <- readLines("~/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("~/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("~/final/en_US/en_US.news.txt", encoding = "UTF-8", :
## incomplete final line found on '~/final/en_US/en_US.news.txt'
twitter <- readLines("~/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
#
#Summary of the data (file sizes, line counts, word counts, and mean words per line)
#file size
blogs.size <- file.info("~/final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info("~/final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info("~/final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
#
library(knitr)
library(stringi)
library(stringr)
#Words in files
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)
#summary of data sets
data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(blogs.size, news.size, twitter.size),
           num.lines = c(length(blogs), length(news), length(twitter)),
           num.words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
           mean.num.words = c(mean(blogs.words), mean(news.words), mean(twitter.words)))
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 37546246 41.75108
## 2 news 196.2775 77259 2674536 34.61779
## 3 twitter 159.3641 2360148 30093410 12.75065
#
##Cleaning the Data
#Before performing exploratory analysis, we must first clean the data. This involves removing URLs,
#special characters, punctuation, numbers, excess whitespace, and stopwords, and converting the text
#to lower case.
#Since the data sets are quite large, we will randomly choose 1% of the data to demonstrate the data
#cleaning and exploratory analysis.
#
library(tm)
## Loading required package: NLP
#sample the data
set.seed(679)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#
#
blogSample <- sample(blogs, length(blogs) * 0.01)
newSample <- sample(news, length(news) * 0.01)
twitterSample <- sample(twitter, length(twitter) * 0.01)
twitterSample <- sapply(twitterSample, function(row) iconv(row, "latin1", "ASCII", sub = ""))
data.sample <- c(blogSample, newSample, twitterSample)
#
#Create the corpus and clean the data
#
corpus <- VCorpus(VectorSource(data.sample))
#helper transformer: replace every match of a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  #remove URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      #remove Twitter handles
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")                  #remove non-printable characters
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, content_transformer(tolower))             #lower case before removing stopwords
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
#
##Exploratory Analysis
#Here we look at the most frequently occurring words in the data sample, listing the most common
#unigrams, bigrams, and trigrams.
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
options(mc.cores=1)
#helper: total term frequencies from a term-document matrix, sorted in decreasing order
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(freq), freq = freq)
}
#
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
#
makePlot <- function(data, label) {
  ggplot(data[1:30, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("grey50"))
}
#
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
#
#Here is a bar chart of the 30 most common unigrams in the data sample.
#
makePlot(freq1, "30 Most Common Unigrams")

#
#Here is a bar chart of the 30 most common bigrams in the data sample.
#
makePlot(freq2, "30 Most Common Bigrams")

#
#Here is a bar chart of the 30 most common trigrams in the data sample.
#
makePlot(freq3, "30 Most Common Trigrams")

#
##Plans for the Prediction Algorithm and Shiny App
#The next steps in the capstone project are to finalize our predictive algorithm and to deploy it
#as a Shiny app.
#
#The predictive algorithm will use an n-gram model with a frequency lookup similar to the
#exploratory analysis above. A possible strategy would be to use the trigram model to predict the
#next word; if no matching trigram is found, the algorithm would back off to the bigram model, and
#then to the unigram model if needed. A sketch of this lookup is shown below.
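#
#As an illustration, a simple backoff lookup over the n-gram frequency tables built above could look
#like the sketch below. The function name predictNextWord and the reuse of freq1, freq2, and freq3
#as lookup tables are assumptions for demonstration, not the final implementation.
predictNextWord <- function(phrase, freq1, freq2, freq3) {
  #normalise the input roughly the same way the corpus was cleaned
  words <- unlist(strsplit(gsub("[^a-z ]", " ", tolower(phrase)), "\\s+"))
  words <- words[words != ""]
  n <- length(words)
  #1) trigram table: look up the last two words and return the predicted third word
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- freq3[grepl(paste0("^", prefix, " "), freq3$word), ]
    if (nrow(hits) > 0) return(sub(paste0("^", prefix, " "), "", as.character(hits$word[1])))
  }
  #2) back off to the bigram table: look up the last word and return the predicted second word
  if (n >= 1) {
    prefix <- words[n]
    hits <- freq2[grepl(paste0("^", prefix, " "), freq2$word), ]
    if (nrow(hits) > 0) return(sub(paste0("^", prefix, " "), "", as.character(hits$word[1])))
  }
  #3) final fallback: the single most frequent unigram overall
  as.character(freq1$word[1])
}
#example (hypothetical): predictNextWord("happy new", freq1, freq2, freq3)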
#
#The user interface of the Shiny app will consist of a text input box that allows the user to enter
#a phrase. The app will then use our algorithm to suggest the most likely next word after a short
#delay. We also plan to let the user configure how many words the app should suggest. A minimal
#sketch of this interface follows.
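#
#A minimal sketch of such a Shiny interface is shown below, assuming the hypothetical predictNextWord
#helper from the previous section; the final app would swap in the finished prediction function.
library(shiny)
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  numericInput("nwords", "Number of suggestions:", value = 3, min = 1, max = 10),
  verbatimTextOutput("suggestion")
)
server <- function(input, output) {
  output$suggestion <- renderText({
    req(input$phrase)
    #the final app would return up to input$nwords candidates; this sketch returns a single word
    predictNextWord(input$phrase, freq1, freq2, freq3)
  })
}
#to run the app locally: shinyApp(ui = ui, server = server)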