##Coursera Data Science Capstone: Milestone Report
##Introduction
#The goal of the capstone project is to create a predictive text model using a large text corpus of documents
#as training data. Natural language processing techniques will be used to perform the analysis and build the
#predictive model.
#
#This milestone report describes the major features of the training data with our exploratory data analysis
#and summarizes our plans for creating the predictive model.
#
##Getting the Data
#
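#If the archive is not already present, it can be fetched first. This is a minimal sketch; the URL
#below is assumed to be the course-provided download link and should be verified before use.
if (!file.exists("Coursera-SwiftKey.zip")) {
  #download in binary mode so the zip archive is not corrupted on Windows
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
#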
if (file.exists("Coursera-SwiftKey.zip")) {unzip("Coursera-SwiftKey.zip")}
#
#The data sets consist of text from 3 different sources: 1) News, 2) Blogs and 3) Twitter feeds. The text
#data are provided in 4 different languages: 1) German, 2) English - United States, 3) Finnish and
#4) Russian. In this project, we will only focus on the English - United States data sets.
#
blogs <- readLines("~/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("~/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("~/final/en_US/en_US.news.txt", encoding = "UTF-8", :
## incomplete final line found on '~/final/en_US/en_US.news.txt'
twitter <- readLines("~/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
#
#Summary of the data (file sizes, line counts, word counts, and mean words per line)
#file size
blogs.size <- file.info("~/final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info("~/final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info("~/final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
#
library(knitr)
library(stringi)
library(stringr)
#Words in files
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)
#summary of data sets
data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(blogs.size, news.size, twitter.size),
           num.lines = c(length(blogs), length(news), length(twitter)),
           num.words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
           mean.num.words = c(mean(blogs.words), mean(news.words), mean(twitter.words)))
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 37546246 41.75108
## 2 news 196.2775 77259 2674536 34.61779
## 3 twitter 159.3641 2360148 30093410 12.75065
#
##Cleaning the Data
#Before performing exploratory analysis, we must first clean the data. This involves removing URLs,
#special characters, punctuation, numbers, excess whitespace, and stopwords, and converting the text
#to lower case.
#Since the data sets are quite large, we will randomly choose 1% of the data to demonstrate the data
#cleaning and exploratory analysis.
#
library(tm)
## Loading required package: NLP
#sample the data
set.seed(679)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#
#
blogSample <- sample(blogs, length(blogs) * 0.01)
newSample <- sample(news, length(news) * 0.01)
twitterSample <- sample(twitter, length(twitter) * 0.01)
twitterSample <- sapply(twitterSample, function(row) iconv(row, "latin1", "ASCII", sub = ""))
data.sample <- c(blogSample, newSample, twitterSample)
#
#Create the corpus and clean the data
#
corpus <- VCorpus(VectorSource(data.sample))
#helper transformer: replace every match of a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  #remove URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      #remove Twitter handles
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")                  #remove non-printable characters
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, content_transformer(tolower))             #lower case before removing stopwords
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
#
##Exploratory Analysis
#Here we look at the most frequently occurring words in the data sample, listing the most common
#unigrams, bigrams, and trigrams.
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
options(mc.cores=1)
#helper: total term frequencies from a term-document matrix, sorted in decreasing order
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(freq), freq = freq)
}
#
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
#
makePlot <- function(data, label) {
  ggplot(data[1:30, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("grey50"))
}
#
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
#
#Here is a bar chart of the 30 most common unigrams in the data sample.
#
makePlot(freq1, "30 Most Common Unigrams")

#
#Here is a bar chart of the 30 most common bigrams in the data sample.
#
makePlot(freq2, "30 Most Common Bigrams")

#
#Here is a bar chart of the 30 most common trigrams in the data sample.
#
makePlot(freq3, "30 Most Common Trigrams")

#
##Plans for the Prediction Algorithm and Shiny App
#The next steps in the capstone project are to finalize our predictive algorithm and to deploy it
#as a Shiny app.
#
#The predictive algorithm will use an n-gram model with a frequency lookup similar to the
#exploratory analysis above. A possible strategy would be to use the trigram model to predict the
#next word; if no matching trigram is found, the algorithm would back off to the bigram model, and
#then to the unigram model if needed. A sketch of this lookup is shown below.
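#
#As an illustration, a simple backoff lookup over the n-gram frequency tables built above could look
#like the sketch below. The function name predictNextWord and the reuse of freq1, freq2, and freq3
#as lookup tables are assumptions for demonstration, not the final implementation.
predictNextWord <- function(phrase, freq1, freq2, freq3) {
  #normalise the input roughly the same way the corpus was cleaned
  words <- unlist(strsplit(gsub("[^a-z ]", " ", tolower(phrase)), "\\s+"))
  words <- words[words != ""]
  n <- length(words)
  #1) trigram table: look up the last two words and return the predicted third word
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- freq3[grepl(paste0("^", prefix, " "), freq3$word), ]
    if (nrow(hits) > 0) return(sub(paste0("^", prefix, " "), "", as.character(hits$word[1])))
  }
  #2) back off to the bigram table: look up the last word and return the predicted second word
  if (n >= 1) {
    prefix <- words[n]
    hits <- freq2[grepl(paste0("^", prefix, " "), freq2$word), ]
    if (nrow(hits) > 0) return(sub(paste0("^", prefix, " "), "", as.character(hits$word[1])))
  }
  #3) final fallback: the single most frequent unigram overall
  as.character(freq1$word[1])
}
#example (hypothetical): predictNextWord("happy new", freq1, freq2, freq3)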
#
#The user interface of the Shiny app will consist of a text input box that allows the user to enter
#a phrase. The app will then use our algorithm to suggest the most likely next word after a short
#delay. We also plan to let the user configure how many words the app should suggest. A minimal
#sketch of this interface follows.
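#
#A minimal sketch of such a Shiny interface is shown below, assuming the hypothetical predictNextWord
#helper from the previous section; the final app would swap in the finished prediction function.
library(shiny)
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  numericInput("nwords", "Number of suggestions:", value = 3, min = 1, max = 10),
  verbatimTextOutput("suggestion")
)
server <- function(input, output) {
  output$suggestion <- renderText({
    req(input$phrase)
    #the final app would return up to input$nwords candidates; this sketch returns a single word
    predictNextWord(input$phrase, freq1, freq2, freq3)
  })
}
#to run the app locally: shinyApp(ui = ui, server = server)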