Introduction

This is the Milestone Report for the Coursera Data Science Capstone project. The goal of the project is to build a predictive text application that predicts the next word as the user types a sentence, much like today's smartphone keyboards built on SwiftKey's technology.

For this Milestone Report the following tasks are performed:

  1. Download the data.
  2. Generate a basic summary of the 3 data sets.
  3. Perform data cleansing (3.1) and exploratory analysis (3.2).
  4. Plan for creating a prediction model and Shiny app.

Data is downloaded from the link: CapstoneDataset

R Packages to be installed

Load the libraries

library(stringi)      # String utilities (word counts)
library(NLP)
library(openNLP)
library(tm)           # Text mining
library(rJava)        # Required by RWeka
library(RWeka)        # Tokenizer to create unigrams, bigrams, trigrams
library(RWekajars)    # Required by RWeka
library(SnowballC)    # Stemming
library(RColorBrewer) # Color palettes
library(qdap)
library(ggplot2)      # Visualization

1. Download the data

The data is in the zip file Coursera-SwiftKey.zip, which contains text files. We download the zip file into a local folder and use the following files after unzipping it.

#URL<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
#download.file(URL,"c:/mahendra/coursera-capstone/data/Coursera-SwiftKey.zip")
#unzip(zipfile = "c:/mahendra/coursera-capstone/data/Coursera-SwiftKey.zip", exdir = "c:/mahendra/coursera-capstone/data/SwiftKey")

workingDir<-"c:/mahendra/coursera-capstone"
setwd(workingDir)

file_con<-file("./data/SwiftKey/final/en_US/en_US.blogs.txt", open = "rb")
enUSblogs<-readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
close(file_con)

file_con<-file("./data/SwiftKey/final/en_US/en_US.news.txt", open = "rb")
enUSnews<-readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
close(file_con)

file_con<-file("./data/SwiftKey/final/en_US/en_US.twitter.txt", open = "rb")
enUStwitter<-readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
close(file_con)
rm(file_con)
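
The read-and-close pattern above repeats for each file; as a small refactoring sketch (readTextFile is a helper name introduced here, not part of the original code), the three blocks could be collapsed into one function:

# Hypothetical helper: open a file in binary mode, read all lines, then close the connection
readTextFile <- function(path) {
        con <- file(path, open = "rb")
        on.exit(close(con))      # guarantees the connection is closed even on error
        readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# enUSblogs   <- readTextFile("./data/SwiftKey/final/en_US/en_US.blogs.txt")
# enUSnews    <- readTextFile("./data/SwiftKey/final/en_US/en_US.news.txt")
# enUStwitter <- readTextFile("./data/SwiftKey/final/en_US/en_US.twitter.txt")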

2. Basic Summary of Datasets

#capture the size of the 3 files

blogs.size <- file.info( "./data/SwiftKey/final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info( "./data/SwiftKey/final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info( "./data/SwiftKey/final/en_US/en_US.twitter.txt")$size / 1024 ^ 2

# Capture number of lines in each file

length(enUSblogs)
## [1] 899288
length(enUSnews)
## [1] 1010242
length(enUStwitter)
## [1] 2360148
# Get words in files
blogs.words <- stri_count_words(enUSblogs)
news.words <- stri_count_words(enUSnews)
twitter.words <- stri_count_words(enUStwitter)
# Summary of the data sets
summary_ds<-data.frame(DataSet = c("Blogs", "News", "Twitter"),
           FileSizeMB = c(blogs.size, news.size, twitter.size),
           Lines = c(length(enUSblogs), length(enUSnews), length(enUStwitter)),
           Words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
           WordsPerLine=c(mean(blogs.words), mean(news.words), mean(twitter.words)))
#           mean.num.words = c(max(nchar(enUSblogs)), max(nchar(enUSnews)), max(nchar(enUStwitter)))
library(knitr)
kable(summary_ds)
DataSet    FileSizeMB     Lines      Words   WordsPerLine
Blogs        200.4242    899288   37546239       41.75107
News         196.2775   1010242   34762395       34.40997
Twitter      159.3641   2360148   30093413       12.75065
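
The commented-out line above hints at a line-length statistic; as an optional addition (not required for the summary), the longest line in characters of each dataset can be reported the same way:

# Longest line (in characters) per dataset, complementing the word counts above
longest_lines <- data.frame(DataSet = c("Blogs", "News", "Twitter"),
           MaxLineChars = c(max(nchar(enUSblogs)), max(nchar(enUSnews)), max(nchar(enUStwitter))))
kable(longest_lines)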

3. Data Cleansing and Exploratory Analysis

As observed in the last section, these files are very large. For the analysis we will take a sample of 10,000 lines from each dataset.

set.seed(54321)
blog_sample<-sample(enUSblogs, size=10000,replace = TRUE)
news_sample<-sample(enUSnews, size=10000, replace = TRUE)
twitter_sample<-sample(enUStwitter,size = 10000, replace = TRUE)
sample_all<-c(blog_sample,news_sample,twitter_sample)
#Word count in Sample
sum(stri_count_words(sample_all))
## [1] 892468
#size of sample
length(sample_all)
## [1] 30000
#take backup in a text file 
file_con<-file("./data/sample_all.txt", open = "wt")
writeLines(sample_all, file_con)
close(file_con)
rm(file_con)

The combined sample contains 30,000 lines (10,000 from each dataset) and 892,468 words.

3.1 Cleaning The Data

Data cleaning involves the following steps:

  1. Remove profanity (bad words).

  2. Remove numbers, non-ASCII characters, punctuation, stop words, extra white space, etc.

  3. Remove URLs, hashtags (#) and Twitter handles (@).

file_con<-file("./data/sample_all.txt" )
corSample<-readLines(file_con)
#Remove Non ASCII characters
corSample<-iconv(corSample, "latin1", "ASCII", sub = "")
close(file_con)
rm(file_con)

## Download the list of bad words

download.file("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt", "./data/bad-words.txt")
file_con<-file("./data/bad-words.txt", open = "rb")
badWords<-readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
close(file_con)
rm(file_con)

#Remove Non ASCII characters

badWords<- iconv(badWords, "latin1", "ASCII", sub = "")

# Remove Bad Words

corSample<-removeWords(corSample, badWords)

#create Corpus object from sample for using tm_map function

corSample<-Corpus(VectorSource(corSample))


# Remove Punctuations
corSample<-tm_map(corSample,removePunctuation)
# Remove Numbers
corSample<-tm_map(corSample, removeNumbers)
# convert to lower case
corSample<-tm_map(corSample, content_transformer(tolower))  
# remove stop words in english (a, as,at,so etc.)
corSample<-tm_map(corSample,removeWords, stopwords("en"))
# Strip unnecessary white space
corSample<-tm_map(corSample,stripWhitespace)
# Remove URLs
removeURL<-function(x) gsub("http[[:alnum:]]*", "", x)
corSample<-tm_map(corSample, content_transformer(removeURL) )
## Custom content transformers (the patterns below are regular expressions, so fixed = TRUE must not be used)
toEmpty <- content_transformer(function(x, pattern) gsub(pattern, "", x))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corSample<-tm_map(corSample, toEmpty, "#\\w+")                              # hashtags
corSample<-tm_map(corSample, toEmpty, "(\\b\\S+\\@\\S+\\..{1,3}(\\s)?\\b)") # e-mail addresses
corSample<-tm_map(corSample, toEmpty, "@\\w+")                              # Twitter handles
corSample<-tm_map(corSample, toEmpty, "http[^[:space:]]*")                  # remaining URLs
corSample<-tm_map(corSample, toSpace, "/|@|\\|")                            # slashes, @ and pipes

#Save clean sample
cleanSample<-get("content", corSample)
save("cleanSample", file="./data/cleanSample.Rdata")

3.2 Exploratory Analysis

Word Cloud

library(wordcloud)
suppressWarnings(wordcloud(cleanSample, max.words = 500, random.order = FALSE, colors=brewer.pal(9, "RdYlGn"))) 

Functions to create and plot n-grams

## Function to create n Gram
suppressMessages(library(quanteda))
tokenizedDF <- function(obj, n) {
        nGramSparse <- dfm(obj, ngrams = n, concatenator = " ")
        nGramDF <- data.frame(Content = featnames(nGramSparse), Frequency = colSums(nGramSparse),
                 row.names = NULL, stringsAsFactors = FALSE)
        nGramDF[order(nGramDF$Frequency, decreasing = TRUE), ]  # sort so the first 10 rows are the top 10
}
                
## Function to plot top 10 n-gram
top10Plot <- function(df, title) {
          ggplot(df[1:10,], aes(reorder(Content, Frequency), Frequency)) +
          labs(x = "Word(s)", y = "Frequency") +
          theme(axis.text.x = element_text(angle = 90, size = 10, hjust = 1)) + 
          coord_flip() + 
          ggtitle(title) +
          geom_bar(stat = "identity", fill = I("purple4"))
          }

UniGram

unigram<-tokenizedDF(cleanSample, 1)

top10Plot(unigram,"Top 10 Unigrams")

BiGram

bigram<-tokenizedDF(cleanSample,2)

top10Plot(bigram, "Top 10 Bi-grams")

TriGram

trigram<-tokenizedDF(cleanSample,3)

top10Plot(trigram, "Top 10 Tri-grams")
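
Because the plan in the next section is to precompute the n-grams rather than build them at run time, a minimal sketch (file names are illustrative) of saving the three frequency tables for later use by the Shiny app:

# Persist the n-gram frequency tables so the Shiny app can load them at startup
saveRDS(unigram, "./data/unigram.rds")
saveRDS(bigram,  "./data/bigram.rds")
saveRDS(trigram, "./data/trigram.rds")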

4. Plan for creating a prediction model and Shiny app

After constructing the basic n-gram model, the next step in building the prediction algorithm will be to find a balance between sample size and prediction accuracy. The plan is to use a Shiny app as the user interface: a text input box and a display of the suggested next words. To keep the app responsive, the n-grams will be generated and saved beforehand rather than computed at run time, which would result in a poor user experience.
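
As an illustration of the intended prediction step (a sketch only, assuming the precomputed bigram and trigram frequency tables above and a simple frequency-based back-off; predictNextWord is a hypothetical helper, not the final algorithm), the next word could be looked up like this:

# Hypothetical next-word lookup: try the trigram table first, then back off to bigrams
predictNextWord <- function(phrase, bigram, trigram, k = 3) {
        words <- tolower(unlist(strsplit(trimws(phrase), "\\s+")))
        n <- length(words)
        if (n >= 2) {                                   # look for "w1 w2 ?" in the trigram table
                prefix <- paste(words[n - 1], words[n])
                hits <- trigram[startsWith(trigram$Content, paste0(prefix, " ")), ]
                if (nrow(hits) > 0)
                        return(head(sub(".* ", "", hits$Content), k))
        }
        # Back off: look for "w ?" in the bigram table
        hits <- bigram[startsWith(bigram$Content, paste0(words[n], " ")), ]
        head(sub(".* ", "", hits$Content), k)
}

# predictNextWord("thanks for the", bigram, trigram)   # returns up to 3 candidate next words

Both tables are already sorted by frequency (see tokenizedDF), so head() returns the most frequent candidates first.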