This is the Milestone Report for the Coursera Data Science Capstone project. The goal of the project is to build a predictive text application that predicts the next word as the user types a sentence, much like today's smartphone keyboards built on SwiftKey's technology.
For this milestone report the following tasks are performed:
The data is downloaded from the link: CapstoneDataset
The required R packages are installed and the libraries loaded:
library(stringi)
library(NLP)
library(openNLP)
library(tm) # For Text mining
library(rJava)
library(RWeka) # tokenizer to create unigrams, bigrams, trigrams
library(RWekajars)
library(SnowballC) # Stemming
library(RColorBrewer) # Color palettes
library(qdap)
library(ggplot2) #visualization
The data is in the zip file Coursera-SwiftKey.zip, which contains text files. The zip file is downloaded to a local folder on the computer and unzipped; the following files are then used.
#URL<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
#download.file(URL,"c:/mahendra/coursera-capstone/data/Coursera-SwiftKey.zip")
#unzip(zipfile = "c:/mahendra/coursera-capstone/data/Coursera-SwiftKey.zip", exdir = "c:/mahendra/coursera-capstone/data/SwiftKey")
workingDir<-"c:/mahendra/coursera-capstone"
setwd(workingDir)
# Read each file in binary mode and close its connection after reading
file_con<-file("./data/SwiftKey/final/en_US/en_US.blogs.txt", open = "rb")
enUSblogs<-readLines(file_con, encoding = "UTF-8",skipNul = TRUE)
close(file_con)
file_con<-file("./data/SwiftKey/final/en_US/en_US.news.txt", open = "rb")
enUSnews<-readLines(file_con, encoding = "UTF-8",skipNul = TRUE)
close(file_con)
file_con<-file("./data/SwiftKey/final/en_US/en_US.twitter.txt", open = "rb")
enUStwitter<-readLines(file_con, encoding = "UTF-8",skipNul = TRUE)
close(file_con)
rm(file_con)
#capture the size of the 3 files
blogs.size <- file.info( "./data/SwiftKey/final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info( "./data/SwiftKey/final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info( "./data/SwiftKey/final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
# Capture number of lines in each file
length(enUSblogs)
## [1] 899288
length(enUSnews)
## [1] 1010242
length(enUStwitter)
## [1] 2360148
# Get words in files
blogs.words <- stri_count_words(enUSblogs)
news.words <- stri_count_words(enUSnews)
twitter.words <- stri_count_words(enUStwitter)
# Summary of the data sets
summary_ds<-data.frame(DataSet = c("Blogs", "News", "Twitter"),
FileSizeMB = c(blogs.size, news.size, twitter.size),
Lines = c(length(enUSblogs), length(enUSnews), length(enUStwitter)),
Words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
WordsPerLine=c(mean(blogs.words), mean(news.words), mean(twitter.words)))
library(knitr)
kable(summary_ds)
| DataSet | FileSizeMB | Lines | Words | WordsPerLine |
|---|---|---|---|---|
| Blogs | 200.4242 | 899288 | 37546239 | 41.75107 |
| News | 196.2775 | 1010242 | 34762395 | 34.40997 |
| Twitter | 159.3641 | 2360148 | 30093413 | 12.75065 |
As observed above, these files are very large. For the analysis we take a sample of 10,000 lines from each dataset.
set.seed(54321)
blog_sample<-sample(enUSblogs, size=10000,replace = TRUE)
news_sample<-sample(enUSnews, size=10000, replace = TRUE)
twitter_sample<-sample(enUStwitter,size = 10000, replace = TRUE)
sample_all<-c(blog_sample,news_sample,twitter_sample)
#Word count in Sample
sum(stri_count_words(sample_all))
## [1] 892468
# Number of lines in the sample
length(sample_all)
## [1] 30000
# Save the combined sample to a text file as a backup
file_con<-file("./data/sample_all.txt", open = "wt")
writeLines(sample_all, file_con)
close(file_con)
rm(file_con)
The combined sample contains 30,000 lines and 892,468 words.
Data cleaning involves the following steps:
1. Remove profanity (bad) words
2. Remove numbers, non-ASCII characters, punctuation, stop words, extra white space, etc.
3. Remove URLs, hashtags (#), and mentions (@)
file_con<-file("./data/sample_all.txt")
corSample<-readLines(file_con)
close(file_con)
rm(file_con)
# Remove non-ASCII characters
corSample<-iconv(corSample, "latin1", "ASCII", sub = "")
## Download the list of bad words
download.file("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt", "./data/bad-words.txt")
file_con<-file("./data/bad-words.txt", open = "rb")
badWords<-readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
close(file_con)
rm(file_con)
#Remove Non ASCII characters
badWords<- iconv(badWords, "latin1", "ASCII", sub = "")
# Remove Bad Words
corSample<-removeWords(corSample, badWords)
#create Corpus object from sample for using tm_map function
corSample<-Corpus(VectorSource(corSample))
# Remove Punctuations
corSample<-tm_map(corSample,removePunctuation)
# Remove Numbers
corSample<-tm_map(corSample, removeNumbers)
# convert to lower case
corSample<-tm_map(corSample, content_transformer(tolower))
# Remove English stop words (a, as, at, so, etc.)
corSample<-tm_map(corSample,removeWords, stopwords("en"))
# Strip unnecessary white space
corSample<-tm_map(corSample,stripWhitespace)
# Remove URLs
removeURL<-function(x) gsub("http[[:alnum:]]*", "", x)
corSample<-tm_map(corSample, content_transformer(removeURL) )
## Custom content transformers (the patterns below are regular expressions, so fixed = TRUE is not used)
toEmpty <- content_transformer(function(x, pattern) gsub(pattern, "", x))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corSample<-tm_map(corSample, toEmpty, "#\\w+")
corSample<-tm_map(corSample, toEmpty, "(\\b\\S+\\@\\S+\\..{1,3}(\\s)?\\b)")
corSample<-tm_map(corSample, toEmpty, "@\\w+")
corSample<-tm_map(corSample, toEmpty, "http[^[:space:]]*")
corSample<-tm_map(corSample, toSpace, "/|@|\\|")
#Save clean sample
cleanSample<-get("content", corSample)
save("cleanSample", file="./data/cleanSample.Rdata")
library(wordcloud)
suppressWarnings(wordcloud(cleanSample, max.words = 500, random.order = FALSE, colors=brewer.pal(9, "RdYlGn")))
## Function to create an n-gram frequency data frame, sorted by frequency
suppressMessages(library(quanteda))
tokenizedDF <- function(obj, n) {
  nGramSparse <- dfm(obj, ngrams = n, concatenator = " ")
  nGramDF <- data.frame(Content = featnames(nGramSparse), Frequency = colSums(nGramSparse),
                        row.names = NULL, stringsAsFactors = FALSE)
  # Order by decreasing frequency so the first rows are the most common n-grams
  nGramDF[order(nGramDF$Frequency, decreasing = TRUE), ]
}
## Function to plot the top 10 n-grams
top10Plot <- function(df, title) {
  ggplot(df[1:10,], aes(reorder(Content, Frequency), Frequency)) +
    labs(x = "Word(s)", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 90, size = 10, hjust = 1)) +
    coord_flip() +
    ggtitle(title) +
    geom_bar(stat = "identity", fill = I("purple4"))
}
unigram<-tokenizedDF(cleanSample, 1)
top10Plot(unigram,"Top 10 Unigrams")
bigram<-tokenizedDF(cleanSample,2)
top10Plot(bigram, "Top 10 Bi-grams")
trigram<-tokenizedDF(cleanSample,3)
top10Plot(trigram, "Top 10 Tri-grams")
After constructing the basic n-gram model, the next step toward the prediction algorithm is to find a balance between sample size and prediction accuracy. The plan is to build a Shiny app that provides a user interface for text input and displays a list of suggested next words. To keep the Shiny app responsive, the n-grams will be generated and saved in advance rather than computed at run time, which would result in a poor user experience.
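As a rough illustration of this plan, the sketch below shows one way the sorted n-gram frequency tables built above (unigram, bigram, trigram) could be saved once and later queried with a simple frequency-based backoff lookup. The file name ngrams.Rdata and the function predictNextWord are hypothetical placeholders, not the final design; the eventual algorithm (for example a smoothed or stupid-backoff model) is still to be decided.

library(stringi)
# A minimal sketch, assuming the unigram/bigram/trigram data frames created above,
# each with Content and Frequency columns already sorted by decreasing frequency.
# The file name and the predictNextWord helper are hypothetical placeholders.
save(unigram, bigram, trigram, file = "./data/ngrams.Rdata")

predictNextWord <- function(phrase, topN = 3) {
  words <- tolower(stri_extract_all_words(phrase)[[1]])
  k <- length(words)
  if (k == 0 || all(is.na(words))) return(head(unigram$Content, topN))
  lookup <- function(df, prefix) {
    # Keep n-grams whose leading words match the typed prefix; suggest their last word
    hits <- df[startsWith(df$Content, paste0(prefix, " ")), ]
    if (nrow(hits) == 0) return(character(0))
    stri_extract_last_words(head(hits$Content, topN))
  }
  result <- character(0)
  if (k >= 2) result <- lookup(trigram, paste(words[(k - 1):k], collapse = " "))
  if (length(result) == 0) result <- lookup(bigram, words[k])
  if (length(result) == 0) result <- head(unigram$Content, topN)
  result
}
predictNextWord("happy new")

In the Shiny app, the saved ngrams.Rdata would be loaded once at startup so that each keystroke triggers only a cheap table lookup instead of re-tokenizing the corpus.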