This is the Milestone Report for the Coursera Data Science Capstone project. The goal of the project is to build a predictive text application that predicts the next word as the user types a sentence, much like today's smartphone keyboards built on SwiftKey's technology.
For this milestone report the following tasks are performed:
The data is downloaded from the link: CapstoneDataset
The required R packages are installed and the libraries loaded:
library(stringi)
library(NLP)
library(openNLP)
library(tm) # For Text mining
library(rJava)
library(RWeka) # tokenizer to create unigrams, bigrams, trigrams
library(RWekajars)
library(SnowballC) # Stemming
library(RColorBrewer) # Color palettes
library(qdap)
library(ggplot2) #visualization
The data is in the zip file Coursera-SwiftKey.zip, which contains text files. The zip file is downloaded to a local folder on the computer and unzipped; the following files are then used.
#URL<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
#download.file(URL,"c:/mahendra/coursera-capstone/data/Coursera-SwiftKey.zip")
#unzip(zipfile = "c:/mahendra/coursera-capstone/data/Coursera-SwiftKey.zip", exdir = "c:/mahendra/coursera-capstone/data/SwiftKey")
workingDir<-"c:/mahendra/coursera-capstone"
setwd(workingDir)
# Read each file in binary mode and close its connection after reading
file_con<-file("./data/SwiftKey/final/en_US/en_US.blogs.txt", open = "rb")
enUSblogs<-readLines(file_con, encoding = "UTF-8",skipNul = TRUE)
close(file_con)
file_con<-file("./data/SwiftKey/final/en_US/en_US.news.txt", open = "rb")
enUSnews<-readLines(file_con, encoding = "UTF-8",skipNul = TRUE)
close(file_con)
file_con<-file("./data/SwiftKey/final/en_US/en_US.twitter.txt", open = "rb")
enUStwitter<-readLines(file_con, encoding = "UTF-8",skipNul = TRUE)
close(file_con)
rm(file_con)
#capture the size of the 3 files
blogs.size <- file.info( "./data/SwiftKey/final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info( "./data/SwiftKey/final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info( "./data/SwiftKey/final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
# Capture number of lines in each file
length(enUSblogs)
## [1] 899288
length(enUSnews)
## [1] 1010242
length(enUStwitter)
## [1] 2360148
# Get words in files
blogs.words <- stri_count_words(enUSblogs)
news.words <- stri_count_words(enUSnews)
twitter.words <- stri_count_words(enUStwitter)
# Summary of the data sets
summary_ds<-data.frame(DataSet = c("Blogs", "News", "Twitter"),
FileSizeMB = c(blogs.size, news.size, twitter.size),
Lines = c(length(enUSblogs), length(enUSnews), length(enUStwitter)),
Words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
WordsPerLine=c(mean(blogs.words), mean(news.words), mean(twitter.words)))
library(knitr)
kable(summary_ds)
| DataSet | FileSizeMB | Lines | Words | WordsPerLine |
|---|---|---|---|---|
| Blogs | 200.4242 | 899288 | 37546239 | 41.75107 |
| News | 196.2775 | 1010242 | 34762395 | 34.40997 |
| Twitter | 159.3641 | 2360148 | 30093413 | 12.75065 |
As observed above, these files are very large. For the analysis we take a sample of 10,000 lines from each dataset.
set.seed(54321)
blog_sample<-sample(enUSblogs, size=10000,replace = TRUE)
news_sample<-sample(enUSnews, size=10000, replace = TRUE)
twitter_sample<-sample(enUStwitter,size = 10000, replace = TRUE)
sample_all<-c(blog_sample,news_sample,twitter_sample)
#Word count in Sample
sum(stri_count_words(sample_all))
## [1] 892468
# Number of lines in the sample
length(sample_all)
## [1] 30000
# Save the combined sample to a text file as a backup
file_con<-file("./data/sample_all.txt", open = "wt")
writeLines(sample_all, file_con)
close(file_con)
rm(file_con)
The combined sample contains 30,000 lines and 892,468 words.
Data cleaning involves the following steps:
1. Remove profanity (bad) words
2. Remove numbers, non-ASCII characters, punctuation, stop words, extra white space, etc.
3. Remove URLs, hashtags (#), and mentions (@)
file_con<-file("./data/sample_all.txt")
corSample<-readLines(file_con)
close(file_con)
rm(file_con)
# Remove non-ASCII characters
corSample<-iconv(corSample, "latin1", "ASCII", sub = "")
## Download the list of bad words
download.file("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt", "./data/bad-words.txt")
file_con<-file("./data/bad-words.txt", open = "rb")
badWords<-readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
close(file_con)
rm(file_con)
#Remove Non ASCII characters
badWords<- iconv(badWords, "latin1", "ASCII", sub = "")
# Remove Bad Words
corSample<-removeWords(corSample, badWords)
#create Corpus object from sample for using tm_map function
corSample<-Corpus(VectorSource(corSample))
# Remove Punctuations
corSample<-tm_map(corSample,removePunctuation)
# Remove Numbers
corSample<-tm_map(corSample, removeNumbers)
# convert to lower case
corSample<-tm_map(corSample, content_transformer(tolower))
# Remove English stop words (a, as, at, so, etc.)
corSample<-tm_map(corSample,removeWords, stopwords("en"))
# Strip unnecessary white space
corSample<-tm_map(corSample,stripWhitespace)
# Remove URLs
removeURL<-function(x) gsub("http[[:alnum:]]*", "", x)
corSample<-tm_map(corSample, content_transformer(removeURL) )
## Custom content transformers (the patterns below are regular expressions, so fixed = TRUE is not used)
toEmpty <- content_transformer(function(x, pattern) gsub(pattern, "", x))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corSample<-tm_map(corSample, toEmpty, "#\\w+")
corSample<-tm_map(corSample, toEmpty, "(\\b\\S+\\@\\S+\\..{1,3}(\\s)?\\b)")
corSample<-tm_map(corSample, toEmpty, "@\\w+")
corSample<-tm_map(corSample, toEmpty, "http[^[:space:]]*")
corSample<-tm_map(corSample, toSpace, "/|@|\\|")
#Save clean sample
cleanSample<-get("content", corSample)
save("cleanSample", file="./data/cleanSample.Rdata")
library(wordcloud)
suppressWarnings(wordcloud(cleanSample, max.words = 500, random.order = FALSE, colors=brewer.pal(9, "RdYlGn")))
## Function to create an n-gram frequency data frame, sorted by frequency
suppressMessages(library(quanteda))
tokenizedDF <- function(obj, n) {
  nGramSparse <- dfm(obj, ngrams = n, concatenator = " ")
  nGramDF <- data.frame(Content = featnames(nGramSparse), Frequency = colSums(nGramSparse),
                        row.names = NULL, stringsAsFactors = FALSE)
  # Order by decreasing frequency so the first rows are the most common n-grams
  nGramDF[order(nGramDF$Frequency, decreasing = TRUE), ]
}
## Function to plot the top 10 n-grams
top10Plot <- function(df, title) {
  ggplot(df[1:10,], aes(reorder(Content, Frequency), Frequency)) +
    labs(x = "Word(s)", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 90, size = 10, hjust = 1)) +
    coord_flip() +
    ggtitle(title) +
    geom_bar(stat = "identity", fill = I("purple4"))
}
unigram<-tokenizedDF(cleanSample, 1)
top10Plot(unigram,"Top 10 Unigrams")
bigram<-tokenizedDF(cleanSample,2)
top10Plot(bigram, "Top 10 Bi-grams")
trigram<-tokenizedDF(cleanSample,3)
top10Plot(trigram, "Top 10 Tri-grams")
After constructing the basic n-gram model, the next step toward the prediction algorithm is to find a balance between sample size and prediction accuracy. The plan is to build a Shiny app that provides a user interface for text input and displays a list of suggested next words. To keep the Shiny app responsive, the n-grams will be generated and saved in advance rather than computed at run time, which would result in a poor user experience.
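As a rough illustration of this plan, the sketch below shows one way the sorted n-gram frequency tables built above (unigram, bigram, trigram) could be saved once and later queried with a simple frequency-based backoff lookup. The file name ngrams.Rdata and the function predictNextWord are hypothetical placeholders, not the final design; the eventual algorithm (for example a smoothed or stupid-backoff model) is still to be decided.

library(stringi)
# A minimal sketch, assuming the unigram/bigram/trigram data frames created above,
# each with Content and Frequency columns already sorted by decreasing frequency.
# The file name and the predictNextWord helper are hypothetical placeholders.
save(unigram, bigram, trigram, file = "./data/ngrams.Rdata")

predictNextWord <- function(phrase, topN = 3) {
  words <- tolower(stri_extract_all_words(phrase)[[1]])
  k <- length(words)
  if (k == 0 || all(is.na(words))) return(head(unigram$Content, topN))
  lookup <- function(df, prefix) {
    # Keep n-grams whose leading words match the typed prefix; suggest their last word
    hits <- df[startsWith(df$Content, paste0(prefix, " ")), ]
    if (nrow(hits) == 0) return(character(0))
    stri_extract_last_words(head(hits$Content, topN))
  }
  result <- character(0)
  if (k >= 2) result <- lookup(trigram, paste(words[(k - 1):k], collapse = " "))
  if (length(result) == 0) result <- lookup(bigram, words[k])
  if (length(result) == 0) result <- head(unigram$Content, topN)
  result
}
predictNextWord("happy new")

In the Shiny app, the saved ngrams.Rdata would be loaded once at startup so that each keystroke triggers only a cheap table lookup instead of re-tokenizing the corpus.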