Introduction

This capstone project builds an application that predicts the next word of a phrase using natural language processing (NLP).

The goal of this exercise is to create a product that highlights the prediction algorithm and provides an interface that others can access. The deliverables are:

  1. A Shiny app that takes a phrase (multiple words) as input in a text box and outputs a prediction of the next word.

  2. A slide deck presentation.

The following steps were carried out prior to building the app.
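Before walking through those steps, here is a minimal sketch of the planned Shiny interface. It is illustrative only: predict_next_word() is a placeholder here, and a back-off version built from the n-gram tables in this report is sketched at the end.

library(shiny)

# placeholder predictor; the real one is built from the n-gram tables below
predict_next_word <- function(phrase) "the"

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText(predict_next_word(input$phrase))
}

shinyApp(ui = ui, server = server)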

Load Libraries

library(tm)
library(ggplot2)
library(RWeka)
library(R.utils)
library(dplyr)
library(wordcloud)
library(corpus)
library(ngram)
library(NLP)
library(openNLP)
library(SnowballC)

Download the Data from the Coursera Site (Capstone Dataset)

setwd("C:/Capstone Final")

download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip","datafile.zip")
unzip("datafile.zip")

Read the English Files from the en_US Folder

rtwit <- file("C:/Capstone Final/final/en_US/en_US.twitter.txt", "rb", encoding = "UTF-8")
readrtwit <- readLines(rtwit, skipNul = TRUE, warn = FALSE)
close(rtwit)

rblog <- file("C:/Capstone Final/final/en_US/en_US.blogs.txt", "rb", encoding = "UTF-8")
readrblog <- readLines(rblog, skipNul = TRUE, warn = FALSE)
close(rblog)

rnews <- file("C:/Capstone Final/final/en_US/en_US.news.txt", "rb", encoding = "UTF-8")
readrnews <- readLines(rnews, skipNul = TRUE, warn = FALSE)
close(rnews)

Exploratory Data Analysis

Basic summaries of the three files (en_US.twitter.txt, en_US.blogs.txt, en_US.news.txt):

  * Size
  * Number of lines
  * Word count

twit_size <- format(object.size(readrtwit), "MB")
twit_lines <- length(readrtwit)
twit_word <- wordcount(readrtwit, sep = " ", count.function = sum)

blog_size <- format(object.size(readrblog), "MB")
blog_lines <- length(readrblog)
blog_word <- wordcount(readrblog, sep = " ", count.function = sum)

news_size <- format(object.size(readrnews), "MB")
news_lines <- length(readrnews)
news_word <- wordcount(readrnews, sep = " ", count.function = sum)

file_size <- c(twit_size,blog_size,news_size)
file_linescount <- c(twit_lines,blog_lines,news_lines)
file_wordcount <- c(twit_word,blog_word,news_word)

df <- data.frame(Files = c("Twitter","Blog","News"),file_size,file_linescount,file_wordcount)

df
##     Files file_size file_linescount file_wordcount
## 1 Twitter  301.4 Mb         2360148       30373583
## 2    Blog  248.5 Mb          899288       37334131
## 3    News  249.6 Mb         1010242       37334131

Getting and Cleaning the Data

The full data set is large, so I work with a small random sample (0.1% of the lines in each file). This sample is then cleaned.

set.seed(65513)

samp_rtwit <- sample(readrtwit,length(readrtwit)*0.001,replace = TRUE)
samp_rblog <- sample(readrblog,length(readrblog)*0.001,replace = TRUE)
samp_rnews <- sample(readrnews,length(readrnews)*0.001,replace = TRUE)

sampleTotal <- c(samp_rtwit, samp_rblog, samp_rnews)
length(sampleTotal)
## [1] 4269
writeLines(sampleTotal, "C:/Capstone Final/samplefile.txt")

textCon <- file("C:/Capstone Final/samplefile.txt")

Convert the Data into a Corpus

corpdata <- readLines(textCon)
close(textCon)
corpdata <- VCorpus(VectorSource(corpdata))

Clean Data

# convert stray characters to valid UTF-8
corpdata <- tm_map(corpdata, content_transformer(function(x) iconv(x, to="UTF-8", sub="byte")))
# convert everything to lower case
corpdata <- tm_map(corpdata, content_transformer(tolower), lazy = TRUE)
# remove punctuation, keeping intra-word dashes
corpdata <- tm_map(corpdata, content_transformer(removePunctuation), preserve_intra_word_dashes=TRUE)

# remove URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
corpdata <- tm_map(corpdata, content_transformer(removeURL))
# remove common English stop words
corpdata <- tm_map(corpdata, removeWords, stopwords("english"))
# collapse repeated whitespace
corpdata <- tm_map(corpdata, stripWhitespace)
corpdata <- tm_map(corpdata, PlainTextDocument)
saveRDS(corpdata, file = "C:/Capstone Final/finalCorpus.RData")

final_corpdata <- readRDS("C:/Capstone Final/finalCorpus.RData")
final_corpdata_df <-data.frame(text=unlist(sapply(final_corpdata,`[`,"content")),stringsAsFactors= FALSE)

Tokenize Using TermDocumentMatrix

Use TermDocumentMatrix to tokenize the corpus with RWeka's NGramTokenizer. Tokenization breaks each sentence into a set of words or tokens according to the selected n-gram size.
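As a quick illustration of what the tokenizer produces, here is a toy sentence (not drawn from the corpus; the output shown is approximate):

NGramTokenizer("thanks for the follow", Weka_control(min = 2, max = 2))
## [1] "thanks for" "for the"    "the follow"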

onegram <- function(x) NGramTokenizer(x,Weka_control(min = 1, max = 1,delimiters=" \\r\\n\\t.,;:\"()?!"))
onegrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=onegram))
onegrammat
## <<TermDocumentMatrix (terms: 15526, documents: 4269)>>
## Non-/sparse entries: 51940/66228554
## Sparsity           : 100%
## Maximal term length: 62
## Weighting          : term frequency (tf)
onegrammat1 <- as.matrix(onegrammat)
onegramsort <- sort(rowSums(onegrammat1), decreasing = TRUE)
onegramsortdf <- data.frame(word = names(onegramsort),freq=onegramsort)
head(onegramsortdf)
##      word freq
## one   one  299
## will will  283
## just just  282
## said said  274
## like like  244
## can   can  243
ggplot(data = onegramsortdf[1:40,], aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", color = "gray", fill = "red") +
  coord_flip() +
  labs(title = "One Gram Plot") +
  xlab("Word") +
  ylab("Frequency Count")

wordcloud(onegramsortdf$word,onegramsortdf$freq,min.freq = 100,max.words = 100,random.order = FALSE, colors = brewer.pal(8,"Dark2"))

onegramsortdf$word <- as.character(onegramsortdf$word)
write.csv(onegramsortdf[onegramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/onegram.csv",row.names = F)
onegramsortdf <- read.csv("C:/Capstone Final/Milestone report/onegram.csv",stringsAsFactors = F)
saveRDS(onegramsortdf,file = "C:/Capstone Final/ShinyApp/onegram.RData")
bigram <- function(x) NGramTokenizer(x,Weka_control(min = 2, max = 2,delimiters = " \\r\\n\\t.,;:\"()?!"))
bigrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=bigram))
bigrammat
## <<TermDocumentMatrix (terms: 50099, documents: 4269)>>
## Non-/sparse entries: 52733/213819898
## Sparsity           : 100%
## Maximal term length: 72
## Weighting          : term frequency (tf)
bigrammat1 <- as.matrix(bigrammat)
bigramsort <- sort(rowSums(bigrammat1), decreasing = TRUE)
bigramsortdf <- data.frame(word = names(bigramsort),freq=bigramsort)
head(bigramsortdf)
##                word freq
## right now right now   36
## new york   new york   24
## last year last year   23
## dont know dont know   20
## years ago years ago   20
## make sure make sure   17
bigramsortdf$word <- as.character(bigramsortdf$word)
bisplit <- strsplit(bigramsortdf$word,split = " ")

bigramsortdf <- transform(bigramsortdf, one = sapply(bisplit,"[[",1), two = sapply(bisplit,"[[",2))
bigramsortdf <- data.frame(word1=bigramsortdf$one,word2=bigramsortdf$two,freq=bigramsortdf$freq,stringsAsFactors = FALSE)


write.csv(bigramsortdf[bigramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/bigram.csv",row.names = F)
onegramsortdf <- read.csv("C:/Capstone Final/Milestone report/bigram.csv",stringsAsFactors = F)
saveRDS(bigramsortdf,file = "C:/Capstone Final/ShinyApp/bigram.RData")
trigram <- function(x) NGramTokenizer(x,Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
trigrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=trigram))
trigrammat
## <<TermDocumentMatrix (terms: 48740, documents: 4269)>>
## Non-/sparse entries: 48867/208022193
## Sparsity           : 100%
## Maximal term length: 82
## Weighting          : term frequency (tf)
# optional: drop very sparse terms to save memory (the full matrix is used below)
trigrammat_sparse <- removeSparseTerms(trigrammat,0.99)
trigrammat1 <- as.matrix(trigrammat)
trigramsort <- sort(rowSums(trigrammat1), decreasing = TRUE)
trigramsortdf <- data.frame(word = names(trigramsort),freq=trigramsort)
head(trigramsortdf)
##                                          word freq
## happy mothers day           happy mothers day    8
## lead background vocals lead background vocals    5
## dont even know                 dont even know    4
## love love love                 love love love    4
## cant wait see                   cant wait see    3
## cents per share               cents per share    3
trigramsortdf$word <- as.character(trigramsortdf$word)

trisplit <- strsplit(trigramsortdf$word,split = " ")
trigramsortdf <- transform(trigramsortdf,one = sapply(trisplit,"[[",1),two = sapply(trisplit,"[[",2),three = sapply(trisplit,"[[",3) )

trigramsortdf <- data.frame(word1=trigramsortdf$one,word2=trigramsortdf$two,word3=trigramsortdf$three,freq=trigramsortdf$freq,stringsAsFactors = FALSE)

write.csv(trigramsortdf[trigramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/trigram.csv",row.names = F)
trigramsortdf <- read.csv("C:/Capstone Final/Milestone report/trigram.csv",stringsAsFactors = F)
saveRDS(trigramsortdf,file = "C:/Capstone Final/ShinyApp/trigram.RData")
quadgram <- function(x) NGramTokenizer(x,Weka_control(min = 4, max = 4,delimiters = " \\r\\n\\t.,;:\"()?!"))
quadgrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=quadgram))
quadgrammat
## <<TermDocumentMatrix (terms: 44931, documents: 4269)>>
## Non-/sparse entries: 44982/191765457
## Sparsity           : 100%
## Maximal term length: 94
## Weighting          : term frequency (tf)
# optional: drop very sparse terms to save memory (the full matrix is used below)
quadgrammat_sparse <- removeSparseTerms(quadgrammat,0.99)
quadgrammat1 <- as.matrix(quadgrammat)
quadgramsort <- sort(rowSums(quadgrammat1), decreasing = TRUE)
quadgramsortdf <- data.frame(word = names(quadgramsort),freq=quadgramsort)
head(quadgramsortdf)
##                                                              word freq
## 3133552 bottles that’s assuming   3133552 bottles that’s assuming    2
## 500 mgm2 max 1000                               500 mgm2 max 1000    2
## 63 cents per share                             63 cents per share    2
## 9400655 bottles chemical laden     9400655 bottles chemical laden    2
## alternative that’s still 3133552 alternative that’s still 3133552    2
## armed robbery five counts               armed robbery five counts    2
quadgramsortdf$word <- as.character(quadgramsortdf$word)

quadsplit <- strsplit(quadgramsortdf$word,split = " ")
quadgramsortdf <- transform(quadgramsortdf,one=sapply(quadsplit,"[[",1),two=sapply(quadsplit,"[[",2),three= sapply(quadsplit,"[[",3),four=sapply(quadsplit,"[[",4) )

quadgramsortdf <- data.frame(word1=quadgramsortdf$one,word2=quadgramsortdf$two,word3=quadgramsortdf$three,word4=quadgramsortdf$four,freq=quadgramsortdf$freq,stringsAsFactors = FALSE)

write.csv(quadgramsortdf[quadgramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/quadgram.csv",row.names = F)
quadgramsortdf <- read.csv("C:/Capstone Final/Milestone report/quadgram.csv",stringsAsFactors = F)
saveRDS(quadgramsortdf,file = "C:/Capstone Final/ShinyApp/quadgram.RData")
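
The word1 to word4 columns make these tables easy to query: to predict the next word, look up the last few words of the input phrase and return the most frequent continuation, backing off to shorter n-grams when no match is found. Below is a minimal sketch of such a back-off predictor using the saved tables; predict_next_word() is a hypothetical helper and the final Shiny app logic may differ.

library(dplyr)

bigramsortdf   <- readRDS("C:/Capstone Final/ShinyApp/bigram.RData")
trigramsortdf  <- readRDS("C:/Capstone Final/ShinyApp/trigram.RData")
quadgramsortdf <- readRDS("C:/Capstone Final/ShinyApp/quadgram.RData")

# try the quadgram table on the last three words, then the trigram table
# on the last two, then the bigram table on the last word
predict_next_word <- function(phrase) {
  words <- tolower(unlist(strsplit(trimws(phrase), "\\s+")))
  n <- length(words)
  if (n >= 3) {
    hit <- quadgramsortdf %>%
      filter(word1 == words[n-2], word2 == words[n-1], word3 == words[n]) %>%
      arrange(desc(freq))
    if (nrow(hit) > 0) return(hit$word4[1])
  }
  if (n >= 2) {
    hit <- trigramsortdf %>%
      filter(word1 == words[n-1], word2 == words[n]) %>%
      arrange(desc(freq))
    if (nrow(hit) > 0) return(hit$word3[1])
  }
  if (n >= 1) {
    hit <- bigramsortdf %>%
      filter(word1 == words[n]) %>%
      arrange(desc(freq))
    if (nrow(hit) > 0) return(hit$word2[1])
  }
  "the"  # fall back to a very common word
}

predict_next_word("happy mothers")  # e.g. "day", given the trigram counts above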

Observations and Future Plans