Introduction

This capstone project builds an application that predicts the next word of a phrase using natural language processing (NLP).

The goal of this exercise is to create a product that highlights the prediction algorithm and provides an interface that others can access. The deliverables are:

  1. A Shiny app that takes a phrase (multiple words) as input in a text box and outputs a prediction of the next word.

  2. A slide deck presentation.

The following steps were carried out prior to building the app.
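Before walking through those steps, here is a minimal sketch of the planned Shiny interface. It is illustrative only: predict_next_word() is a placeholder here, and a back-off version built from the n-gram tables in this report is sketched at the end.

library(shiny)

# placeholder predictor; the real one is built from the n-gram tables below
predict_next_word <- function(phrase) "the"

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText(predict_next_word(input$phrase))
}

shinyApp(ui = ui, server = server)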

Load Libraries

library(tm)
library(ggplot2)
library(RWeka)
library(R.utils)
library(dplyr)
library(wordcloud)
library(corpus)
library(ngram)
library(NLP)
library(openNLP)
library(SnowballC)

Download the Data from the Coursera Site (Capstone Dataset)

setwd("C:/Capstone Final")

download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip","datafile.zip")
unzip("datafile.zip")

Read the English Files from the en_US Folder

rtwit <- file("C:/Capstone Final/final/en_US/en_US.twitter.txt", "rb", encoding = "UTF-8")
readrtwit <- readLines(rtwit, skipNul = TRUE, warn = FALSE)
close(rtwit)

rblog <- file("C:/Capstone Final/final/en_US/en_US.blogs.txt", "rb", encoding = "UTF-8")
readrblog <- readLines(rblog, skipNul = TRUE, warn = FALSE)
close(rblog)

rnews <- file("C:/Capstone Final/final/en_US/en_US.news.txt", "rb", encoding = "UTF-8")
readrnews <- readLines(rnews, skipNul = TRUE, warn = FALSE)
close(rnews)

Exploratory Data Analysis

Basic summaries of the three files (en_US.twitter.txt, en_US.blogs.txt, en_US.news.txt):

  * Size
  * Number of lines
  * Word count

twit_size <- format(object.size(readrtwit), "MB")
twit_lines <- length(readrtwit)
twit_word <- wordcount(readrtwit, sep = " ", count.function = sum)

blog_size <- format(object.size(readrblog), "MB")
blog_lines <- length(readrblog)
blog_word <- wordcount(readrblog, sep = " ", count.function = sum)

news_size <- format(object.size(readrnews), "MB")
news_lines <- length(readrnews)
news_word <- wordcount(readrnews, sep = " ", count.function = sum)

file_size <- c(twit_size,blog_size,news_size)
file_linescount <- c(twit_lines,blog_lines,news_lines)
file_wordcount <- c(twit_word,blog_word,news_word)

df <- data.frame(Files = c("Twitter","Blog","News"),file_size,file_linescount,file_wordcount)

df
##     Files file_size file_linescount file_wordcount
## 1 Twitter  301.4 Mb         2360148       30373583
## 2    Blog  248.5 Mb          899288       37334131
## 3    News  249.6 Mb         1010242       37334131

Getting and Cleaning the Data

The full data set is large, so I work with a small random sample (0.1% of the lines in each file). This sample is then cleaned.

set.seed(65513)

samp_rtwit <- sample(readrtwit,length(readrtwit)*0.001,replace = TRUE)
samp_rblog <- sample(readrblog,length(readrblog)*0.001,replace = TRUE)
samp_rnews <- sample(readrnews,length(readrnews)*0.001,replace = TRUE)

sampleTotal <- c(samp_rtwit, samp_rblog, samp_rnews)
length(sampleTotal)
## [1] 4269
writeLines(sampleTotal, "C:/Capstone Final/samplefile.txt")

textCon <- file("C:/Capstone Final/samplefile.txt")

Convert the Data into a Corpus

corpdata <- readLines(textCon)
close(textCon)
corpdata <- VCorpus(VectorSource(corpdata))

Clean Data

# convert stray characters to valid UTF-8
corpdata <- tm_map(corpdata, content_transformer(function(x) iconv(x, to="UTF-8", sub="byte")))
# convert everything to lower case
corpdata <- tm_map(corpdata, content_transformer(tolower), lazy = TRUE)
# remove punctuation, keeping intra-word dashes
corpdata <- tm_map(corpdata, content_transformer(removePunctuation), preserve_intra_word_dashes=TRUE)

# remove URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
corpdata <- tm_map(corpdata, content_transformer(removeURL))
# remove common English stop words
corpdata <- tm_map(corpdata, removeWords, stopwords("english"))
# collapse repeated whitespace
corpdata <- tm_map(corpdata, stripWhitespace)
corpdata <- tm_map(corpdata, PlainTextDocument)
saveRDS(corpdata, file = "C:/Capstone Final/finalCorpus.RData")

final_corpdata <- readRDS("C:/Capstone Final/finalCorpus.RData")
final_corpdata_df <-data.frame(text=unlist(sapply(final_corpdata,`[`,"content")),stringsAsFactors= FALSE)

Tokenize Using TermDocumentMatrix

Use TermDocumentMatrix to tokenize the corpus with RWeka's NGramTokenizer. Tokenization breaks each sentence into a set of words or tokens according to the selected n-gram size.
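As a quick illustration of what the tokenizer produces, here is a toy sentence (not drawn from the corpus; the output shown is approximate):

NGramTokenizer("thanks for the follow", Weka_control(min = 2, max = 2))
## [1] "thanks for" "for the"    "the follow"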

onegram <- function(x) NGramTokenizer(x,Weka_control(min = 1, max = 1,delimiters=" \\r\\n\\t.,;:\"()?!"))
onegrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=onegram))
onegrammat
## <<TermDocumentMatrix (terms: 15526, documents: 4269)>>
## Non-/sparse entries: 51940/66228554
## Sparsity           : 100%
## Maximal term length: 62
## Weighting          : term frequency (tf)
onegrammat1 <- as.matrix(onegrammat)
onegramsort <- sort(rowSums(onegrammat1), decreasing = TRUE)
onegramsortdf <- data.frame(word = names(onegramsort),freq=onegramsort)
head(onegramsortdf)
##      word freq
## one   one  299
## will will  283
## just just  282
## said said  274
## like like  244
## can   can  243
ggplot(data = onegramsortdf[1:40,], aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", color = "gray", fill = "red") +
  coord_flip() +
  labs(title = "One Gram Plot") +
  xlab("Word") +
  ylab("Frequency Count")

wordcloud(onegramsortdf$word,onegramsortdf$freq,min.freq = 100,max.words = 100,random.order = FALSE, colors = brewer.pal(8,"Dark2"))

onegramsortdf$word <- as.character(onegramsortdf$word)
write.csv(onegramsortdf[onegramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/onegram.csv",row.names = F)
onegramsortdf <- read.csv("C:/Capstone Final/Milestone report/onegram.csv",stringsAsFactors = F)
saveRDS(onegramsortdf,file = "C:/Capstone Final/ShinyApp/onegram.RData")
bigram <- function(x) NGramTokenizer(x,Weka_control(min = 2, max = 2,delimiters = " \\r\\n\\t.,;:\"()?!"))
bigrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=bigram))
bigrammat
## <<TermDocumentMatrix (terms: 50099, documents: 4269)>>
## Non-/sparse entries: 52733/213819898
## Sparsity           : 100%
## Maximal term length: 72
## Weighting          : term frequency (tf)
bigrammat1 <- as.matrix(bigrammat)
bigramsort <- sort(rowSums(bigrammat1), decreasing = TRUE)
bigramsortdf <- data.frame(word = names(bigramsort),freq=bigramsort)
head(bigramsortdf)
##                word freq
## right now right now   36
## new york   new york   24
## last year last year   23
## dont know dont know   20
## years ago years ago   20
## make sure make sure   17
bigramsortdf$word <- as.character(bigramsortdf$word)
bisplit <- strsplit(bigramsortdf$word,split = " ")

bigramsortdf <- transform(bigramsortdf, one = sapply(bisplit,"[[",1), two = sapply(bisplit,"[[",2))
bigramsortdf <- data.frame(word1=bigramsortdf$one,word2=bigramsortdf$two,freq=bigramsortdf$freq,stringsAsFactors = FALSE)


write.csv(bigramsortdf[bigramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/bigram.csv",row.names = F)
onegramsortdf <- read.csv("C:/Capstone Final/Milestone report/bigram.csv",stringsAsFactors = F)
saveRDS(bigramsortdf,file = "C:/Capstone Final/ShinyApp/bigram.RData")
trigram <- function(x) NGramTokenizer(x,Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
trigrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=trigram))
trigrammat
## <<TermDocumentMatrix (terms: 48740, documents: 4269)>>
## Non-/sparse entries: 48867/208022193
## Sparsity           : 100%
## Maximal term length: 82
## Weighting          : term frequency (tf)
# optional: drop very sparse terms to save memory (the full matrix is used below)
trigrammat_sparse <- removeSparseTerms(trigrammat,0.99)
trigrammat1 <- as.matrix(trigrammat)
trigramsort <- sort(rowSums(trigrammat1), decreasing = TRUE)
trigramsortdf <- data.frame(word = names(trigramsort),freq=trigramsort)
head(trigramsortdf)
##                                          word freq
## happy mothers day           happy mothers day    8
## lead background vocals lead background vocals    5
## dont even know                 dont even know    4
## love love love                 love love love    4
## cant wait see                   cant wait see    3
## cents per share               cents per share    3
trigramsortdf$word <- as.character(trigramsortdf$word)

trisplit <- strsplit(trigramsortdf$word,split = " ")
trigramsortdf <- transform(trigramsortdf,one = sapply(trisplit,"[[",1),two = sapply(trisplit,"[[",2),three = sapply(trisplit,"[[",3) )

trigramsortdf <- data.frame(word1=trigramsortdf$one,word2=trigramsortdf$two,word3=trigramsortdf$three,freq=trigramsortdf$freq,stringsAsFactors = FALSE)

write.csv(trigramsortdf[trigramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/trigram.csv",row.names = F)
trigramsortdf <- read.csv("C:/Capstone Final/Milestone report/trigram.csv",stringsAsFactors = F)
saveRDS(trigramsortdf,file = "C:/Capstone Final/ShinyApp/trigram.RData")
quadgram <- function(x) NGramTokenizer(x,Weka_control(min = 4, max = 4,delimiters = " \\r\\n\\t.,;:\"()?!"))
quadgrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=quadgram))
quadgrammat
## <<TermDocumentMatrix (terms: 44931, documents: 4269)>>
## Non-/sparse entries: 44982/191765457
## Sparsity           : 100%
## Maximal term length: 94
## Weighting          : term frequency (tf)
# optional: drop very sparse terms to save memory (the full matrix is used below)
quadgrammat_sparse <- removeSparseTerms(quadgrammat,0.99)
quadgrammat1 <- as.matrix(quadgrammat)
quadgramsort <- sort(rowSums(quadgrammat1), decreasing = TRUE)
quadgramsortdf <- data.frame(word = names(quadgramsort),freq=quadgramsort)
head(quadgramsortdf)
##                                                              word freq
## 3133552 bottles that’s assuming   3133552 bottles that’s assuming    2
## 500 mgm2 max 1000                               500 mgm2 max 1000    2
## 63 cents per share                             63 cents per share    2
## 9400655 bottles chemical laden     9400655 bottles chemical laden    2
## alternative that’s still 3133552 alternative that’s still 3133552    2
## armed robbery five counts               armed robbery five counts    2
quadgramsortdf$word <- as.character(quadgramsortdf$word)

quadsplit <- strsplit(quadgramsortdf$word,split = " ")
quadgramsortdf <- transform(quadgramsortdf,one=sapply(quadsplit,"[[",1),two=sapply(quadsplit,"[[",2),three= sapply(quadsplit,"[[",3),four=sapply(quadsplit,"[[",4) )

quadgramsortdf <- data.frame(word1=quadgramsortdf$one,word2=quadgramsortdf$two,word3=quadgramsortdf$three,word4=quadgramsortdf$four,freq=quadgramsortdf$freq,stringsAsFactors = FALSE)

write.csv(quadgramsortdf[quadgramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/quadgram.csv",row.names = F)
quadgramsortdf <- read.csv("C:/Capstone Final/Milestone report/quadgram.csv",stringsAsFactors = F)
saveRDS(quadgramsortdf,file = "C:/Capstone Final/ShinyApp/quadgram.RData")
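
The word1 to word4 columns make these tables easy to query: to predict the next word, look up the last few words of the input phrase and return the most frequent continuation, backing off to shorter n-grams when no match is found. Below is a minimal sketch of such a back-off predictor using the saved tables; predict_next_word() is a hypothetical helper and the final Shiny app logic may differ.

library(dplyr)

bigramsortdf   <- readRDS("C:/Capstone Final/ShinyApp/bigram.RData")
trigramsortdf  <- readRDS("C:/Capstone Final/ShinyApp/trigram.RData")
quadgramsortdf <- readRDS("C:/Capstone Final/ShinyApp/quadgram.RData")

# try the quadgram table on the last three words, then the trigram table
# on the last two, then the bigram table on the last word
predict_next_word <- function(phrase) {
  words <- tolower(unlist(strsplit(trimws(phrase), "\\s+")))
  n <- length(words)
  if (n >= 3) {
    hit <- quadgramsortdf %>%
      filter(word1 == words[n-2], word2 == words[n-1], word3 == words[n]) %>%
      arrange(desc(freq))
    if (nrow(hit) > 0) return(hit$word4[1])
  }
  if (n >= 2) {
    hit <- trigramsortdf %>%
      filter(word1 == words[n-1], word2 == words[n]) %>%
      arrange(desc(freq))
    if (nrow(hit) > 0) return(hit$word3[1])
  }
  if (n >= 1) {
    hit <- bigramsortdf %>%
      filter(word1 == words[n]) %>%
      arrange(desc(freq))
    if (nrow(hit) > 0) return(hit$word2[1])
  }
  "the"  # fall back to a very common word
}

predict_next_word("happy mothers")  # e.g. "day", given the trigram counts above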

Observations and Future Plans