This is a Capstone Project to build an application that predicts the next word of a phrase using NLP.
The goal of this exercise is to create a product that highlights the prediction algorithm and provides an interface others can access. The deliverables are:
1. A Shiny app that takes a phrase (multiple words) as input in a text box and outputs a prediction of the next word.
2. A slide deck presentation.
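As a rough sketch of the planned interface (illustration only: predictNextWord() here is a stub standing in for the n-gram look-up developed later in this report), the app would look roughly like this:
library(shiny)
# Minimal sketch of the planned app; predictNextWord() is a placeholder for
# the back-off model built from the n-gram tables at the end of this report
predictNextWord <- function(phrase) "the"  # stub
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  textOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    req(input$phrase)
    predictNextWord(input$phrase)
  })
}
shinyApp(ui, server)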
library(tm)
library(ggplot2)
library(RWeka)
library(R.utils)
library(dplyr)
library(wordcloud)
library(corpus)
library(ngram)
library(NLP)
library(openNLP)
library(SnowballC)
setwd("C:/Capstone Final")
# Download and unzip the SwiftKey training data; download.file() returns a
# status code, so its result is not assigned
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "datafile.zip")
unzip("datafile.zip")
# Open each file in binary mode with UTF-8 encoding and read it line by line;
# skipNul = TRUE drops embedded nul characters. Each connection is closed
# after reading.
rtwit <- file("C:/Capstone Final/final/en_US/en_US.twitter.txt", "rb", encoding = "UTF-8")
readrtwit <- readLines(rtwit, skipNul = TRUE, warn = FALSE)
close(rtwit)
rblog <- file("C:/Capstone Final/final/en_US/en_US.blogs.txt", "rb", encoding = "UTF-8")
readrblog <- readLines(rblog, skipNul = TRUE, warn = FALSE)
close(rblog)
rnews <- file("C:/Capstone Final/final/en_US/en_US.news.txt", "rb", encoding = "UTF-8")
readrnews <- readLines(rnews, skipNul = TRUE, warn = FALSE)
close(rnews)
Basic summaries of the three files (en_US.twitter.txt, en_US.blogs.txt, en_US.news.txt):
* Size
* Number of lines
* Word count
twit_size <- format(object.size(readrtwit), "MB")
twit_lines <- length(readrtwit)
twit_word <- wordcount(readrtwit, sep = " ", count.function = sum)
blog_size <- format(object.size(readrblog), "MB")
blog_lines <- length(readrblog)
blog_word <- wordcount(readrblog, sep = " ", count.function = sum)
news_size <- format(object.size(readrnews), "MB")
news_lines <- length(readrnews)
news_word <- wordcount(readrnews, sep = " ", count.function = sum)
file_size <- c(twit_size,blog_size,news_size)
file_linescount <- c(twit_lines,blog_lines,news_lines)
file_wordcount <- c(twit_word,blog_word,news_word)
df <- data.frame(Files = c("Twitter","Blog","News"), file_size, file_linescount, file_wordcount)
df
## Files file_size file_linescount file_wordcount
## 1 Twitter 301.4 Mb 2360148 30373583
## 2 Blog 248.5 Mb 899288 37334131
## 3 News 249.6 Mb 1010242 37334131
The data is large, so I work with a very small random sample (0.1% of the lines from each file). This sample is then cleaned.
set.seed(65513)
# Keep a 0.1% random sample of each source, then combine the three samples
samp_rtwit <- sample(readrtwit, length(readrtwit) * 0.001, replace = TRUE)
samp_rblog <- sample(readrblog, length(readrblog) * 0.001, replace = TRUE)
samp_rnews <- sample(readrnews, length(readrnews) * 0.001, replace = TRUE)
sampleTotal <- c(samp_rtwit, samp_rblog, samp_rnews)
length(sampleTotal)
## [1] 4269
writeLines(sampleTotal, "C:/Capstone Final/samplefile.txt")
textCon <- file("C:/Capstone Final/samplefile.txt")
corpdata <- readLines(textCon)
close(textCon)
corpdata <- VCorpus(VectorSource(corpdata))
# Clean the corpus: normalise encoding, lower-case, strip punctuation,
# remove URLs, drop English stop words, and collapse extra whitespace
corpdata <- tm_map(corpdata, content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte")))
corpdata <- tm_map(corpdata, content_transformer(tolower), lazy = TRUE)
corpdata <- tm_map(corpdata, content_transformer(removePunctuation), preserve_intra_word_dashes = TRUE)
# URLs are stripped after punctuation removal, so each URL has collapsed into
# a single alphanumeric token that this pattern matches in full
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
corpdata <- tm_map(corpdata, content_transformer(removeURL))
corpdata <- tm_map(corpdata, removeWords, stopwords("english"))
corpdata <- tm_map(corpdata, stripWhitespace)
corpdata <- tm_map(corpdata, PlainTextDocument)
saveRDS(corpdata, file = "C:/Capstone Final/finalCorpus.RData")
final_corpdata <- readRDS("C:/Capstone Final/finalCorpus.RData")
final_corpdata_df <- data.frame(text = unlist(sapply(final_corpdata, `[`, "content")), stringsAsFactors = FALSE)
Use TermDocumentMatrix to tokenize the corpus data with RWeka's NGramTokenizer. Tokenization breaks each sentence into a set of words or tokens according to the selected n-gram size.
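As a quick illustration (separate from the analysis pipeline), a bigram tokenizer returns every adjacent word pair of its input; the ## lines show the expected output:
# Illustration only: tokenize a toy sentence into bigrams
NGramTokenizer("the quick brown fox", Weka_control(min = 2, max = 2))
## [1] "the quick"   "quick brown" "brown fox"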
onegram <- function(x) NGramTokenizer(x,Weka_control(min = 1, max = 1,delimiters=" \\r\\n\\t.,;:\"()?!"))
onegrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=onegram))
onegrammat
## <<TermDocumentMatrix (terms: 15526, documents: 4269)>>
## Non-/sparse entries: 51940/66228554
## Sparsity : 100%
## Maximal term length: 62
## Weighting : term frequency (tf)
onegrammat1 <- as.matrix(onegrammat)
onegramsort <- sort(rowSums(onegrammat1), decreasing = TRUE)
onegramsortdf <- data.frame(word = names(onegramsort),freq=onegramsort)
head(onegramsortdf)
## word freq
## one one 299
## will will 283
## just just 282
## said said 274
## like like 244
## can can 243
ggplot(data = onegramsortdf[1:40,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat = "identity", color = "gray", fill = "red") + coord_flip() + labs(title = "One Gram Plot") + xlab("Word") + ylab("Frequency Count")
wordcloud(onegramsortdf$word,onegramsortdf$freq,min.freq = 100,max.words = 100,random.order = FALSE, colors = brewer.pal(8,"Dark2"))
onegramsortdf$word <- as.character(onegramsortdf$word)
write.csv(onegramsortdf[onegramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/onegram.csv",row.names = F)
onegramsortdf <- read.csv("C:/Capstone Final/Milestone report/onegram.csv",stringsAsFactors = F)
saveRDS(onegramsortdf,file = "C:/Capstone Final/ShinyApp/onegram.RData")
bigram <- function(x) NGramTokenizer(x,Weka_control(min = 2, max = 2,delimiters = " \\r\\n\\t.,;:\"()?!"))
bigrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=bigram))
bigrammat
## <<TermDocumentMatrix (terms: 50099, documents: 4269)>>
## Non-/sparse entries: 52733/213819898
## Sparsity : 100%
## Maximal term length: 72
## Weighting : term frequency (tf)
bigrammat1 <- as.matrix(bigrammat)
bigramsort <- sort(rowSums(bigrammat1), decreasing = TRUE)
bigramsortdf <- data.frame(word = names(bigramsort),freq=bigramsort)
head(bigramsortdf)
## word freq
## right now right now 36
## new york new york 24
## last year last year 23
## dont know dont know 20
## years ago years ago 20
## make sure make sure 17
bigramsortdf$word <- as.character(bigramsortdf$word)
# Split each bigram into its component words so the prediction step can
# match on the leading word
bisplit <- strsplit(bigramsortdf$word, split = " ")
bigramsortdf <- transform(bigramsortdf, one = sapply(bisplit, "[[", 1), two = sapply(bisplit, "[[", 2))
bigramsortdf <- data.frame(word1 = bigramsortdf$one, word2 = bigramsortdf$two, freq = bigramsortdf$freq, stringsAsFactors = FALSE)
write.csv(bigramsortdf[bigramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/bigram.csv",row.names = F)
bigramsortdf <- read.csv("C:/Capstone Final/Milestone report/bigram.csv", stringsAsFactors = F)
saveRDS(bigramsortdf,file = "C:/Capstone Final/ShinyApp/bigram.RData")
trigram <- function(x) NGramTokenizer(x,Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
trigrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=trigram))
trigrammat
## <<TermDocumentMatrix (terms: 48740, documents: 4269)>>
## Non-/sparse entries: 48867/208022193
## Sparsity : 100%
## Maximal term length: 82
## Weighting : term frequency (tf)
# Optional sparsity reduction; the full matrix is still used below
trigrammat_sparse <- removeSparseTerms(trigrammat, 0.99)
trigrammat1 <- as.matrix(trigrammat)
trigramsort <- sort(rowSums(trigrammat1), decreasing = TRUE)
trigramsortdf <- data.frame(word = names(trigramsort),freq=trigramsort)
head(trigramsortdf)
## word freq
## happy mothers day happy mothers day 8
## lead background vocals lead background vocals 5
## dont even know dont even know 4
## love love love love love love 4
## cant wait see cant wait see 3
## cents per share cents per share 3
trigramsortdf$word <- as.character(trigramsortdf$word)
trisplit <- strsplit(trigramsortdf$word,split = " ")
trigramsortdf <- transform(trigramsortdf,one = sapply(trisplit,"[[",1),two = sapply(trisplit,"[[",2),three = sapply(trisplit,"[[",3) )
trigramsortdf <- data.frame(word1=trigramsortdf$one,word2=trigramsortdf$two,word3=trigramsortdf$three,freq=trigramsortdf$freq,stringsAsFactors = FALSE)
write.csv(trigramsortdf[trigramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/trigram.csv",row.names = F)
trigramsortdf <- read.csv("C:/Capstone Final/Milestone report/trigram.csv",stringsAsFactors = F)
saveRDS(trigramsortdf,file = "C:/Capstone Final/ShinyApp/trigram.RData")
quadgram <- function(x) NGramTokenizer(x,Weka_control(min = 4, max = 4,delimiters = " \\r\\n\\t.,;:\"()?!"))
quadgrammat <- TermDocumentMatrix(corpdata,control = list(tokenize=quadgram))
quadgrammat
## <<TermDocumentMatrix (terms: 44931, documents: 4269)>>
## Non-/sparse entries: 44982/191765457
## Sparsity : 100%
## Maximal term length: 94
## Weighting : term frequency (tf)
# As above, an optional sparsity reduction (the full matrix is used below)
quadgrammat_sparse <- removeSparseTerms(quadgrammat, 0.99)
quadgrammat1 <- as.matrix(quadgrammat)
quadgramsort <- sort(rowSums(quadgrammat1), decreasing = TRUE)
quadgramsortdf <- data.frame(word = names(quadgramsort),freq=quadgramsort)
head(quadgramsortdf)
## word freq
## 3133552 bottles thats assuming 3133552 bottles thats assuming 2
## 500 mgm2 max 1000 500 mgm2 max 1000 2
## 63 cents per share 63 cents per share 2
## 9400655 bottles chemical laden 9400655 bottles chemical laden 2
## alternative thats still 3133552 alternative thats still 3133552 2
## armed robbery five counts armed robbery five counts 2
quadgramsortdf$word <- as.character(quadgramsortdf$word)
quadsplit <- strsplit(quadgramsortdf$word,split = " ")
quadgramsortdf <- transform(quadgramsortdf,one=sapply(quadsplit,"[[",1),two=sapply(quadsplit,"[[",2),three= sapply(quadsplit,"[[",3),four=sapply(quadsplit,"[[",4) )
quadgramsortdf <- data.frame(word1=quadgramsortdf$one,word2=quadgramsortdf$two,word3=quadgramsortdf$three,word4=quadgramsortdf$four,freq=quadgramsortdf$freq,stringsAsFactors = FALSE)
write.csv(quadgramsortdf[quadgramsortdf$freq > 1,],"C:/Capstone Final/Milestone report/quadgram.csv",row.names = F)
quadgramsortdf <- read.csv("C:/Capstone Final/Milestone report/quadgram.csv",stringsAsFactors = F)
saveRDS(quadgramsortdf,file = "C:/Capstone Final/ShinyApp/quadgram.RData")
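These saved tables are the inputs to the prediction step in the Shiny app. The sketch below shows one plausible look-up, assuming a simple back-off from quadgrams down to unigrams; it illustrates the intended use of the tables rather than the final app code, and the inline cleaning is a rough stand-in for the corpus cleaning above:
# Back-off sketch (illustration only), assuming the tables saved above
onegramsortdf  <- readRDS("C:/Capstone Final/ShinyApp/onegram.RData")
bigramsortdf   <- readRDS("C:/Capstone Final/ShinyApp/bigram.RData")
trigramsortdf  <- readRDS("C:/Capstone Final/ShinyApp/trigram.RData")
quadgramsortdf <- readRDS("C:/Capstone Final/ShinyApp/quadgram.RData")
predictNextWord <- function(phrase) {
  # Rough stand-in for the corpus cleaning: lower-case and strip punctuation
  words <- strsplit(tolower(gsub("[[:punct:]]", "", phrase)), "\\s+")[[1]]
  n <- length(words)
  # Try the longest matching n-gram first, then back off to shorter ones;
  # each table is sorted by frequency, so the first match is the most likely
  if (n >= 3) {
    hit <- subset(quadgramsortdf, word1 == words[n - 2] & word2 == words[n - 1] & word3 == words[n])
    if (nrow(hit) > 0) return(hit$word4[1])
  }
  if (n >= 2) {
    hit <- subset(trigramsortdf, word1 == words[n - 1] & word2 == words[n])
    if (nrow(hit) > 0) return(hit$word3[1])
  }
  if (n >= 1) {
    hit <- subset(bigramsortdf, word1 == words[n])
    if (nrow(hit) > 0) return(hit$word2[1])
  }
  # Fall back to the most frequent single word
  as.character(onegramsortdf$word[1])
}
predictNextWord("cant wait")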