The goal of this project is to build a model and a Shiny app that predict the next word(s) (1st, 2nd, and 3rd choice) from text typed in by the user. In this process we use the Corpus data structure provided by tm, the text mining framework in R. Blogs, news, and twitter data (one large file of each) are loaded from the source directory into memory, and their statistics are summarized in a data frame. The number of lines, words, and characters in each file is printed to understand the distribution of, and relationship between, the data sets. A sample is then prepared from each file and combined into a Corpus object, which is cleaned using standard Corpus techniques (stop words, swear words, whitespace stripping, punctuation removal, etc.). Finally, the RWeka library is loaded to build n-gram tokenizers (1-gram, 2-gram, and 3-gram), which are executed on the data to explore word and word-pair frequencies through plots.
The data files were downloaded from the Capstone Dataset to the filesystem. Source folders are set up for the original data files, the sample files, and the swear words file. The code below checks for the original source folder and, if it does not exist, downloads the archive and unzips it.
#Loading the tm library, suppressing messages and warnings
suppressMessages(suppressWarnings(library(tm)))
#source directory for data
sourcedir <- "C:/Data/Development/datascience/coursera/R/data/DataScienceCapstone/final/en_US"
sampledir <- "C:/Data/Development/datascience/coursera/R/data/DataScienceCapstone/sample/en_US"
swearwordsdir <- "C:/Data/Development/datascience/coursera/R/data/DataScienceCapstone/swearwords"
if(!file.exists(sourcedir)){
webURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(webURL,"C:/Data/Development/datascience/coursera/R/data/DataScienceCapstone/Coursera-SwiftKey.zip", mode = "wb")
#unzip to target folder
unzip("C:/Data/Development/datascience/coursera/R/data/DataScienceCapstone/Coursera-SwiftKey.zip",
exdir = "C:/Data/Development/datascience/coursera/R/data/DataScienceCapstone")
}
twittertext <- readTxtFile(sourcedir,"en_US.twitter.txt")
blogtext <- readTxtFile(sourcedir,"en_US.blogs.txt")
newstext <- readTxtFile(sourcedir,"en_US.news.txt")
#Data frame to print file statistics
docs_details <- data.frame("Source" = c("blogs","news","twitter"),
"FileSize"=sapply(list(blogtext,newstext,twittertext), docSize),
"Length"=sapply(list(blogtext,newstext,twittertext), docLength),
"Characters"=sapply(list(blogtext,newstext,twittertext), docChars))
#summary
docs_details
##    Source FileSize  Length Characters
## 1   blogs 248.5 Mb  899288  208361438
## 2    news 249.6 Mb 1010242  203791405
## 3 twitter 301.4 Mb 2360148  162384825
Preparing sample data files (2% of the raw data) and storing them in the filesystem for analysis
#Sampling data: taking a 2% random sample of each file
writeTxtFile(sampledir,iconvText(sample(twittertext, docLength(twittertext) * 0.02)),"sample.twitter.txt")
writeTxtFile(sampledir,iconvText(sample(blogtext, docLength(blogtext) * 0.02)),"sample.blogs.txt")
writeTxtFile(sampledir,iconvText(sample(newstext, docLength(newstext) * 0.02)),"sample.news.txt")
The sample files are then combined into a Corpus and cleaned with the following steps:
. remove special characters
. replace selected characters with spaces
. remove punctuation
. remove repeated characters
. remove numbers
. remove links
. strip white spaces
. remove stop words
. convert to lowercase
#Loading libraries
suppressMessages(suppressWarnings(library(SnowballC)))
suppressMessages(suppressWarnings(library(RWeka)))
docs <- Corpus(DirSource(sampledir))
#[1] "sample.blogs.txt" "sample.news.txt" "sample.twitter.txt"
docs
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
Cleaning the Corpus and printing the 10 most frequent words
#cleaning corpus text
docs <- tm_map(docs, replaceWithSpace,"-")
docs <- tm_map(docs, removeChar,"`")
docs <- tm_map(docs, removeChar,"´")
docs <- tm_map(docs, removeChar,"!")
docs <- tm_map(docs, replaceWithSpace,":")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeRepeatedChars)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeLinks)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, PlainTextDocument)
#docs <- tm_map(docs, stemDocument)
#printing the 10 most frequent words
printWordFrequency(docs, 10)
## term occurrences
## that that 20363
## with with 14023
## this this 10909
## have have 10366
## from from 7473
## they they 6325
## will will 6297
## your your 6103
## just just 6090
## said said 6068
#DTM for the Corpus - words of length 4 to 20 appearing across the three files
dtmr <- DocumentTermMatrix(docs, control=list(wordLengths=c(4,20), bounds = list(global = c(3,27))))
crpsfreq <- colSums(as.matrix(dtmr))
Printing a Corpus word cloud to check word frequencies before removing stop words and profanity
Note: Please check ‘Common functions’ listed below
suppressMessages(suppressWarnings(library(wordcloud)))
set.seed(9)
pal <- brewer.pal(9,"YlGnBu")
pal <- pal[-(1:4)]
#ordrfq <- order(crpsfreq, decreasing = TRUE)
wordcloud(names(crpsfreq),crpsfreq, min.freq = 70, scale=c(2,0.5), max.words=200, random.order=FALSE,
rot.per=0.35, use.r.layout=FALSE, colors=pal)
rm(dtmr)
rm(crpsfreq)
A profanity word list (referred to here as swear words) was downloaded and placed in the filesystem. These profanity words are removed from the Corpus.
English stop words are also removed to obtain a more meaningful word frequency.
The most frequent remaining words are then printed.
#Loading profanity words
swearwords <- readTxtFile(swearwordsdir,"swearWords.txt")
#Removing English stop words to get a more useful word count
docs <- tm_map(docs, removeWords, stopwords('english'))
#Profanity filtering - removing profanity and other words we do not want to predict
docs <- tm_map(docs, removeWords, swearwords)
#Checking the frequency of words with length 4 to 20 across the three files
printWordFrequency(docs, 10)
## term occurrences
## will will 6297
## just just 6090
## said said 6068
## like like 5459
## time time 4502
## good good 3532
## love love 3220
## know know 3170
## people people 3128
## back back 2849
#findFreqTerms(docs, lowfreq = 200)
Creating 1-gram, 2-gram, and 3-gram tokenizers and building a TermDocumentMatrix for each to proceed with the analysis.
The Corpus object is removed from memory after use; it is not referenced in any further analysis.
Executing the 1-gram, 2-gram, and 3-gram tokenizers on the data:
options(mc.cores=1)
#Tokenizers
unitokenizer <- function(x){NGramTokenizer(x,Weka_control(min=1, max=1))}
bitokenizer <- function(x){NGramTokenizer(x,Weka_control(min=2, max=2))}
tritokenizer <- function(x){NGramTokenizer(x,Weka_control(min=3, max=3))}
#Tokenization
UniMatrix <- TermDocumentMatrix(docs, control=list(tokenize=unitokenizer))
BiMatrix <- TermDocumentMatrix(docs, control=list(tokenize=bitokenizer))
TriMatrix <- TermDocumentMatrix(docs, control=list(tokenize=tritokenizer))
#removing docs from memory
rm(docs)
gc(reset = TRUE)
suppressMessages(suppressWarnings(library(ggplot2)))
unifreq <- rowSums(as.matrix(UniMatrix))
unifreq <- unifreq[order(unifreq,decreasing = TRUE)]
#Quantiles
printQuantile(unifreq)
## 0% 25% 50% 75% 80% 90% 95% 100%
## 1 1 1 4 6 16 41 6297
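The quantiles show that at least half of the unigrams occur only once. As a rough illustration of the memory optimization mentioned in the conclusions below, such rare terms could be pruned before building the final lookup tables. This is a minimal sketch; the cut-off of 5 occurrences is an arbitrary assumption, not part of this analysis:
#Sketch: prune rare unigrams to shrink the dictionary (cut-off is illustrative only)
unifreq_pruned <- unifreq[unifreq >= 5]
length(unifreq)        #full vocabulary size
length(unifreq_pruned) #vocabulary size after pruning rare terms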
Printing a word cloud of the unigram frequencies
set.seed(9)
pal <- brewer.pal(9,"YlGnBu")
pal <- pal[-(1:4)]
wordcloud(names(unifreq),unifreq,min.freq = 70, scale=c(2,0.5), max.words=200, random.order=FALSE,
rot.per=0.35, use.r.layout=FALSE, colors=pal)
Plotting the most frequent words
printGgPlot(unifreq)
#Bigram frequencies for unique word pairs
bifreq <- rowSums(as.matrix(BiMatrix))
bifreq <- bifreq[order(bifreq,decreasing = TRUE)]
#Quantiles
printQuantile(bifreq)
## 0% 25% 50% 75% 80% 90% 95% 100%
## 1 1 1 1 1 2 2 502
printGgPlot(bifreq)
trifreq <- rowSums(as.matrix(TriMatrix))
trifreq <- trifreq[order(trifreq,decreasing = TRUE)]
#Quantiles
printQuantile(trifreq)
## 0% 25% 50% 75% 80% 90% 95% 100%
## 1 1 1 1 1 1 1 68
printGgPlot(trifreq)
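Looking ahead to the prediction model, the trigram counts computed above can be reshaped into a prefix/next-word lookup table. Below is a minimal sketch using the trifreq vector from this analysis; the data frame layout and column names are illustrative assumptions, not the final design:
#Sketch: split each trigram into a two-word prefix and the word that follows it
triparts <- strsplit(names(trifreq), " ", fixed = TRUE)
trilookup <- data.frame(prefix = sapply(triparts, function(w) paste(w[1:2], collapse = " ")),
nextword = sapply(triparts, function(w) w[3]),
freq = as.integer(trifreq),
stringsAsFactors = FALSE)
#most frequent prefix/next-word pairs
head(trilookup[order(trilookup$freq, decreasing = TRUE), ])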
Computing how many unique words are needed to cover a given fraction of all word instances in the sample:
. 50% Coverage
. 90% Coverage
. 95% Coverage
Note: Please check ‘Common functions’ listed below
#50% coverage
dictCoverage(as.data.frame(unifreq), 0.50)
## [1] 990
#90% coverage
dictCoverage(as.data.frame(unifreq), 0.90)
## [1] 15615
#95% coverage
dictCoverage(as.data.frame(unifreq), 0.95)
## [1] 32203
Higher coverage targets require substantially more words from the dictionary.
This analysis suggests that the bigram and trigram models alone will not be sufficient, as they cannot predict much so far. We could increase the sample size and develop a probabilistic back-off approach for cases where higher-order n-grams (e.g. quad-grams) are unobserved. However, increasing the sample size requires more computational power and affects the efficiency of the application.
The following steps will be performed for the final analysis, model building, and prediction (a minimal back-off sketch is shown after this list):
. Execute n-gram modelling against the full data files, optimizing the model for low memory consumption
. Develop the final model as a Shiny app / data product
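As a rough preview of that prediction step, the unigram, bigram, and trigram frequency vectors built above could feed a simple back-off lookup: try the trigram counts first, fall back to bigrams, and finally to the most frequent unigrams. This is a minimal sketch, not the final model; the function name predictNextWord and the back-off rule are illustrative assumptions:
#Sketch: naive back-off prediction using the n-gram frequency vectors built above
predictNextWord <- function(phrase, n = 3) {
words <- unlist(strsplit(tolower(phrase), "\\s+"))
#try trigrams: match on the last two typed words
if (length(words) >= 2) {
prefix <- paste(tail(words, 2), collapse = " ")
hits <- trifreq[startsWith(names(trifreq), paste0(prefix, " "))]
if (length(hits) > 0) return(head(sub(".* ", "", names(sort(hits, decreasing = TRUE))), n))
}
#fall back to bigrams: match on the last typed word
if (length(words) >= 1) {
prefix <- tail(words, 1)
hits <- bifreq[startsWith(names(bifreq), paste0(prefix, " "))]
if (length(hits) > 0) return(head(sub(".* ", "", names(sort(hits, decreasing = TRUE))), n))
}
#last resort: the most frequent unigrams overall
head(names(unifreq), n)
}
predictNextWord("thanks for")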
#Common functions used throughout this analysis
readTxtFile <- function(srcdir,x)
{
con <- file(paste(srcdir,sep = "/",x),'rb',blocking = FALSE)
txtFile <- readLines(con, warn = FALSE)
close(con)
return(txtFile)
}
writeTxtFile <- function(srcdir,d,x)
{
con <- file(paste(srcdir,sep = "/",x),blocking = FALSE,raw = FALSE,open = "wt", encoding = "UTF-8")
writeLines(d,con)
close(con)
}
printWordFrequency <- function(crps, nm){
dtmtrx <- DocumentTermMatrix(crps, control=list(wordLengths=c(4,20), bounds = list(global = c(1,3))))
frqncy <- colSums(as.matrix(dtmtrx))
#Word frequency data frame
wf = data.frame(term=names(frqncy), occurrences=frqncy)
#returning the nm most frequent terms
return(head(wf[with(wf, order(-occurrences, term)), ], nm))
}
printGgPlot <- function(dtmarix) {
freq <- sort(rowSums(as.matrix(dtmarix)), decreasing=TRUE)
freq_frame <- data.frame(Word=names(freq), Frequency=freq)
#keep the 25 most frequent terms
freq_frame <- head(freq_frame, 25)
ggplot(freq_frame, aes(x=Word,y=Frequency)) +
geom_bar(stat="identity", fill="#999999") +
geom_text(aes(label=Frequency), vjust=-0.20) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 15)) +
theme(axis.text.y = element_text(hjust = 1, size = 15)) +
theme(axis.title.x = element_text(size = 18)) +
theme(axis.title.y = element_text(size = 18))
}
printQuantile <- function(dfreq){
iquant <- quantile(dfreq,probs=c(0,25,50,75,80,90,95,100)/100)
print(iquant)
}
#Checking dictionary coverage %
dictCoverage <- function (tdmtrx, prcnt)
{
freq <- 0
prcntfreq <- prcnt * sum(tdmtrx[1])
for (i in 1:nrow(tdmtrx)) {
if (freq >= prcntfreq) {
return (i)
}
freq <- freq + tdmtrx[i, 1]
}
return (nrow(tdmtrx))
}
#helpers for encoding conversion and file statistics
iconvText <- function(x){return(iconv(x, to='ASCII//TRANSLIT'))}
docSize <- function(x){format(object.size(x),"MB")}
docLength <- function(x){return(length(x))}
docChars <- function(x) {sum(nchar(x))}
getTokenizer <- function(crps,mi,mx){return (NGramTokenizer(crps,Weka_control(min=mi, max=mx)))}
#regex functions
replaceWithSpace <- content_transformer(function(x, pattern){return (gsub(pattern,' ', x))})
removeChar <- content_transformer(function(x, pattern){return (gsub(pattern,'', x))})
removeLinks <- content_transformer(function(x){return(gsub("http[^[:space:]]*", "", x))})
#collapse characters repeated more than twice down to two occurrences
removeRepeatedChars <- content_transformer(function(x){return(gsub('([[:alpha:]])\\1{2,}', '\\1\\1', x))})
#End common functions code