# Loading libraries
library(tm)
## Loading required package: NLP
library(stringi)
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
This report shows my progress and ideas for completing the Capstone Project. I will briefly summarize my plans for creating the prediction algorithm and the Shiny app, starting with basic summary statistics of the data.
The data consists of three sets of English text: blogs, news, and Twitter posts.
# Open read connections to the three source files and read them as UTF-8
conBlogs <- file("..\\Coursera-SwiftKey\\en_US\\en_US.blogs.txt", "r")
conNews <- file("..\\Coursera-SwiftKey\\en_US\\en_US.news.txt", "r")
conTwitter <- file("..\\Coursera-SwiftKey\\en_US\\en_US.twitter.txt", "r")
blogs <- readLines(conBlogs, encoding="UTF-8")
news <- readLines(conNews, encoding="UTF-8")
twitter <- readLines(conTwitter, encoding="UTF-8")
close(conBlogs)
close(conNews)
close(conTwitter)
The blogs file has 899,288 lines, the news file has 77,259 lines, and the Twitter file has 2,360,148 lines.
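These counts can be checked directly on the vectors read above; a minimal check:
# Number of lines read from each source
length(blogs)    # 899288
length(news)     # 77259
length(twitter)  # 2360148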
In order to make the computation faster, I am going to work with a random sample of the data.
# Draw a random sample of 1000 lines from each source and combine them
sampleSize <- 1000
sampleBlogs <- blogs[sample(1:length(blogs), sampleSize)]
sampleNews <- news[sample(1:length(news), sampleSize)]
sampleTwitter <- twitter[sample(1:length(twitter), sampleSize)]
sampleAll <- c(sampleTwitter, sampleNews, sampleBlogs)
# Save the combined sample to disk and read it back in
writeLines(sampleAll, "sampleAll.txt")
sampleAllCon <- file("sampleAll.txt")
sampleAll <- readLines(sampleAllCon)
close(sampleAllCon)
Together this gives a sample of 3,000 lines of text and approximately 87,899 words.
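One way to obtain such a word count is with the stringi package loaded above (the exact figure depends on the tokenization rules):
# Approximate word count of the combined sample
sum(stri_count_words(sampleAll))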
# Build a corpus and clean it: replace some special characters with spaces,
# convert to lower case, then remove numbers, punctuation, English stop words,
# and extra whitespace
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
sampleClean <- VCorpus(VectorSource(sampleAll))
sampleClean <- tm_map(sampleClean, toSpace, "/|@|\\|")
sampleClean <- tm_map(sampleClean, content_transformer(tolower))
sampleClean <- tm_map(sampleClean, content_transformer(removeNumbers))
sampleClean <- tm_map(sampleClean, content_transformer(removePunctuation))
sampleClean <- tm_map(sampleClean, removeWords, stopwords("english"))
sampleClean <- tm_map(sampleClean, stripWhitespace)
sampleClean
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3000
# Turn a term-document matrix into a data frame of terms sorted by frequency
word_freq <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  freq_df <- data.frame(word=names(freq), freq=freq)
  return(freq_df)
}
# Build an n-gram tokenizer of the given length for TermDocumentMatrix
myTokenizer <- function(len){
  function(x) NGramTokenizer(x, Weka_control(min=len, max=len))
}
# Term-document matrices and frequency tables for uni-, bi-, and trigrams
tdm1gram <- TermDocumentMatrix(sampleClean)
freq1gram <- word_freq(tdm1gram)
tdm2gram <- TermDocumentMatrix(sampleClean, control=list(tokenize=myTokenizer(2)))
freq2gram <- word_freq(tdm2gram)
tdm3gram <- TermDocumentMatrix(sampleClean, control=list(tokenize=myTokenizer(3)))
freq3gram <- word_freq(tdm3gram)
ggplot(freq1gram[1:10, ], aes(x=reorder(word, freq), y=freq, fill=freq)) +
  geom_bar(stat="identity") +
  theme_bw() +
  coord_flip() +
  theme(axis.title.y=element_blank()) +
  labs(y="Frequency", title="Most common unigrams in text sample")
ggplot(freq2gram[1:10, ], aes(x=reorder(word, freq), y=freq, fill=freq)) +
  geom_bar(stat="identity") +
  theme_bw() +
  coord_flip() +
  theme(axis.title.y=element_blank()) +
  labs(y="Frequency", title="Most common bigrams in text sample")
ggplot(freq3gram[1:10, ], aes(x=reorder(word, freq), y=freq, fill=freq)) +
  geom_bar(stat="identity") +
  theme_bw() +
  coord_flip() +
  theme(axis.title.y=element_blank()) +
  labs(y="Frequency", title="Most common trigrams in text sample")
At this stage I was able to clean the corpus somewhat. Some artifacts still remain, though, and this step needs more attention.
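Further transformations along these lines could be added to the cleaning pipeline; this is only a sketch of possible next steps, not part of the processing above:
# Possible additional cleaning steps (sketch): strip URLs and non-ASCII
# characters; these would be applied with tm_map() early in the pipeline,
# before the lower-casing and punctuation removal above
removeURL <- content_transformer(function(x) gsub("http\\S+|www\\.\\S+", " ", x))
toASCII <- content_transformer(function(x) iconv(x, "UTF-8", "ASCII", sub = " "))
sampleClean <- tm_map(sampleClean, removeURL)
sampleClean <- tm_map(sampleClean, toASCII)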
The plan for prediction is to first look for the highest-order n-gram matching the end of the user's input; if that fails, back off to a shorter n-gram, and so on down to single words.
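A rough sketch of that backoff lookup, assuming the frequency tables built above (freq3gram, freq2gram, freq1gram) are kept as lookup tables; predictNext is a hypothetical name:
# Hypothetical backoff lookup: match the last two words against trigrams,
# then the last word against bigrams, and finally fall back to the most
# frequent unigrams (the tables are already sorted by decreasing frequency)
predictNext <- function(input, n = 3) {
  words <- tail(unlist(strsplit(tolower(input), "\\s+")), 2)
  if (length(words) == 2) {
    hits <- freq3gram[grepl(paste0("^", words[1], " ", words[2], " "), freq3gram$word), ]
    if (nrow(hits) > 0)
      return(head(sapply(strsplit(as.character(hits$word), " "), `[`, 3), n))
  }
  if (length(words) >= 1) {
    hits <- freq2gram[grepl(paste0("^", tail(words, 1), " "), freq2gram$word), ]
    if (nrow(hits) > 0)
      return(head(sapply(strsplit(as.character(hits$word), " "), `[`, 2), n))
  }
  head(as.character(freq1gram$word), n)
}
This simple scheme ignores any weighting between the n-gram orders; it just falls back to a shorter context whenever the longer one has no match.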
The Shiny application is meant to have a very simple user interface, consisting only of a text box for the user's input and an output area showing the most probable next words.
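A minimal sketch of such an interface, using the hypothetical predictNext() from the previous sketch:
# Minimal Shiny sketch of the planned interface (assumes predictNext() exists)
library(shiny)
ui <- fluidPage(
  textInput("userText", "Enter your text:"),
  # Shows the most probable next words for the text typed above
  verbatimTextOutput("suggestions")
)
server <- function(input, output) {
  output$suggestions <- renderText({
    if (nchar(input$userText) == 0) return("")
    paste(predictNext(input$userText), collapse = ", ")
  })
}
# shinyApp(ui, server)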