library(tm)
library(RWeka)
library(stringi)
library(stringr)
library(ggplot2)
library(R.utils)
library(knitr)
library(dplyr)
library(wordcloud)
blogs.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.blogs.txt"
news.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.news.txt"
twitter.file <- "E:\\Documents\\IrniJasminaIbrahim\\Capstone\\Coursera-SwiftKey\\final\\en_US\\en_US.twitter.txt"
# Basic statistics for each source: file size (MB), line count, word count
blogs.size <- file.info(blogs.file)$size / (1024*1024)
blogs.length <- countLines(blogs.file)
blogs.words <- sum(stri_count_words(readLines(blogs.file, encoding = "UTF-8", skipNul = TRUE)))
news.size <- file.info(news.file)$size / (1024*1024)
news.length <- countLines(news.file)
news.words <- sum(stri_count_words(readLines(news.file, encoding = "UTF-8", skipNul = TRUE)))
twitter.size <- file.info(twitter.file)$size / (1024*1024)
twitter.length <- countLines(twitter.file)
twitter.words <- sum(stri_count_words(readLines(twitter.file, encoding = "UTF-8", skipNul = TRUE)))
Below are summary statistics of the data by source.
df <- data.frame(source = c("blogs", "news", "twitter"),
size = c(blogs.size, news.size, twitter.size),
length = c(blogs.length, news.length, twitter.length),
words = c(blogs.words, news.words, twitter.words)
)
kable(x=df, col.names=c("Source","Size (MB)","Line Count","Word Count"))
Source | Size (MB) | Line Count | Word Count |
---|---|---|---|
blogs | 200.4242 | 899288 | 37546246 |
news | 196.2775 | 1010242 | 2674536 |
twitter | 159.3641 | 2360148 | 30093410 |
As seen above, the data files are large, so a sample is used for the exploratory analysis; this reduces processing time and the demand on the user's computing resources.
blogs <- readLines(blogs.file, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news.file, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter.file, encoding = "UTF-8", skipNul = TRUE)
# Take a reproducible 1% sample of each source
set.seed(1)
blogsSample <- sample(blogs, length(blogs)*0.01)
newsSample <- sample(news, length(news)*0.01)
twitterSample <- sample(twitter, length(twitter)*0.01)
# Strip non-ASCII characters (e.g. emoji) from the Twitter sample
twitterSample <- sapply(twitterSample,
                        function(row) iconv(row, "latin1", "ASCII", sub = ""))
# Combine the three samples into a single text vector
text_sample <- c(blogsSample, newsSample, twitterSample)
After sampling the data, it is cleaned up as follows.
# Transformer that replaces matches of a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Standard cleaning: strip separators, lower-case, and remove numbers,
# punctuation, English stop words, and extra whitespace
preprocessCorpus <- function(corpus){
  corpus <- tm_map(corpus, toSpace, "/|@|\\|")
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
# Build a term/frequency data frame from a term-document matrix,
# sorted by decreasing frequency
freq_frame <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  freq_frame <- data.frame(word = names(freq), freq = freq)
  return(freq_frame)
}
# RWeka tokenizers for bigrams, trigrams, and quadgrams
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
# Build the corpus from the combined sample and clean it
text_sample <- VCorpus(VectorSource(text_sample))
text_sample <- preprocessCorpus(text_sample)
# Unigrams
tdm1a <- TermDocumentMatrix(text_sample)
tdm1 <- removeSparseTerms(tdm1a, 0.99)
freq1_frame <- freq_frame(tdm1)
freq1_top30 <- head(freq1_frame,30)
# Bigrams
tdm2a <- TermDocumentMatrix(text_sample, control=list(tokenize=BigramTokenizer))
tdm2 <- removeSparseTerms(tdm2a, 0.999)
freq2_frame <- freq_frame(tdm2)
freq2_top30 <- head(freq2_frame,30)
# Trigrams
tdm3a <- TermDocumentMatrix(text_sample, control=list(tokenize=TrigramTokenizer))
tdm3 <- removeSparseTerms(tdm3a, 0.9999)
freq3_frame <- freq_frame(tdm3)
freq3_top30 <- head(freq3_frame,30)
# Quadgrams
tdm4a <- TermDocumentMatrix(text_sample, control=list(tokenize=QuadgramTokenizer))
tdm4 <- removeSparseTerms(tdm4a, 0.9999)
freq4_frame <- freq_frame(tdm4)
freq4_top30 <- head(freq4_frame,30)
Plots are constructed to visualise the top 30 most common terms for each n-gram model.
ggplot(freq1_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
geom_bar(stat="identity") +
theme_bw() +
coord_flip() +
theme(axis.title.y = element_blank()) +
labs(y="Frequency", title="Top 30 Common Unigrams")
ggplot(freq2_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
geom_bar(stat="identity") +
theme_bw() +
coord_flip() +
theme(axis.title.y = element_blank()) +
labs(y="Frequency", title="Top 30 Common Bigrams")
ggplot(freq3_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
geom_bar(stat="identity") +
theme_bw() +
coord_flip() +
theme(axis.title.y = element_blank()) +
labs(y="Frequency", title="Top 30 Common Trigrams")
ggplot(freq4_top30, aes(x=reorder(word,freq), y=freq, fill=freq)) +
geom_bar(stat="identity") +
theme_bw() +
coord_flip() +
theme(axis.title.y = element_blank()) +
labs(y="Frequency", title="Top 30 Common Quadgrams")
In addition, word clouds are produced to show the top 10 most common terms for each n-gram model.
Top 10 Unigrams
wordcloud(freq1_top30$word, freq1_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)
Top 10 Bigrams
wordcloud(freq2_top30$word, freq2_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)
Top 10 Trigrams
wordcloud(freq3_top30$word, freq3_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)
Top 10 Quadgrams
wordcloud(freq4_top30$word, freq4_top30$freq, colors=brewer.pal(8, "Set1"), random.order = FALSE, max.words = 10)
The predictive model will be based on the n-gram model. Further cleaning, such as removing profane words, will be considered for the final model.
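As an illustration of the planned approach (a rough sketch, not the final model), the function below does a simple frequency-based lookup against the trigram and bigram tables built above, backing off from trigrams to bigrams, and filters candidates against a placeholder profanity list. The name predict_next and the profanity vector are illustrative assumptions. Note that because stop words were removed during the exploratory cleaning, lookups on phrases containing stop words will find few matches; the final model may keep stop words.
# Placeholder profanity list; a real list would be loaded from a reference file
profanity <- c("profane1", "profane2")
# Sketch: suggest up to n next words by matching the end of the phrase against
# the trigram table, backing off to the bigram table when nothing matches
predict_next <- function(phrase, n = 3) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  if (length(words) == 0) return(character(0))
  # Trigram lookup: trigrams whose first two words match the end of the phrase
  if (length(words) == 2) {
    tri  <- as.character(freq3_frame$word)   # already sorted by decreasing frequency
    hits <- tri[startsWith(tri, paste0(words[1], " ", words[2], " "))]
    cand <- setdiff(sapply(strsplit(hits, " "), `[`, 3), profanity)
    if (length(cand) > 0) return(head(cand, n))
  }
  # Back off to bigrams starting with the last word
  bi   <- as.character(freq2_frame$word)
  hits <- bi[startsWith(bi, paste0(tail(words, 1), " "))]
  if (length(hits) == 0) return(character(0))
  cand <- setdiff(sapply(strsplit(hits, " "), `[`, 2), profanity)
  head(cand, n)
}
predict_next("new york")   # candidate next words; results depend on the 1% sample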
For the Shiny app, my plan is to include a text input box and an option for how many predicted next words the user wants as output.
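To make this concrete, below is a minimal sketch of such an interface; it assumes the illustrative predict_next() function from the previous sketch, and the widget names and layout are placeholders rather than the final design.
library(shiny)
# Minimal sketch: a text box for the phrase and a control for how many
# suggestions to return, displayed as a small table
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  numericInput("n_words", "Number of suggested next words:", value = 3, min = 1, max = 10),
  tableOutput("predictions")
)
server <- function(input, output) {
  output$predictions <- renderTable({
    req(input$phrase)
    data.frame(suggestion = predict_next(input$phrase, input$n_words))
  })
}
shinyApp(ui = ui, server = server)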