The aim of this report is to explore text data in order to examine the frequency of words and to understand the relationships between pairs and triplets of words.
The data consist of three text documents written in English and taken from blogs, news and Twitter. We focus on a sample of the three documents, joined to form a Corpus for text-mining purposes.
The data are preprocessed to remove punctuation and unnecessary words. The exploratory analysis shows the most frequent words, bigrams and trigrams.
if(!file.exists("Coursera-SwiftKey.zip")) {
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(url, destfile = "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}
Sys.setlocale(locale = "english_US.1252")  ## set a US English locale (Windows locale name)
con <- file(description = "final/en_US/en_US.blogs.txt", open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
con <- file(description = "final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
con <- file(description = "final/en_US/en_US.twitter.txt", open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
We determine the size of the three files, their number of lines and their number of words.
library(stringi)
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
blogs_words <- sum(stri_count_words(blogs))
news_words <- sum(stri_count_words(news))
twitter_words <- sum(stri_count_words(twitter))
info_blogs <- c(round(blogs_size, 2), length(blogs), blogs_words)
info_news <- c(round(news_size, 2), length(news), news_words)
info_twitter <- c(round(twitter_size, 2), length(twitter), twitter_words)
dataf <- rbind(info_blogs, info_news, info_twitter)
dataf <- data.frame(dataf)
names(dataf) <- c("size (Mb)", "# of rows", "# of words")
rownames(dataf) <- c("blogs", "news", "twitter")
save(dataf, file = "dataf.RData")
We summarize the results in the following table:
if(!exists("dataf")) load("dataf.RData")
print(dataf)
##         size (Mb) # of rows # of words
## blogs      200.42    899288   37546246
## news       196.28   1010242   34762395
## twitter    159.36   2360148   30093410
Then we display them in the following plot:
par(mfrow = c(3, 1))
barplot(dataf[, 1], names.arg = rownames(dataf), horiz = TRUE, col = "steelblue", xlab = "size (Mb)")
barplot(dataf[, 2], names.arg = rownames(dataf), horiz = TRUE, col = "magenta", xlab = "# of rows")
barplot(dataf[, 3], names.arg = rownames(dataf), horiz = TRUE, col = "darkgreen", xlab = "# of words")
Due to the very large size of the files, we extract a sample of the lines:
set.seed(125)
bls <- sample(blogs, size = as.integer(length(blogs) * 0.02))
nws <- sample(news, size = as.integer(length(news) * 0.02))
tws <- sample(twitter, size = as.integer(length(twitter) * 0.02))
## iconv() converts the text to ASCII, replacing non-ASCII characters with spaces
bls <- iconv(bls, from = "UTF-8", to = "ASCII", sub = " ")
nws <- iconv(nws, from = "UTF-8", to = "ASCII", sub = " ")
tws <- iconv(tws, from = "UTF-8", to = "ASCII", sub = " ")
mysample <- c(bls, nws, tws)
if(!dir.exists("data")) dir.create("data")
save(bls, file = "./data/bls.RData")
save(nws, file = "./data/nws.RData")
save(tws, file = "./data/tws.RData")
save(mysample, file = "mysample.RData")
rm("blogs", "news", "twitter")
The first step in our text mining process is the creation of a Corpus, a structure representing a collection of text documents.
## we load the required packages
library(RWeka)
library(SnowballC)
library(tm)
library(wordcloud)
library(ggplot2)
if(!exists("mysample")) load("mysample.RData")
mytext <- VCorpus(VectorSource(mysample))
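A Corpus built this way can be inspected document by document; for example (an optional check):
## optional check: number of documents and content of the first one
length(mytext)
as.character(mytext[[1]])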
We remove punctuation and numbers. Then we convert words to lowercase and remove stopwords (common and uninteresting words which can confound our analysis).
mytext <- tm_map(mytext, removePunctuation)
mytext <- tm_map(mytext, removeNumbers)
mytext <- tm_map(mytext, content_transformer(tolower))
mytext <- tm_map(mytext, removeWords, stopwords("english"))
Finally, we remove common word endings (a process called stemming) and the extra whitespace left after removing words.
mytext <- tm_map(mytext, stemDocument)
mytext <- tm_map(mytext, stripWhitespace)
mytext <- tm_map(mytext, PlainTextDocument)
save(mytext, file = "mytext.RData")
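To verify the effect of the cleaning steps, we can look at a few of the transformed documents (an optional check; the indices 1:3 are an arbitrary choice):
## optional check: display the first three cleaned documents
lapply(mytext[1:3], as.character)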
The aim of the following analysis is to gain knowledge about the frequencies of words, bigrams and trigrams in the Corpus.
We create a Term-Document Matrix, keeping only the terms that appear in at least 100 documents:
if(!exists("mytext")) load("mytext.RData")
myctrl <- list(bounds = list(global = c(100, Inf)))
tdmat1 <- TermDocumentMatrix(mytext, control = myctrl)
save(tdmat1, file = "tdmat1.RData")
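Printing the matrix gives a quick overview of its size and sparsity (an optional check):
## optional check: dimensions and sparsity of the Term-Document Matrix
tdmat1
dim(tdmat1)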
Then we sort the words to find the most frequently occurring ones and display them in a wordcloud and a barplot:
## frequency of words
if(!exists("tdmat1")) load("tdmat1.RData")
myfreq <- findFreqTerms(tdmat1, lowfreq = 800)
words <- sort(rowSums(as.matrix(tdmat1[myfreq, ])), decreasing = TRUE)
words <- data.frame(term = names(words[1:25]), frequency = words[1:25])
wordcloud(words = words$term, freq = words$frequency, scale = c(4,0.25),
random.order = FALSE, rot.per = 0.25, colors = brewer.pal(6, "Dark2"))
g1 <- ggplot(data = words, aes(reorder(term, frequency), frequency)) +
geom_bar(stat = "identity", fill = "steelblue") + coord_flip() +
ggtitle("Plot of the most frequent Words") + xlab("term")
g1
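The Term-Document Matrix can also be used to explore associations between words; as a sketch, findAssocs() returns the terms most correlated with a given term (the stemmed term "time" and the correlation limit 0.1 are arbitrary choices):
## optional: terms correlated with a given (stemmed) term above the chosen limit
findAssocs(tdmat1, terms = "time", corlimit = 0.1)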
The next step consists of splitting our sample Corpus into n-grams in order to find the most frequently occurring bigrams and trigrams:
## the following function returns a sorted vector of the most frequent N-grams in a Corpus
getNGram <- function(txt, N, lbound, lfreq) {
  token <- function(x) NGramTokenizer(x, Weka_control(min = N, max = N))
  myctrl <- list(tokenize = token, bounds = list(global = c(lbound, Inf)))
  tdmat <- TermDocumentMatrix(txt, control = myctrl)
  myfreq <- findFreqTerms(tdmat, lowfreq = lfreq)
  tdmat <- sort(rowSums(as.matrix(tdmat[myfreq, ])), decreasing = TRUE)
  return(tdmat)
}
## frequency of bigrams
bigrams <- getNGram(mytext, N = 2, lbound = 50, lfreq = 200)
bigrams <- data.frame(bigram = names(bigrams[1:20]), frequency = bigrams[1:20])
save(bigrams, file = "bigrams.RData")
if(!exists("bigrams")) load("bigrams.RData")
g2 <- ggplot(data = bigrams, aes(reorder(bigram,frequency), frequency)) +
geom_bar(stat = "identity", fill = "darkorange") + coord_flip() +
ggtitle("Plot of the most frequent Bigrams") + xlab("bigram")
g2
## frequency of trigrams
trigrams <- getNGram(mytext, N = 3, lbound = 10, lfreq = 20)
trigrams <- data.frame(trigram = names(trigrams[1:20]), frequency = trigrams[1:20])
save(trigrams, file = "trigrams.RData")
if(!exists("trigrams")) load("trigrams.RData")
g3 <- ggplot(data = trigrams, aes(reorder(trigram,frequency), frequency)) +
geom_bar(stat = "identity", fill = "darkred") + coord_flip() +
ggtitle("Plot of the most frequent Trigrams") + xlab("trigram")
g3
The next stage of the project is to build a Shiny application that tries to predict the next word given an input from the user (one, two or three words). The model will be based on the knowledge acquired about word frequencies.
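As a very rough illustration of how such a frequency-based prediction could work, the sketch below looks up the most frequent n-gram starting with the user's (cleaned and stemmed) input and returns its last word. It is a toy example only: the helper name predict_next is hypothetical, it assumes complete n-gram frequency tables rather than the top-20 tables saved above, and the final model will need a proper back-off strategy.
## a minimal sketch, assuming a data frame with an n-gram column and a frequency column
predict_next <- function(prefix, ngrams) {
  ## keep the n-grams whose first words match the prefix
  hits <- ngrams[startsWith(as.character(ngrams[[1]]), paste0(tolower(prefix), " ")), ]
  if (nrow(hits) == 0) return(NA_character_)
  ## take the most frequent match and return its last word
  best <- as.character(hits[[1]][which.max(hits$frequency)])
  tail(strsplit(best, " ")[[1]], 1)
}
## example call (hypothetical input): predict_next("happi mother", trigrams)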
The following questions need to be considered:
1. the number of parameters of the model
2. the accuracy of the model
3. the trade-off between the size of the algorithm and its runtime.