Synopsis

The aim of this report is to explore the text data in order to examine word frequencies and understand the relationships between pairs and triplets of words.
The data consist of 3 text documents written in English and taken from blogs, news and Twitter. We focus on a sample of the 3 documents, joined to form a Corpus for text mining purposes.
The data are preprocessed in order to remove punctuation and unnecessary words. The exploratory analysis shows the most frequent words, bigrams and trigrams.

Getting the data

Download files

if(!file.exists("Coursera-SwiftKey.zip")) {
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(url, destfile = "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}
Sys.setlocale(locale = "english_US.1252")
con <- file(description = "final/en_US/en_US.blogs.txt", open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
con <- file(description = "final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
con <- file(description = "final/en_US/en_US.twitter.txt", open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

Basic summary of the data

We determine the size of the 3 files, their number of lines (rows) and their number of words:

library(stringi)
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
blogs_words <- sum(stri_count_words(blogs))
news_words <- sum(stri_count_words(news))
twitter_words <- sum(stri_count_words(twitter))
info_blogs <- c(round(blogs_size, 2), length(blogs), blogs_words)
info_news <- c(round(news_size, 2), length(news), news_words)
info_twitter <- c(round(twitter_size, 2), length(twitter), twitter_words)
dataf <- rbind(info_blogs, info_news, info_twitter)
dataf <- data.frame(dataf)
names(dataf) <- c("size (Mb)", "# of rows", "# of words")
rownames(dataf) <- c("blogs", "news", "twitter")
save(dataf, file = "dataf.RData")

We summarize the results in the following table:

if(!exists("dataf")) load("dataf.RData")
print(dataf)
##         size (Mb) # of rows # of words
## blogs      200.42    899288   37546246
## news       196.28   1010242   34762395
## twitter    159.36   2360148   30093410

Then we display them in the following barplots:

par(mfrow = c(3, 1))
barplot(dataf[, 1], names.arg = rownames(dataf), horiz = TRUE, col = "steelblue", xlab = "size (Mb)")
barplot(dataf[, 2], names.arg = rownames(dataf), horiz = TRUE, col = "magenta", xlab = "# of rows")
barplot(dataf[, 3], names.arg = rownames(dataf), horiz = TRUE, col = "darkgreen", xlab = "# of words")

Select a sample

Because the files are very large, we extract a random sample of 2% of the lines from each one:

set.seed(125)
bls <- sample(blogs, size = as.integer(length(blogs) * 0.02))
nws <- sample(news, size = as.integer(length(news) * 0.02))
tws <- sample(twitter, size = as.integer(length(twitter) * 0.02))
## iconv() converts the text to ASCII, replacing any non-ASCII (strange) characters with a space
bls <- iconv(bls, from = "UTF-8", to = "ASCII", sub = " ")
nws <- iconv(nws, from = "UTF-8", to = "ASCII", sub = " ")
tws <- iconv(tws, from = "UTF-8", to = "ASCII", sub = " ")
mysample <- c(bls, nws, tws)
if(!dir.exists("./data")) dir.create("./data")
save(bls, file = "./data/bls.RData")
save(nws, file = "./data/nws.RData")
save(tws, file = "./data/tws.RData")
save(mysample, file = "mysample.RData")
rm("blogs", "news", "twitter")

Create a Corpus from the sample

The first step in our text mining process is the creation of a Corpus, a structure representing a collection of text documents.

## we load the required packages
library(RWeka)
library(SnowballC)
library(tm)
library(wordcloud)
library(ggplot2)
if(!exists("mysample")) load("mysample.RData")
mytext <- VCorpus(VectorSource(mysample))

Preprocessing step

We remove punctuation and numbers. Then we convert words to lowercase and remove stopwords (common and uninteresting words which can confound our analysis).

mytext <- tm_map(mytext, removePunctuation)
mytext <- tm_map(mytext, removeNumbers)
mytext <- tm_map(mytext, content_transformer(tolower))
mytext <- tm_map(mytext, removeWords, stopwords("english"))

Finally we remove common word endings (this process is called stemming) and extra whitespace left after removing words.

mytext <- tm_map(mytext, stemDocument)
mytext <- tm_map(mytext, stripWhitespace)
mytext <- tm_map(mytext, PlainTextDocument)
save(mytext, file = "mytext.RData")
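
To check the effect of these transformations, we can inspect a couple of documents from the cleaned Corpus; the snippet below is a quick, optional sanity check using tm's standard accessors:

## quick look at the content of the first two cleaned documents
lapply(mytext[1:2], as.character)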

Exploratory Data Analysis

The aim of the following analysis is to gain knowledge about the frequencies of words, bigrams and trigrams in the Corpus.

Words analysis

We create a Term-Document Matrix, keeping only the terms that appear in at least 100 documents of the Corpus:

if(!exists("mytext")) load("mytext.RData")
myctrl <- list(bounds = list(global = c(100, Inf)))
tdmat1 <- TermDocumentMatrix(mytext, control = myctrl)
save(tdmat1, file = "tdmat1.RData")

Then we sort the words to find the most frequently occurring ones and display them with a wordcloud and a barplot:

## frequency of words
if(!exists("tdmat1")) load("tdmat1.RData")
myfreq <- findFreqTerms(tdmat1, lowfreq = 800)
words <- sort(rowSums(as.matrix(tdmat1[myfreq, ])), decreasing = TRUE)
words <- data.frame(term = names(words[1:25]), frequency = words[1:25])
wordcloud(words = words$term, freq = words$frequency, scale = c(4,0.25),
          random.order = FALSE, rot.per = 0.25, colors = brewer.pal(6, "Dark2"))

g1 <- ggplot(data = words, aes(reorder(term, frequency), frequency)) + 
  geom_bar(stat = "identity", fill = "steelblue") + coord_flip() +
  ggtitle("Plot of the most frequent Words") + xlab("term")
g1

N-grams analysis

The next step consists of splitting our sample Corpus into n-grams in order to find the most frequently occurring bigrams and trigrams:

## the following function returns a sorted vector of N-gram frequencies from a tokenized Corpus:
## N is the n-gram order, lbound the minimum number of documents a term must appear in,
## and lfreq the minimum overall frequency of the returned N-grams
getNGram <- function(txt, N, lbound, lfreq) {
  token <- function(x) NGramTokenizer(x, Weka_control(min = N, max = N))
  myctrl <- list(tokenize = token, bounds = list(global = c(lbound, Inf)))
  tdmat <- TermDocumentMatrix(txt, control = myctrl)
  myfreq <- findFreqTerms(tdmat, lowfreq = lfreq)
  tdmat <- sort(rowSums(as.matrix(tdmat[myfreq, ])), decreasing = TRUE)
  return(tdmat)
}
## frequency of bigrams
bigrams <- getNGram(mytext, N = 2, lbound = 50, lfreq = 200)
bigrams <- data.frame(bigram = names(bigrams[1:20]), frequency = bigrams[1:20])
save(bigrams, file = "bigrams.RData")
if(!exists("bigrams")) load("bigrams.RData")
g2 <- ggplot(data = bigrams, aes(reorder(bigram,frequency), frequency)) + 
  geom_bar(stat = "identity", fill = "darkorange") + coord_flip() +
  ggtitle("Plot of the most frequent Bigrams") + xlab("bigram")
g2

## frequency of trigrams
trigrams <- getNGram(mytext, N = 3, lbound = 10, lfreq = 20)
trigrams <- data.frame(trigram = names(trigrams[1:20]), frequency = trigrams[1:20])
save(trigrams, file = "trigrams.RData")
if(!exists("trigrams")) load("trigrams.RData")
g3 <- ggplot(data = trigrams, aes(reorder(trigram,frequency), frequency)) + 
  geom_bar(stat = "identity", fill = "darkred") + coord_flip() +
  ggtitle("Plot of the most frequent Trigrams") + xlab("trigram")
g3

Plans for a prediction model

The next step of the project is to build a Shiny application that tries to predict the next word given a user input of 1, 2 or 3 words. The model will be based on the word, bigram and trigram frequencies explored above.
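
As a rough illustration of how such a model could work, the sketch below predicts the next word from the n-gram frequency tables built above: it first looks for a trigram starting with the last two input words, then backs off to the bigrams, and finally to the single most frequent word. The function predictNext and the plain backoff rule are illustrative assumptions rather than the final design, and a real model would use the full frequency tables instead of the top entries kept above for plotting.

## minimal next-word predictor (illustrative sketch, assuming the data frames
## words, bigrams and trigrams created above, each with a frequency column)
predictNext <- function(input, trigrams, bigrams, words) {
  tokens <- tolower(unlist(strsplit(input, "\\s+")))
  n <- length(tokens)
  lastWord <- function(x) tail(unlist(strsplit(as.character(x), " ")), 1)
  ## 1. trigrams whose first two words match the last two words of the input
  if(n >= 2) {
    prefix <- paste(tokens[n - 1], tokens[n])
    hits <- trigrams[startsWith(as.character(trigrams$trigram), paste0(prefix, " ")), ]
    if(nrow(hits) > 0) return(lastWord(hits$trigram[which.max(hits$frequency)]))
  }
  ## 2. back off to bigrams whose first word matches the last word of the input
  hits <- bigrams[startsWith(as.character(bigrams$bigram), paste0(tokens[n], " ")), ]
  if(nrow(hits) > 0) return(lastWord(hits$bigram[which.max(hits$frequency)]))
  ## 3. back off to the single most frequent word overall
  as.character(words$term[which.max(words$frequency)])
}
## example call: predictNext("thanks for the", trigrams, bigrams, words)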

The following questions need to be considered:
1. the number of parameters of the model
2. the accuracy of the model
3. the balance between the size of the model and its runtime.
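
Regarding point 3, a quick first check, assuming the frequency tables above stand in for the future model objects, is to measure their memory footprint and the cost of a simple lookup; the prefix "thanks for " is just an example value:

## memory footprint of the current n-gram tables
print(object.size(bigrams), units = "Kb")
print(object.size(trigrams), units = "Kb")
## time 1000 lookups of an example prefix in the trigram table
system.time(replicate(1000, trigrams[startsWith(as.character(trigrams$trigram), "thanks for "), ]))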