Synopsis

The goal of this report is to perform an exploratory analysis of the data extracted from the provided text files: understand the distribution of words, explore the frequencies of single words and word pairs, and examine basic relationships in the data.

Install required packages and load libraries

packages <- c("tm", "RWeka", "SnowballC", "ggplot2")
# install only the packages that are not already present
missing.packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing.packages) > 0) install.packages(missing.packages, repos = "http://cran.rstudio.com/")
library(tm); library(RWeka); library(SnowballC); library(ggplot2)

Load source text files and collect basic metrics

us.blogs.file <- "final/en_US/en_US.blogs.txt"
us.news.file <- "final/en_US/en_US.news.txt"
us.twitter.file <- "final/en_US/en_US.twitter.txt"
subsets.dir <- "final/en_US/subsets"
us.blogs.subset.file <- paste(subsets.dir, "en_US.blogs.txt", sep = "/")
us.news.subset.file <- paste(subsets.dir, "en_US.news.txt", sep = "/")
us.twitter.subset.file <- paste(subsets.dir, "en_US.twitter.txt", sep = "/")
# Only download and unzip the main archive if any of the source files is missing
if (!all(file.exists(c(us.blogs.file, us.news.file, us.twitter.file)))) {
    temp <- tempfile()
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", 
                  temp, mode = "wb")
    unzip(temp, files = c(us.blogs.file, us.news.file, us.twitter.file))
    unlink(temp)
}

con <- file(us.news.file, "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = T, warn = F)
close(con)

con <- file(us.blogs.file, "rb")
blogs <- readLines(con, encoding = "UTF-8", skipNul = T, warn = F)
close(con)

con <- file(us.twitter.file, "rb")
twitter <- readLines(con, encoding = "UTF-8", skipNul = T, warn = F)
close(con)

# file sizes in bytes
news.size <- file.info(us.news.file)$size
blogs.size <- file.info(us.blogs.file)$size
twitter.size <- file.info(us.twitter.file)$size

# word counts: whitespace-separated tokens per line, summed over all lines
twitter.words <- sum(sapply(gregexpr("\\S+", twitter), length))
blogs.words <- sum(sapply(gregexpr("\\S+", blogs), length))
news.words <- sum(sapply(gregexpr("\\S+", news), length))

data.frame(filename = c("blogs", "news", "twitter"), 
           size = c(blogs.size, news.size, twitter.size),
           lines = c(length(blogs), length(news), length(twitter)),
           words = c(blogs.words, news.words, twitter.words))
##   filename      size   lines    words
## 1    blogs 210160014  899288 37334131
## 2     news 205811889 1010242 34372530
## 3  twitter 167105338 2360148 30373583
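
The sizes above are in bytes; for an at-a-glance comparison they could also be reported in megabytes. An optional sketch derived from the same values:

# optional: express the raw byte sizes in megabytes
round(c(blogs = blogs.size, news = news.size, twitter = twitter.size) / 1024^2, 1)
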
dir.create(subsets.dir, showWarnings = F)

create.subset.file <- function(source.lines, target.file) {
    
    con <- file(target.file, "w")
    
    len <- length(source.lines)
    # randomly select roughly 5% of the lines
    rand.idx <- sample(len, round(len / 20))
    writeLines(source.lines[rand.idx], con)
    
    close(con)
}
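
Note that sample() draws a different random subset on every run; if reproducible subsets are wanted, a fixed seed could be set once before the files are created (the seed value below is arbitrary):

set.seed(20140720)  # arbitrary seed, only so the 5% samples are reproducible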

if (!file.exists(us.blogs.subset.file)) {
    create.subset.file(blogs, us.blogs.subset.file)
} 
if (!file.exists(us.news.subset.file)) {
    create.subset.file(news, us.news.subset.file)
} 
if (!file.exists(us.twitter.subset.file)) {
    create.subset.file(twitter, us.twitter.subset.file)
}

Text transformation and cleaning

The source text contains quite a few problems: repeated blocks of text and phrases, punctuation characters in both ASCII and Unicode forms, Unicode junk characters, missing apostrophes, and consecutively repeated words and word pairs. The cleaning and transformation steps below attempt to address these issues and also remove so-called stop words (very frequent common words that add no value to the analysis) as well as profane words.

# fetch a list of profane words
con <- url("http://www.bannedwordlist.com/lists/swearWords.txt", "r")
profane.words <- readLines(con, warn = F)
close(con) 
# repetitive block of text, occurs multiple times in en_US.blogs.txt
amazon.legal <- "^.* is a participant in the Amazon Services LLC and Amazon EU Associates Programmes designed to provide a means for sites to earn advertising fees by advertising and linking to amazon\\.com, amazon\\.ca, amazon\\.co\\.uk, amazon\\.de, amazon\\.fr, amazon\\.it and amazon\\.es\\. Certain content that appears on this website comes from Amazon Services LLC and/or Amazon EU\\. This content is provided “as is” and is subject to change or removal at any time\\.$"
# repetitive pattern
citation <- "\\(Incorporated in Item .*\\)"
# repetitive pattern
repeating.phrase <- "Shoutout to Kimberly and Sasha\\(Can they sing Happy Birthday to her\\? PLEASE!\\) What kind of donuts do the Boyz like\\? :DD"
# repetitive pattern
repeating.phrase2 <- "^.* visit www\\.RadioTAGr\\.com/WFUV to TAG this song"

trim <- function (x) gsub("^\\s+|\\s+$", "", x)
rem.amazon.legal <- function (x) gsub(amazon.legal, " ", x)
rem.citation <- function (x) gsub(citation, " ", x)
rem.repeating.phrase <- function (x) gsub(repeating.phrase, " ", x)
rem.repeating.phrase2 <- function (x) gsub(repeating.phrase2, " ", x)
#remove Unicode punctuation and junk
rem.unicode.punct <- function (x) gsub("[“”–—„…€$†•‹›]", " ", x)
#replace Unicode apostrophe with ASCII apostrophe
repl.unicode.apos <- function (x) gsub("[’‘]", "'", x)
# remove consecutive repeating words; perl = TRUE enables the non-capturing group (?:...)
rem.dupes <- function (x) gsub("\\b(\\w+)(?:\\s+\\1\\b)+", "\\1", x, perl = TRUE)
# remove consecutive repeating pairs of words
rem.dupe.pairs <- function(x) gsub("\\b(\\w+\\s\\w+)(\\s\\1)+", "\\1", x, perl = TRUE)
# create document corpus
data <- Corpus(DirSource(subsets.dir))

data <- tm_map(data, content_transformer(rem.amazon.legal))
data <- tm_map(data, content_transformer(rem.citation))
data <- tm_map(data, content_transformer(rem.repeating.phrase))
data <- tm_map(data, content_transformer(rem.repeating.phrase2))
data <- tm_map(data, removeNumbers)
data <- tm_map(data, content_transformer(rem.dupes))
data <- tm_map(data, content_transformer(rem.dupe.pairs))
data <- tm_map(data, content_transformer(tolower))
data <- tm_map(data, content_transformer(repl.unicode.apos))
#remove stopwords
data <- tm_map(data, removeWords, c(stopwords("english"), "dont", "didnt",
                                      "wasnt", "wont", "werent", "wouldnt",
                                      "cant", "aint", "doesnt", "lets", "isnt",
                                      "shouldnt", "arent"))
data <- tm_map(data, removeWords, profane.words)
data <- tm_map(data, stemDocument)
data <- tm_map(data, removePunctuation)
data <- tm_map(data, content_transformer(rem.unicode.punct))
data <- tm_map(data, stripWhitespace)
data <- tm_map(data, content_transformer(trim))
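
As a quick sanity check, the custom duplicate-removal helpers can be tried on toy strings; the comments show the results these patterns are expected to produce:

# expected: "the cat sat on the mat" (consecutive repeated words collapse to one)
rem.dupes("the the the cat sat sat on the mat")
# expected: "very good job" (consecutive repeated word pairs collapse to one)
rem.dupe.pairs("very good very good very good job")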

Build document-term matrices

Once the text has been cleaned and transformed, we can create the document-term matrices used for the exploratory data analysis below. A document-term matrix splits the text into tokens and stores, for each document in the corpus, how frequently every token occurs.

# create a document-term matrix from single words found in all documents
data.uni.dtm <- DocumentTermMatrix(data)
data.uni.dtm
## <<DocumentTermMatrix (documents: 3, terms: 190786)>>
## Non-/sparse entries: 275127/297231
## Sparsity           : 52%
## Maximal term length: 326
## Weighting          : term frequency (tf)
# create a document-term matrix for word pairs found in all documents
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
data.bi.dtm <- DocumentTermMatrix(data, control = list(tokenize = BigramTokenizer))
data.bi.dtm
## <<DocumentTermMatrix (documents: 3, terms: 2973734)>>
## Non-/sparse entries: 3396926/5524276
## Sparsity           : 62%
## Maximal term length: 122
## Weighting          : term frequency (tf)
data.frame(dtm=c("words", "word.pairs"),
           term.count=c(data.uni.dtm$ncol, data.bi.dtm$ncol))
##          dtm term.count
## 1      words     190786
## 2 word.pairs    2973734
# drop sparse words: keep only terms with at most 20% sparsity
# (with three documents this means words that occur in all of them)
data.uni.s.dtm <- removeSparseTerms(data.uni.dtm, .2)
# the same for word pairs
data.bi.s.dtm <- removeSparseTerms(data.bi.dtm, .2)

data.frame(dtm=c("words", "word.pairs"),
           term.count=c(data.uni.s.dtm$ncol, data.bi.s.dtm$ncol))
##          dtm term.count
## 1      words      29323
## 2 word.pairs      82666
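
The same tokenizer approach extends naturally to longer n-grams, which a later prediction model is likely to need. A sketch (not run as part of this report) of a trigram document-term matrix, built with the same RWeka control as the bigram version above; the object name data.tri.dtm simply follows the naming used here:

# sketch: a trigram (three-word) tokenizer and document-term matrix
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
data.tri.dtm <- DocumentTermMatrix(data, control = list(tokenize = TrigramTokenizer))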

Explore data

Word frequencies

Let’s see the most frequent single words:

total.w.freq <- colSums(as.matrix(data.uni.s.dtm))

w.frequency <- sort(total.w.freq, decreasing = T)
head(w.frequency, 20)
##  will  said   one  just   get  like   can  time   day  make  love   new 
## 32326 30778 30645 30334 30165 29791 24647 24502 20959 20605 19596 19510 
##  year  know  good   now  work  want   see think 
## 18641 18212 18117 17873 17003 15913 15745 15206

and the most frequent word pairs:

total.wp.freq <- colSums(as.matrix(data.bi.s.dtm))
wp.frequency <- sort(total.wp.freq, decreasing = T)
head(wp.frequency, 20)
##      right now      last year      look like       new york      feel like 
##           2535           2227           2038           2005           1747 
##       year ago     last night   look forward    high school      last week 
##           1666           1605           1552           1476           1406 
##     first time      make sure        can get   thank follow    even though 
##           1255           1195           1133           1035            991 
##     new jersey happi birthday       let know        one day       just got 
##            949            903            893            885            857

Now let’s see the least frequent single words:

tail(w.frequency, 20)
##     wiltshir       winder      windham       wining withstanding 
##            3            3            3            3            3 
##       witter     wittiest      wofford       womens       worded 
##            3            3            3            3            3 
##  worshippers          wps          xps        xwing   yellowgold 
##            3            3            3            3            3 
##   yellowrump        yohan       zebras        zeros        zwick 
##            3            3            3            3            3

and the least frequent word pairs:

tail(wp.frequency, 20)
##          york may        york world         young fan     young generat 
##                 3                 3                 3                 3 
##        young hors       young lover        young mani       young singl 
##                 3                 3                 3                 3 
##      young victim       youth happi    youth ministry    youth unemploy 
##                 3                 3                 3                 3 
##         youth use zach galifianakis        zack ryder          zero day 
##                 3                 3                 3                 3 
##    zimmerman call        zone catch        zone peopl          zone way 
##                 3                 3                 3                 3

Frequency bar chart of the most frequent words (more than 10,000 occurrences):

w.df <- data.frame(word=names(w.frequency), frequency=w.frequency)
p <- ggplot(subset(w.df, frequency>10000), aes(word, frequency))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + ggtitle("Most frequent words")
p
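
By default the bars are ordered alphabetically; if ordering by frequency is preferred, reorder() can be used in the aesthetic mapping. A sketch of the alternative first line (the remaining layers stay the same, and the same technique applies to the word pair chart below):

# alternative: order bars by descending frequency instead of alphabetically
p <- ggplot(subset(w.df, frequency > 10000), aes(reorder(word, -frequency), frequency)) + xlab("word")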

Frequency bar chart of the most frequent word pairs (more than 700 occurrences):

wp.df <- data.frame(word.pair=names(wp.frequency), frequency=wp.frequency)
p <- ggplot(subset(wp.df, frequency>700), aes(word.pair, frequency))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))   
p <- p + ggtitle("Most frequent word pairs")
p

Distribution of frequencies

In each table below, the top row lists frequency values (how often a word / word pair occurs) and the bottom row shows how many words / word pairs occur with that frequency.

Lowest word frequencies vs word count per frequency:

head(table(w.frequency), 20)
## w.frequency
##    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
##  879 1116 1240 1174 1061  992  918  833  791  723  660  669  569  548  493 
##   18   19   20   21   22 
##  466  451  415  403  386

Greatest word frequencies vs word count per frequency:

tail(table(w.frequency), 20)
## w.frequency
## 15206 15745 15913 17003 17873 18117 18212 18641 19510 19596 20605 20959 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 24502 24647 29791 30165 30334 30645 30778 32326 
##     1     1     1     1     1     1     1     1

Lowest word pair frequencies vs word pair count per frequency:

head(table(wp.frequency), 20)
## wp.frequency
##     3     4     5     6     7     8     9    10    11    12    13    14 
## 10289 10557  8875  7172  5964  4804  3992  3235  2813  2378  2007  1715 
##    15    16    17    18    19    20    21    22 
##  1519  1295  1124  1094   946   867   790   653

Greatest word pair frequencies vs word pair count per frequency:

tail(table(wp.frequency), 20)
## wp.frequency
##  857  885  893  903  949  991 1035 1133 1195 1255 1406 1476 1552 1605 1666 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 1747 2005 2038 2227 2535 
##    1    1    1    1    1
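
A question these distributions raise for the later modelling stage is how many unique words are needed to cover a given share of all word occurrences in the sample. A small sketch based on the sorted frequencies above (the 50% / 90% thresholds are arbitrary choices):

# cumulative share of all word occurrences covered by the most frequent words
coverage <- cumsum(w.frequency) / sum(w.frequency)
c(words.for.50.pct = sum(coverage < 0.5) + 1,
  words.for.90.pct = sum(coverage < 0.9) + 1)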

Conclusion

This analysis helps to get a feel for the data, highlights issues that may occur while parsing and processing it, sets expectations about its structure and word frequencies, and will inform the predictive model to be built in the later stages of the project.