The goal of this report is to perform an exploratory analysis of the data extracted from the provided text files: to understand the distribution of words, to explore the frequencies of words and word pairs, and to examine basic relationships in the data.
packages <- c("tm", "RWeka", "SnowballC", "ggplot2")
install.packages(packages, repos="http://cran.rstudio.com/")
library(tm); library(RWeka); library(SnowballC); library(ggplot2)
us.blogs.file <- "final/en_US/en_US.blogs.txt"
us.news.file <- "final/en_US/en_US.news.txt"
us.twitter.file <- "final/en_US/en_US.twitter.txt"
subsets.dir <- "final/en_US/subsets"
us.blogs.subset.file <- paste(subsets.dir, "en_US.blogs.txt", sep = "/")
us.news.subset.file <- paste(subsets.dir, "en_US.news.txt", sep = "/")
us.twitter.subset.file <- paste(subsets.dir, "en_US.twitter.txt", sep = "/")
# Only download the main zip if the unzipped files are not found
if (!file.exists(us.blogs.file) && !file.exists(us.news.file) && !file.exists(us.twitter.file)) {
    temp <- tempfile()
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                  temp, mode = "wb")
    unzip(temp, files = c(us.blogs.file, us.news.file, us.twitter.file))
    unlink(temp)
}
# open the files in binary mode so readLines is not cut short by embedded control characters
con <- file(us.news.file, "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = T, warn = F)
close(con)
con <- file(us.blogs.file, "rb")
blogs <- readLines(con, encoding = "UTF-8", skipNul = T, warn = F)
close(con)
con <- file(us.twitter.file, "rb")
twitter <- readLines(con, encoding = "UTF-8", skipNul = T, warn = F)
close(con)
# file sizes in bytes
news.size <- file.info(us.news.file)$size
blogs.size <- file.info(us.blogs.file)$size
twitter.size <- file.info(us.twitter.file)$size
# word counts: count runs of non-whitespace characters in each line
twitter.words <- sum(sapply(gregexpr("\\S+", twitter), length))
blogs.words <- sum(sapply(gregexpr("\\S+", blogs), length))
news.words <- sum(sapply(gregexpr("\\S+", news), length))
data.frame(filename = c("blogs", "news", "twitter"),
           size = c(blogs.size, news.size, twitter.size),
           lines = c(length(blogs), length(news), length(twitter)),
           words = c(blogs.words, news.words, twitter.words))
## filename size lines words
## 1 blogs 210160014 899288 37334131
## 2 news 205811889 1010242 34372530
## 3 twitter 167105338 2360148 30373583
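For readability, the raw byte sizes above can also be expressed in megabytes (a small convenience addition, not part of the original summary):
# file sizes in megabytes (1 MB = 2^20 bytes)
round(c(blogs = blogs.size, news = news.size, twitter = twitter.size) / 1024^2, 1)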
dir.create(subsets.dir, showWarnings = F)
# write a random 5% sample of the given lines to the target file
create.subset.file <- function(source.lines, target.file) {
    con <- file(target.file, "w")
    len <- length(source.lines)
    # randomly select 5% of the lines
    rand.idx <- sample(1:len, round(len / 20))
    writeLines(source.lines[rand.idx], con)
    close(con)
}
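Because the subset is drawn with sample(), the selected lines change from run to run; if a reproducible 5% sample is needed, a fixed seed can be set before creating the subsets (a suggested addition, not part of the original script):
# optional: fix the random seed so the 5% subsets are reproducible across runs
set.seed(12345)  # the particular seed value is arbitrary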
if (!file.exists(us.blogs.subset.file)) {
    create.subset.file(blogs, us.blogs.subset.file)
}
if (!file.exists(us.news.subset.file)) {
    create.subset.file(news, us.news.subset.file)
}
if (!file.exists(us.twitter.subset.file)) {
    create.subset.file(twitter, us.twitter.subset.file)
}
The text in these files has quite a few problems: repetitive blocks of text and phrases, punctuation characters in both ASCII and Unicode form, Unicode junk characters, missing apostrophes, and repeated consecutive words and word pairs. The cleaning and transformation steps below attempt to address those issues, and also remove so-called stop words (very frequent common words that add no value to the analysis) as well as profane words.
# fetch a list of profane words
con <- url("http://www.bannedwordlist.com/lists/swearWords.txt", "r")
profane.words <- readLines(con, warn = F)
close(con)
# repetitive block of text, occurs multiple times in en_US.blogs.txt
amazon.legal <- "^.* is a participant in the Amazon Services LLC and Amazon EU Associates Programmes designed to provide a means for sites to earn advertising fees by advertising and linking to amazon\\.com, amazon\\.ca, amazon\\.co\\.uk, amazon\\.de, amazon\\.fr, amazon\\.it and amazon\\.es\\. Certain content that appears on this website comes from Amazon Services LLC and/or Amazon EU\\. This content is provided “as is” and is subject to change or removal at any time\\.$"
# repetitive pattern
citation <- "\\(Incorporated in Item .*\\)"
# repetitive pattern
repeating.phrase <- "Shoutout to Kimberly and Sasha\\(Can they sing Happy Birthday to her\\? PLEASE!\\) What kind of donuts do the Boyz like\\? :DD"
# repetitive pattern
repeating.phrase2 <- "^.* visit www\\.RadioTAGr\\.com/WFUV to TAG this song"
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
rem.amazon.legal <- function (x) gsub(amazon.legal, " ", x)
rem.citation <- function (x) gsub(citation, " ", x)
rem.repeating.phrase <- function (x) gsub(repeating.phrase, " ", x)
rem.repeating.phrase2 <- function (x) gsub(repeating.phrase2, " ", x)
#remove Unicode punctuation and junk
rem.unicode.punct <- function (x) gsub("[“”–—„…€$†•‹›]", " ", x)
#replace Unicode apostrophe with ASCII apostrophe
repl.unicode.apos <- function (x) gsub("[’‘]", "'", x)
# collapse repeated consecutive words into a single occurrence
# (perl = TRUE so the (?:...) non-capturing group is supported)
rem.dupes <- function (x) gsub("\\b(\\w+)(?:\\s+\\1\\b)+", "\\1", x, perl = TRUE)
# collapse repeated consecutive pairs of words into a single occurrence
rem.dupe.pairs <- function(x) gsub("\\b(\\w+\\s\\w+)(\\s\\1)+", "\\1", x, perl = TRUE)
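As a quick sanity check, the de-duplication helpers can be applied to a made-up line (a purely hypothetical example, not taken from the corpus):
# hypothetical input containing a repeated word and a repeated word pair
sample.line <- "this is is is a test of the day after day after day cleanup"
rem.dupes(sample.line)       # collapses "is is is" into a single "is"
rem.dupe.pairs(sample.line)  # collapses the repeated "day after" pair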
# create document corpus
data <- Corpus(DirSource(subsets.dir))
data <- tm_map(data, content_transformer(rem.amazon.legal))
data <- tm_map(data, content_transformer(rem.citation))
data <- tm_map(data, content_transformer(rem.repeating.phrase))
data <- tm_map(data, content_transformer(rem.repeating.phrase2))
data <- tm_map(data, removeNumbers)
data <- tm_map(data, content_transformer(rem.dupes))
data <- tm_map(data, content_transformer(rem.dupe.pairs))
data <- tm_map(data, content_transformer(tolower))
data <- tm_map(data, content_transformer(repl.unicode.apos))
# remove stop words, including contracted forms that appear in the text without apostrophes
data <- tm_map(data, removeWords, c(stopwords("english"), "dont", "didnt",
                                    "wasnt", "wont", "werent", "wouldnt",
                                    "cant", "aint", "doesnt", "lets", "isnt",
                                    "shouldnt", "arent"))
data <- tm_map(data, removeWords, profane.words)
data <- tm_map(data, stemDocument)
data <- tm_map(data, removePunctuation)
data <- tm_map(data, content_transformer(rem.unicode.punct))
data <- tm_map(data, stripWhitespace)
data <- tm_map(data, content_transformer(trim))
Once the text has been cleaned and transformed, we can create the document-term matrices that will be used for the exploratory data analysis below. A document-term matrix splits the text into tokens and stores, for each document in the corpus, how frequently each token occurs.
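As a small aside (using made-up text, not part of the analysis), the structure of a document-term matrix can be seen on a toy corpus of two short documents:
# toy corpus of two tiny "documents" to illustrate the DTM structure
toy.corpus <- Corpus(VectorSource(c("the cat sat on the mat", "the cat ran")))
inspect(DocumentTermMatrix(toy.corpus))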
# create a document-term matrix from single words found in all documents
data.uni.dtm <- DocumentTermMatrix(data)
data.uni.dtm
## <<DocumentTermMatrix (documents: 3, terms: 190786)>>
## Non-/sparse entries: 275127/297231
## Sparsity : 52%
## Maximal term length: 326
## Weighting : term frequency (tf)
# create a document-term matrix for word pairs found in all documents
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
data.bi.dtm <- DocumentTermMatrix(data, control = list(tokenize = BigramTokenizer))
data.bi.dtm
## <<DocumentTermMatrix (documents: 3, terms: 2973734)>>
## Non-/sparse entries: 3396926/5524276
## Sparsity : 62%
## Maximal term length: 122
## Weighting : term frequency (tf)
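For intuition about what the bigram tokenizer produces, it can be applied directly to a short made-up sentence (a hypothetical example):
# RWeka's NGramTokenizer splits the text into overlapping two-word tokens,
# e.g. "thanks for", "for the", "the follow"
BigramTokenizer("thanks for the follow")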
data.frame(dtm = c("words", "word.pairs"),
           term.count = c(data.uni.dtm$ncol, data.bi.dtm$ncol))
## dtm term.count
## 1 words 190786
## 2 word.pairs 2973734
# drop terms with sparsity above 20% (with only 3 documents this keeps terms present in all of them)
data.uni.s.dtm <- removeSparseTerms(data.uni.dtm, .2)
# drop word pairs with sparsity above 20%
data.bi.s.dtm <- removeSparseTerms(data.bi.dtm, .2)
data.frame(dtm = c("words", "word.pairs"),
           term.count = c(data.uni.s.dtm$ncol, data.bi.s.dtm$ncol))
## dtm term.count
## 1 words 29323
## 2 word.pairs 82666
Let’s see the most frequent single words:
total.w.freq <- colSums(as.matrix(data.uni.s.dtm))
w.frequency <- sort(total.w.freq, decreasing = T)
head(w.frequency, 20)
## will said one just get like can time day make love new
## 32326 30778 30645 30334 30165 29791 24647 24502 20959 20605 19596 19510
## year know good now work want see think
## 18641 18212 18117 17873 17003 15913 15745 15206
and the most frequent word pairs:
total.wp.freq <- colSums(as.matrix(data.bi.s.dtm))
wp.frequency <- sort(total.wp.freq, decreasing = T)
head(wp.frequency, 20)
## right now last year look like new york feel like
## 2535 2227 2038 2005 1747
## year ago last night look forward high school last week
## 1666 1605 1552 1476 1406
## first time make sure can get thank follow even though
## 1255 1195 1133 1035 991
## new jersey happi birthday let know one day just got
## 949 903 893 885 857
Now let’s see the least frequent single words:
tail(w.frequency, 20)
## wiltshir winder windham wining withstanding
## 3 3 3 3 3
## witter wittiest wofford womens worded
## 3 3 3 3 3
## worshippers wps xps xwing yellowgold
## 3 3 3 3 3
## yellowrump yohan zebras zeros zwick
## 3 3 3 3 3
and the least frequent word pairs:
tail(wp.frequency, 20)
## york may york world young fan young generat
## 3 3 3 3
## young hors young lover young mani young singl
## 3 3 3 3
## young victim youth happi youth ministry youth unemploy
## 3 3 3 3
## youth use zach galifianakis zack ryder zero day
## 3 3 3 3
## zimmerman call zone catch zone peopl zone way
## 3 3 3 3
Frequency bar chart of the most frequent words (those with more than 10,000 occurrences):
w.df <- data.frame(word=names(w.frequency), frequency=w.frequency)
p <- ggplot(subset(w.df, frequency>10000), aes(word, frequency))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + ggtitle("Most frequent words")
p
Frequency bar chart of the most frequent word pairs (those with more than 700 occurrences):
wp.df <- data.frame(word.pair=names(wp.frequency), frequency=wp.frequency)
p <- ggplot(subset(wp.df, frequency>700), aes(word.pair, frequency))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + ggtitle("Most frequent word pairs")
p
In each table below, the top row lists word / word-pair frequencies and the bottom row shows how many words / word pairs occur with that frequency.
Lowest word frequencies vs. word count per frequency:
head(table(w.frequency), 20)
## w.frequency
## 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 879 1116 1240 1174 1061 992 918 833 791 723 660 669 569 548 493
## 18 19 20 21 22
## 466 451 415 403 386
Greatest word frequencies vs. word count per frequency:
tail(table(w.frequency), 20)
## w.frequency
## 15206 15745 15913 17003 17873 18117 18212 18641 19510 19596 20605 20959
## 1 1 1 1 1 1 1 1 1 1 1 1
## 24502 24647 29791 30165 30334 30645 30778 32326
## 1 1 1 1 1 1 1 1
Lowest word pair frequencies vs. word pair count per frequency:
head(table(wp.frequency), 20)
## wp.frequency
## 3 4 5 6 7 8 9 10 11 12 13 14
## 10289 10557 8875 7172 5964 4804 3992 3235 2813 2378 2007 1715
## 15 16 17 18 19 20 21 22
## 1519 1295 1124 1094 946 867 790 653
Greatest word pair frequencies vs. word pair count per frequency:
tail(table(wp.frequency), 20)
## wp.frequency
## 857 885 893 903 949 991 1035 1133 1195 1255 1406 1476 1552 1605 1666
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1747 2005 2038 2227 2535
## 1 1 1 1 1
This analysis helps to get a feel for the data, gives an understanding of issues that may arise while parsing and processing it, sets expectations about its contents, and will help in building a predictive model at later stages.