We will perform exploratory data analysis on the course corpus, which consists of three files: the blogs, news and twitter text files.
We first load the text with readLines and create a tm Corpus. Because the files are large, we sample about half of each file with Linux command-line tools and save the results as _subset.txt files.
filePath <- "~/core/code/R/coursera_R/capstone/Coursera-SwiftKey/final/en_US/"
text <- readLines(filePath)
stri_stats_general(text)
## Lines LinesNEmpty Chars CharsNWhite
## 0 0 0 0
docs <- Corpus(VectorSource(text))
We can subset with standard Unix tools ('head', 'tail') and save the files in a directory called subset, keeping roughly half of each file.
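For reference, here is a rough R equivalent of that command-line step (the 50% cut-off, the file pattern and the output names are assumptions, not the exact commands used):
inDir <- "~/core/code/R/coursera_R/capstone/Coursera-SwiftKey/final/en_US/"
outDir <- file.path(inDir, "subset")
dir.create(outDir, showWarnings = FALSE)
for (f in list.files(inDir, pattern = "\\.txt$", full.names = TRUE)) {
  lines <- readLines(f, skipNul = TRUE)
  keep <- lines[seq_len(floor(length(lines) / 2))]   # roughly the first half of each file
  out <- file.path(outDir, paste0(tools::file_path_sans_ext(basename(f)), "_subset.txt"))
  writeLines(keep, out)
}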
filepath <- file.path("~/core/code/R/coursera_R/capstone/Coursera-SwiftKey/final/en_US/subset")
docs <- Corpus(DirSource(filepath))
We then clean up the data sequentially: first we convert special characters such as '/', '@' and '|' to spaces, convert everything to lowercase and remove numbers. Next we remove frequently used English stop words such as 'the', 'for' and 'of'. Finally we remove all punctuation, emoticons and other special characters, and strip extra whitespace.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Keep only letters, digits, apostrophes, slashes and spaces; wrap in content_transformer so tm_map preserves the corpus
stripSpChar <- content_transformer(function(x) gsub("[^0-9A-Za-z/' ]", "", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
docs <- tm_map(docs, toSpace, "–")
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeWords, c("may"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, stripSpChar)
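As a quick sanity check (not part of the original report output), we can peek at the start of the first cleaned document:
writeLines(head(content(docs[[1]]), 2))   # first two lines of the first cleaned document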
Next we build a frequency distribution of words and plot the 10 most frequent words.
dtm <- DocumentTermMatrix(docs)
tdm <- TermDocumentMatrix(docs)
m <- as.matrix(tdm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## thank thank 796
## good good 771
## one one 632
## love love 571
## just just 520
## time time 517
## thanks thanks 463
## can can 456
## like like 442
## get get 360
barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word, main = "Frequency distribution of words")
Finding the most common word pairs requires building the document-term matrix a little differently. Here are the 30 most frequent word pairs.
filepath <- file.path("~/core/code/R/coursera_R/capstone/Coursera-SwiftKey/final/en_US/subset/combo")
docs2 <- Corpus(DirSource(filepath))
docs2 <- tm_map(docs2, toSpace, "/")
docs2 <- tm_map(docs2, toSpace, "@")
docs2 <- tm_map(docs2, toSpace, "\\|")
docs2 <- tm_map(docs2, toSpace, "–")
docs2 <- tm_map(docs2, content_transformer(tolower))
docs2 <- tm_map(docs2, removeNumbers)
docs2 <- tm_map(docs2, removeWords, stopwords("english"))
docs2 <- tm_map(docs2, removePunctuation)
docs2 <- tm_map(docs2, stripWhitespace)
library(quanteda)
corp <- corpus(docs2)                       # convert the tm corpus to a quanteda corpus
toks <- tokens(corp, remove_punct = TRUE)   # tokenize, dropping punctuation
bigrams <- tokens_ngrams(toks, n = 2)       # build word pairs (bigrams)
dfm2 <- dfm(bigrams)
top_pairs <- topfeatures(dfm2, 30)          # 30 most frequent word pairs
barplot(rev(top_pairs), las = 2, main = "30 most frequent word pairs")