The goal of this report is to load a text data set, clean it, perform exploratory data analysis, and create unigrams, bigrams and trigrams. The ultimate goal is to prepare for developing a Shiny application that performs next-word prediction.
First, the global environment is cleared, the working directory is set, and the required packages are loaded.
rm(list = ls())
setwd("~/Documents/classes/dataScSpec/capstone")
require(RWeka)
require(ggplot2)
require(stringi)
require(tm)
require(SnowballC)
require(wordcloud)
require(dplyr)
require(knitr)
Then the training data set archive is downloaded and unzipped. It contains training data in 4 languages: Finnish, Russian, English and German. This report focuses on the English data set, whose folder holds 3 files with samples from blogs, Twitter messages and news. All 3 are read into separate variables.
# URL for the data file
file <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# downloading and unzipping the file
if (!file.exists("Coursera-SwiftKey.zip")) {
download.file(file, destfile="Coursera-SwiftKey.zip", method="curl")
}
unzip("Coursera-SwiftKey.zip")
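# Optional sanity check: list the extracted files to confirm the expected
# layout (a ./final/ directory with one sub-folder per language, en_US among them)
list.files("./final", recursive = TRUE)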
# reading US twitter file
twCon <- file("./final/en_US/en_US.twitter.txt", "r")
tw <- readLines(con = twCon, skipNul = TRUE)
close(twCon)
# reading US blogs file
blCon <- file("./final/en_US/en_US.blogs.txt", "r")
bl <- readLines(con = blCon, skipNul = TRUE)
close(blCon)
# reading US news file
nCon <- file("./final/en_US/en_US.news.txt", "r")
news <- readLines(con = nCon, skipNul = TRUE)
close(nCon)
Preliminary exploratory data analysis is performed by checking whether there are elements with no data. The size of each data set in memory, along with its word count and line count, is also calculated and shown in the table below.
# Are there lines with missing data?
sum(bl == "")
## [1] 0
sum(tw == "")
## [1] 0
sum(news == "")
## [1] 0
kable(data.frame(
File = c("twitter", "blog", "news"),
File.Size.In.MB = round(c(object.size(tw) / 1024 ^ 2,
object.size(bl) / 1024 ^ 2,
object.size(news)/ 1024 ^ 2), 1),
Word.Count = c(
sum(stri_count(tw,regex="\\S+")),
sum(stri_count(bl,regex="\\S+")),
sum(stri_count(news,regex="\\S+"))),
Line.Count = c(length(tw), length(bl), length(news))
), col.names = c("File", "Size in memory (MB)", "Word count", "Line count"))
| File | Size in memory (MB) | Word count | Line count |
|---|---|---|---|
| twitter | 301.4 | 30373583 | 2360148 |
| blog | 248.5 | 37334131 | 899288 |
| news | 249.6 | 34372530 | 1010242 |
Since each file of the training data set is large, a random sample of 1 in 1000 lines (0.1%) was drawn from each file. The samples were combined into a single vector, which was saved to disk for easier retrieval later.
sampleFactor <- 1000
set.seed(1)
twSample <- sample(tw, length(tw) / sampleFactor)
blSample <- sample(bl, length(bl) / sampleFactor)
nSample <- sample(news, length(news) / sampleFactor)
combSample <- c(twSample, blSample, nSample)
# Saving combined sample, cleaning memory
writeLines(combSample, "combSample.txt")
rm(tw, bl, news, twSample, blSample, nSample)
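As an optional check, the size of the combined sample can be compared with the figures in the table above to confirm how much the sampling reduced the data.
# Optional check of the combined sample size
length(combSample)
format(object.size(combSample), units = "Mb")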
The sample of the training data set is read back and converted into a corpus. The corpus is then cleaned by removing punctuation and numbers and converting uppercase letters to lowercase; after that, English stop words are removed and extra white space is stripped. Since the samples may contain profanity, and it is undesirable for the prediction algorithm to suggest profane words, these also need to be removed from the corpus. A list of profanity words was downloaded from https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/badwordslist/badwords.txt and those words were eliminated from the corpus.
# reading saved sample
sampCon <- file("./combSample.txt", "r")
combSample <- readLines(sampCon)
close(sampCon)
sampCorp <- VCorpus(VectorSource(list(combSample)))
sampCorp <- tm_map(sampCorp, removePunctuation)
sampCorp <- tm_map(sampCorp, removeNumbers)
sampCorp <- tm_map(sampCorp, content_transformer(tolower))
sampCorp <- tm_map(sampCorp, removeWords, stopwords("english"))
sampCorp <- tm_map(sampCorp, stripWhitespace)
# stemming is left disabled so that complete words are kept
#sampCorp <- tm_map(sampCorp, stemDocument)
# remove profanity
profFile <- "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/badwordslist/badwords.txt"
if (!file.exists("badwords.txt")) {
download.file(profFile, destfile="badwords.txt", method="curl")
}
profCon <- file("./badwords.txt", "r")
prof <- readLines(profCon)
close(profCon)
# strip asterisks and parentheses from the bad-words list entries
prof_new <- gsub("[*()]", "", prof)
sampCorp <- tm_map(sampCorp, removeWords, prof_new)
rm(prof, prof_new)
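As an optional sanity check (not part of the original processing), the first few lines of the cleaned document can be printed to confirm that the transformations behaved as expected.
# Peek at the first few lines of the cleaned corpus (optional)
head(content(sampCorp[[1]]), 3)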
The most frequently occurring words are depicted as a word cloud.
wordcloud(sampCorp, min.freq=110, max.words=80, colors=brewer.pal(8, "Set1"))
Unigrams, bigrams and trigrams are built using the RWeka and tm packages.
tokenizer1 <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
tdm1 <- TermDocumentMatrix(sampCorp, control = list(tokenize = tokenizer1))
tokenizer2 <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
tdm2 <- TermDocumentMatrix(sampCorp, control = list(tokenize = tokenizer2))
tokenizer3 <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
tdm3 <- TermDocumentMatrix(sampCorp, control = list(tokenize = tokenizer3))
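Before plotting, the dimensions of the three term-document matrices can be checked (optional); the number of rows gives the number of distinct unigrams, bigrams and trigrams found in the sample.
# Number of distinct n-grams (rows) and documents (columns) in each matrix
dim(tdm1)
dim(tdm2)
dim(tdm3)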
The 20 most frequent unigrams, bigrams and trigrams are plotted below.
freqTerms1 <- findFreqTerms(tdm1)
termFreq1 <- rowSums(as.matrix(tdm1[freqTerms1, ]))
termFreq1 <- data.frame(unigram=names(termFreq1), frequency=termFreq1)
topFreq1 <- termFreq1 %>% arrange(desc(frequency)) %>% head(20)
topFreq1$unigram <- factor(as.character(topFreq1$unigram),
levels = (as.character(topFreq1$unigram)))
p1 <- ggplot(topFreq1, aes(x=unigram, y=frequency)) +
geom_bar(stat = "identity") +
xlab("Unigram") + ylab("Frequency") +
labs(title = "Top 20 Unigrams by Frequency") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p1)
freqTerms2 <- findFreqTerms(tdm2)
termFreq2 <- rowSums(as.matrix(tdm2[freqTerms2, ]))
termFreq2 <- data.frame(bigram=names(termFreq2), frequency=termFreq2)
topFreq2 <- termFreq2 %>% arrange(desc(frequency)) %>% head(20)
topFreq2$bigram <- factor(as.character(topFreq2$bigram),
levels = (as.character(topFreq2$bigram)))
p2 <- ggplot(topFreq2, aes(x=bigram, y=frequency)) +
geom_bar(stat = "identity") +
xlab("Bigram") + ylab("Frequency") +
labs(title = "Top 20 Bigrams by Frequency") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p2)
freqTerms3 <- findFreqTerms(tdm3)
termFreq3 <- rowSums(as.matrix(tdm3[freqTerms3, ]))
termFreq3 <- data.frame(trigram=names(termFreq3), frequency=termFreq3)
topFreq3 <- termFreq3 %>% arrange(desc(frequency)) %>% head(20)
topFreq3$trigram <- factor(as.character(topFreq3$trigram),
levels = (as.character(topFreq3$trigram)))
p3 <- ggplot(topFreq3, aes(x=trigram, y=frequency)) +
geom_bar(stat = "identity") +
xlab("Trigram") + ylab("Frequency") +
labs(title = "Top 20 Trigrams by Frequency") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p3)
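Since the ultimate goal is a Shiny application for next-word prediction, the n-gram frequency tables can be saved for reuse when building the prediction model; the file names below are only illustrative.
# Save the n-gram frequency tables for later use in the prediction model
# (file names are illustrative)
saveRDS(termFreq1, "unigramFreq.rds")
saveRDS(termFreq2, "bigramFreq.rds")
saveRDS(termFreq3, "trigramFreq.rds")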