Data Science Capstone Project
library(tidyverse)    # readr, dplyr, tibble, tidyr, ggplot2, forcats
library(tm)           # text mining: corpus handling and cleaning
library(wordcloud)
library(RColorBrewer)
url_f <-
  "https://d396qusza40orc.cloudfront.net/dsscapstone/corpusset/Coursera-SwiftKey.zip"
file_corpus <- "Coursera-SwiftKey.zip"
# download.file(url = url_f, destfile = file_corpus)
file <- list.files(pattern = "zip", full.names = TRUE)
unzip(file)
folder <- "final"
files_corpus <- list.files(folder, full.names = TRUE, recursive = TRUE)
files_corpus_sh <- list.files(folder, full.names = FALSE, recursive = TRUE)
files_corpus_sh <- gsub("^(.*)/", "", files_corpus_sh)
files_corpus_info <- file.info(files_corpus)
files_corpus_info$size <- round(files_corpus_info$size / 1e6, 1)
files_corpus_info <- rename(files_corpus_info, size_Mb = size)
files_corpus_info[, c("size_Mb", "mtime")]
The total size of the corpus is 1,408.9 Mb across 12 files.
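This figure can be reproduced from the file information table above (a quick check, nothing more):
prettyNum(sum(files_corpus_info$size_Mb), big.mark = ",")   # total size in Mb
nrow(files_corpus_info)                                     # number of files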
The filename and the first line of each file, truncated to 60 characters, are shown below:
sLen <- 60   # truncate each preview line to 60 characters
corpus_sm <- sapply(files_corpus, read_lines, n_max = 1)
enframe(substr(corpus_sm, 1, sLen))
Find the US (English) files only and load them into memory.
files_corpus_US <- files_corpus[grepl("US", files_corpus)]
files_corpus_sh_US <- files_corpus_sh[grepl("US", files_corpus_sh)]
corpus <- lapply(files_corpus_US, read_lines)
names(corpus) <- files_corpus_sh_US
corpus_size1 <- object.size(corpus)
The files en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt are loaded; together they use 831.7 Mb of memory.
Each file has the following number of rows:
enframe(prettyNum(sapply(corpus, length), big.mark = ","),
        name = "file", value = "rows")
Sampling 1/10 of the rows for more efficient processing.
sam <- 10   # keep 1/sam of the rows
corpus <- lapply(corpus, function(x) {
  len <- length(x)
  samSize <- round(len / sam)
  x[sample(len, samSize)]
})
corpus_size2 <- object.size(corpus)
The following numbers of randomly selected rows from each file are now used:
enframe(prettyNum(sapply(corpus, length), big.mark = ","),
        name = "file", value = "rows")
The sampled corpus now uses 83.4 Mb of memory.
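Both memory figures come directly from the object sizes recorded above; a minimal check:
format(corpus_size1, units = "Mb")   # full corpus
format(corpus_size2, units = "Mb")   # 1/10 sample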
Using the tm package, the corpus is cleaned:
corpus <- VCorpus(VectorSource(corpus))
# Lowercase first so that capitalised stop words are also removed
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
names(corpus) <- gsub("en_US\\.|\\.txt", "", files_corpus_sh_US)
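As a spot check on the cleaning, the first 60 characters of each cleaned document can be previewed (a minimal sketch using tm's content() accessor):
setNames(sapply(seq_along(corpus),
                function(i) substr(content(corpus[[i]])[1], 1, 60)),
         names(corpus))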
Dictionaries of unigrams, bigrams and trigrams are built and their distributions are examined.
The wordcloud package is used to illustrate the most common word frequencies for each document.
for (i in seq_along(corpus)) {
  nm <- names(corpus)[i]
  wordcloud(corpus[i], max.words = 80, colors = brewer.pal(8, "Accent"),
            random.order = FALSE, rot.per = 0.45, scale = c(3, 1))
  title(nm)
}
Also, a bar chart of the most frequent unigrams, segmented by source, shows the overall distribution.
tdm <- as.matrix(TermDocumentMatrix(corpus))
tdm <- as_tibble(tdm, rownames = "word")
tdm <- tdm %>%
  mutate(all = blogs + news + twitter) %>%
  arrange(desc(all))
tdm_wordLevels <- unique(tdm$word)
tdm_top <- tdm[1:20, c("word", "blogs", "news", "twitter")] %>%
  gather(key = "source", value = "count", 2:4)
tdm_top$word <- as_factor(tdm_top$word)
p <- ggplot(data = tdm_top, aes(x = word, y = count, fill = source)) +
  geom_col() +
  scale_fill_brewer(palette = "Accent") +
  theme_bw() + theme(legend.position = c(0.9, 0.8))
print(p)
bigramTokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}
tdm_bigrams <- as.matrix(
  TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer)))
tdm_bigrams <- as_tibble(tdm_bigrams, rownames = "bigram")
tdm_bigrams <- tdm_bigrams %>%
  mutate(all = blogs + news + twitter) %>%
  arrange(desc(all))
tdm_bigrams[1:20,]
trigramTokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
}
tdm_trigrams <- as.matrix(
  TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer)))
tdm_trigrams <- as_tibble(tdm_trigrams, rownames = "trigram")
tdm_trigrams <- tdm_trigrams %>%
  mutate(all = blogs + news + twitter) %>%
  arrange(desc(all))
tdm_trigrams[1:20,]
A small fraction of the words in a frequency-sorted dictionary is enough to cover most of the total word instances in the corpus.
tdm <- tdm %>% mutate(runningCount = cumsum(all))
totalUnigrams <- sum(tdm$all)
cover50 <- which.min(abs(tdm$runningCount - 0.5 * totalUnigrams))
cover90 <- which.min(abs(tdm$runningCount - 0.9 * totalUnigrams))
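For reference, these indices can be summarised in a small table (a sketch; the exact counts depend on the random sample drawn):
# Unique unigrams needed to cover 50% and 90% of all word instances
tibble(coverage = c("50%", "90%"),
       words_needed = prettyNum(c(cover50, cover90), big.mark = ","),
       share_of_dictionary = paste0(round(100 * c(cover50, cover90) / nrow(tdm), 1), "%"))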
Some of the unigrams found obviously come from foreign languages. These could be identified programmatically by excluding entries that contain non-ASCII characters; however, they do not make up a large enough share of the corpus to warrant adding them to a stopword list.
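A sketch of such a check, assuming a Perl-style regular expression over the unigram dictionary built above:
# Flag dictionary entries containing non-ASCII characters
nonAscii <- grepl("[^\\x01-\\x7F]", tdm$word, perl = TRUE)
sum(nonAscii)                            # number of affected unigrams
sum(tdm$all[nonAscii]) / totalUnigrams   # their share of all word instances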
As the coverage estimates show, a substantially reduced dictionary could be used in practice to produce similar results.
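A sketch of such a reduction, keeping only the terms needed for roughly 90% coverage:
# Prune the frequency-sorted dictionary at the 90% coverage point
tdm_reduced <- tdm[seq_len(cover90), ]
nrow(tdm_reduced) / nrow(tdm)   # fraction of the dictionary retained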