library(tm)
library(ggplot2)
library(SnowballC)
library(dplyr)
library(ngram)
# Download the SwiftKey data set, unpack it and read the three English text
# files (twitter, blogs, news) into character vectors, one element per line.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# The archive is large, so it is only fetched when it is not on disk yet.
if (!file.exists("Coursera-SwiftKey.zip")) {
  # mode = "wb" transfers the zip as binary so it is not corrupted on Windows.
  download.file(url = url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
# Unpack only when at least one of the three input files is missing.
if (!all(file.exists(c("final/en_US/en_US.twitter.txt",
                       "final/en_US/en_US.blogs.txt",
                       "final/en_US/en_US.news.txt")))) {
  unzip("Coursera-SwiftKey.zip")
}
# The connections are opened in binary mode ("rb"): a text-mode read on
# Windows can end early at a control character embedded in the text.
# NOTE(review): the line count recorded for the news file looks like such a
# truncated read - confirm after re-running.
# close() accepts a single connection, so each one is closed right after use.
con1 <- file("final/en_US/en_US.twitter.txt", open = "rb")
twitter <- readLines(con = con1, n = -1, skipNul = TRUE)
close(con1)
con2 <- file("final/en_US/en_US.blogs.txt", open = "rb")
blogs <- readLines(con = con2, n = -1, skipNul = TRUE)
close(con2)
con3 <- file("final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con = con3, n = -1, skipNul = TRUE)
close(con3)
# Summary table of the three raw text vectors: total number of characters,
# number of lines (elements) and size in Mb.
# The character totals are stored in `n_chars` instead of a variable called
# `nchar`, so the base function name is not masked in the workspace; the
# data frame column is still named "nchar".
n_chars <- c(sum(nchar(twitter)), sum(nchar(blogs)), sum(nchar(news)))
elements <- c(length(twitter), length(blogs), length(news))
# The sizes are entered by hand; they are not measured by this script.
size_Mb <- c(301.4, 248.5, 19.2)
df <- data.frame(nchar = n_chars, elements, size_Mb)
rownames(df) <- c("twitter", "blogs", "news")
df
## nchar elements size_Mb
## twitter 162385035 2360148 301.4
## blogs 208361438 899288 248.5
## news 15683765 77259 19.2
The table above shows that the text files are large, the twitter file being the largest at 301 Mbytes. To save computing time, the text files are subsetted so that 10 % of the entries in each file are included in the exploratory data analysis.
# Draw a 10 % random sample (without replacement) of the lines of each text
# vector and write every sample to its own file under final/subfiles/.
# A fixed seed makes the sample, and every result derived from it, repeatable.
set.seed(1234)
# floor() makes the whole-number sample size explicit.
subnews <- sample(news, size = floor(0.1 * length(news)), replace = FALSE)
subtwitter <- sample(twitter, size = floor(0.1 * length(twitter)), replace = FALSE)
subblogs <- sample(blogs, size = floor(0.1 * length(blogs)), replace = FALSE)
# writeLines() cannot write into a directory that does not exist, so the
# target directory is created first (a no-op when it is already there).
dir.create("final/subfiles", recursive = TRUE, showWarnings = FALSE)
writeLines(subnews, "final/subfiles/subnews.txt")
writeLines(subtwitter, "final/subfiles/subtwitter.txt")
writeLines(subblogs, "final/subfiles/subblogs.txt")
A corpus is created from the three subsetted text files.
# Build the corpus from every file in final/subfiles/, read as UTF-8 and
# tagged as English text.
corpus <- Corpus(
  DirSource(directory = "final/subfiles/", encoding = "UTF-8"),
  readerControl = list(language = "en")
)
The corpus is preprocessed in the following ways: non-ASCII characters, punctuation, numbers, stopwords and extra whitespace are removed, and upper case letters are transformed into lower case letters. Stemming is also performed.
# Clean the corpus in one pipeline; the steps are applied from top to bottom.
corpus <- corpus %>%
  # drop every character that cannot be represented in ASCII
  tm_map(content_transformer(
    function(txt) iconv(txt, from = "latin1", to = "ASCII", sub = "")
  )) %>%
  # strip punctuation marks
  tm_map(content_transformer(removePunctuation)) %>%
  # strip digits
  tm_map(content_transformer(removeNumbers)) %>%
  # fold everything to lower case
  tm_map(content_transformer(tolower)) %>%
  # drop English stopwords
  tm_map(removeWords, stopwords("english")) %>%
  # collapse runs of whitespace
  tm_map(content_transformer(stripWhitespace)) %>%
  # reduce every word to its stem
  tm_map(stemDocument)
The corpus is transformed into a document-term matrix. The matrix has over 136000 unique terms (= words).
# Count how often every term occurs in every document of the cleaned corpus.
dtm_corpus <- DocumentTermMatrix(x = corpus)
# Print a summary of the matrix together with a sample of its entries.
inspect(x = dtm_corpus)
## <<DocumentTermMatrix (documents: 3, terms: 136000)>>
## Non-/sparse entries: 178610/229390
## Sparsity : 56%
## Maximal term length: 107
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs can day get just like love make one time will
## 1 9842 7056 9420 10163 10765 6393 8138 13362 10483 11879
## 2 432 304 457 402 462 102 393 606 536 867
## 3 8973 10891 14695 15114 13181 12398 7209 8721 8483 9426
The plot below presents the top 21 most frequent words in the corpus. The words ‘just’, ‘get’ and ‘like’ are the three most frequent, each with approximately 25000 occurrences. Even the last three words, ‘want’, ‘think’ and ‘look’, have a total count over 12000.
# Only the most frequent words are of interest, so terms whose sparsity
# exceeds 0.3 are dropped before the matrix is converted to a dense one.
frq_corpus <- removeSparseTerms(dtm_corpus, 0.3)
freq_data <- as.data.frame(as.matrix(frq_corpus))
# One row per remaining term, with its count summed over all documents.
total_frq <- data.frame(
  word = names(freq_data),
  total_count = colSums(freq_data),
  row.names = NULL,
  stringsAsFactors = FALSE
)
# The words are ranked and the first 21 are kept, so the number of bars always
# matches the plot title; a fixed count cut-off would return a different
# number of words for every random sample.
n_top_words <- 21
total_frq %>%
  arrange(desc(total_count)) %>%
  head(n_top_words) %>%
  ggplot(aes(x = reorder(word, total_count), y = total_count)) +
  geom_col(fill = "blue", width = 0.7, alpha = 0.4) +
  coord_flip() +
  labs(x = "word", title = "Total Counts of Top 21 Words in Corpora")
The plot below presents the proportion of the corpus that is covered by the most frequent unique words. It shows that approximately 400 of the most frequently occurring words cover 50 % of the corpus. Approximately 3100 of the most frequently occurring words are needed to cover 90 % of the corpus.
The plot below shows the frequencies of the top 20 most common 2-grams in the corpus. ‘right now’, ‘look like’ and ‘last night’ are the three most common 2-grams, with frequencies over 1500.
# Join the text of all documents in the corpus into one input for ngram().
corpus2 <- concatenate(lapply(corpus, "[", 1))
# Tokenise into 2-grams and tabulate how often each one occurs.
two_grams <- ngram(corpus2, n = 2)
phrase_table <- get.phrasetable(two_grams)
# The 2-grams are ranked and the first 20 are kept, so the number of bars
# always matches the plot title; a fixed frequency cut-off would return a
# different number of 2-grams for every random sample.
n_top_2grams <- 20
phrase_table %>%
  arrange(desc(freq)) %>%
  head(n_top_2grams) %>%
  ggplot(aes(x = reorder(ngrams, freq), y = freq)) +
  geom_col(fill = "blue", width = 0.7, alpha = 0.4) +
  coord_flip() +
  labs(x = "2-gram", y = "total count",
       title = "Total Counts of Top 20 2-Grams in Corpora")
The plot below shows the frequencies of the top 21 most common 3-grams in the corpus. The most common 3-gram is ‘happi mother day’ with a frequency over 340. The next two most common 3-grams are ‘let us know’ and ‘happi new year’ with frequencies over 240.
# Tokenise the concatenated corpus text (corpus2) into 3-grams and tabulate
# how often each one occurs.
three_grams <- ngram(corpus2, n = 3)
phrase_table_2 <- get.phrasetable(three_grams)
# The 3-grams are ranked and the first 21 are kept, so the number of bars
# always matches the plot title; a fixed frequency cut-off would return a
# different number of 3-grams for every random sample.
n_top_3grams <- 21
phrase_table_2 %>%
  arrange(desc(freq)) %>%
  head(n_top_3grams) %>%
  ggplot(aes(x = reorder(ngrams, freq), y = freq)) +
  geom_col(fill = "blue", width = 0.7, alpha = 0.4) +
  coord_flip() +
  labs(x = "3-gram", y = "total count",
       title = "Total Counts of Top 21 3-Grams in Corpora")