library(tm)
library(ggplot2)
library(SnowballC)
library(dplyr)
library(ngram)

Loading Text-files

# Fetch the Coursera-SwiftKey archive and unpack it into the working directory.
# The archive is only downloaded when it is not already present, so re-running
# the script does not repeat the transfer.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_file)) {
  # mode = "wb": a zip archive is binary data and must be written byte-for-byte.
  download.file(url = url, destfile = zip_file, mode = "wb")
}
unzip(zip_file)

Reading and Analysing the Text Files

# Read every line of a text file and return it as a character vector.
#
# The connection is opened explicitly in binary mode ("rb"): a text-mode read
# can stop early when the file contains an embedded end-of-file control
# character (notably Ctrl-Z on Windows), silently truncating the data.
# on.exit() guarantees the connection is closed even if readLines() fails.
# close() accepts a single connection, so each one is closed individually here
# rather than passing several connections to one close() call.
read_all_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con = con, n = -1, skipNul = TRUE)
}

twitter <- read_all_lines("final/en_US/en_US.twitter.txt")
blogs <- read_all_lines("final/en_US/en_US.blogs.txt")
news <- read_all_lines("final/en_US/en_US.news.txt")


# Per-source summary table: total number of characters, number of lines read,
# and size in Mb (the sizes are entered by hand, not measured here).
nchar <- vapply(
  list(twitter, blogs, news),
  function(txt) sum(nchar(txt)),
  integer(1)
)
elements <- lengths(list(twitter, blogs, news))
size_Mb <- c(301.4, 248.5, 19.2)
df <- data.frame(
  nchar,
  elements,
  size_Mb,
  row.names = c("twitter", "blogs", "news")
)
df
##             nchar elements size_Mb
## twitter 162385035  2360148   301.4
## blogs   208361438   899288   248.5
## news     15683765    77259    19.2

Subsetting

The table above shows that the text files are large; the twitter file is the largest at about 301 Mb. To save computing time, each text file is subsetted so that 10 % of its entries are included in the exploratory data analysis.

# Draw a 10 % random sample (without replacement) of the lines of each source.
# The seed is fixed so that the sample, and every figure derived from it, is
# the same on each run.
set.seed(1234)
# floor() makes the sample size an explicit whole number of lines.
subnews <- sample(news, size = floor(0.1 * length(news)), replace = FALSE)
subtwitter <- sample(twitter, size = floor(0.1 * length(twitter)), replace = FALSE)
subblogs <- sample(blogs, size = floor(0.1 * length(blogs)), replace = FALSE)

# writeLines() cannot create missing directories, so make sure the output
# folder exists before writing the subsets (no-op when it is already there).
dir.create("final/subfiles", recursive = TRUE, showWarnings = FALSE)

writeLines(subnews, "final/subfiles/subnews.txt")
writeLines(subtwitter, "final/subfiles/subtwitter.txt")
writeLines(subblogs, "final/subfiles/subblogs.txt")

A corpus is created from the three subsetted text files.

# Build a corpus with one document per file found in the subset directory;
# the files are read as UTF-8 and the documents are tagged as English.
corpus <- Corpus(
  x = DirSource(directory = "final/subfiles/", encoding = "UTF-8"),
  readerControl = list(language = "en")
)

Preprocessing

The corpus is preprocessed as follows: non-ASCII characters, punctuation, numbers, stopwords and extra whitespace are removed, and upper-case letters are converted to lower case. Stemming is also performed.

# Cleaning pipeline. The order of the steps matters: text is lower-cased and
# stopwords are removed BEFORE punctuation is stripped, because the English
# stopword list contains contractions written with an apostrophe (e.g.
# "don't", "i'm"). If punctuation is removed first these turn into "dont" /
# "im", no longer match the list, and are left in the corpus.

# remove non-ASCII characters
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = "")))

# transform upper case letters to lower case letters
corpus <- tm_map(corpus, content_transformer(tolower))

# remove stopwords (while apostrophes are still present, see above)
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# remove punctuation
corpus <- tm_map(corpus, content_transformer(removePunctuation))

# remove numbers
corpus <- tm_map(corpus, content_transformer(removeNumbers))

# remove whitespace (collapses the gaps left by the removals above)
corpus <- tm_map(corpus, content_transformer(stripWhitespace))

# stem document
corpus <- tm_map(corpus, stemDocument)

The corpus is transformed into a document-term matrix. The matrix has over 136000 unique terms (= words).

# Count term occurrences per document, then print a summary of the
# resulting document-term matrix.
dtm_corpus <- corpus %>% DocumentTermMatrix()

inspect(x = dtm_corpus)
## <<DocumentTermMatrix (documents: 3, terms: 136000)>>
## Non-/sparse entries: 178610/229390
## Sparsity           : 56%
## Maximal term length: 107
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs  can   day   get  just  like  love make   one  time  will
##    1 9842  7056  9420 10163 10765  6393 8138 13362 10483 11879
##    2  432   304   457   402   462   102  393   606   536   867
##    3 8973 10891 14695 15114 13181 12398 7209  8721  8483  9426

Most Common Words in Corpora

The plot below presents the 21 most frequent words in the corpus. The words ‘just’, ‘get’ and ‘like’ are the three most frequent, with approximately 25000 occurrences each. Even the last three words, ‘want’, ‘think’ and ‘look’, each have a total count of over 12000.

# Only the most frequent words are of interest, so sparse terms are removed
# from the document-term matrix before it is converted to a dense table.
frq_corpus <- removeSparseTerms(dtm_corpus, 0.3)

freq_data <- as.data.frame(as.matrix(frq_corpus))

# One row per word with its count summed over all documents.
total_frq <- data.frame(
  word = names(freq_data),
  total_count = colSums(freq_data),
  row.names = NULL,
  stringsAsFactors = FALSE
)

# Pick the 21 most frequent words by rank rather than by a fixed count
# cut-off: the counts depend on the random sample, so a hard-coded threshold
# does not reliably yield the "top 21" promised by the plot title.
total_frq %>%
  arrange(desc(total_count)) %>%
  head(21) %>%
  ggplot(aes(x = reorder(word, total_count), y = total_count)) +
  geom_col(fill = "blue", width = 0.7, alpha = 0.4) +
  coord_flip() +
  labs(x = "word", title = "Total Counts of Top 21 Words in Corpora")

Coverage of the Corpora by Unique Words

The plot below shows what proportion of the corpus is covered by the most frequent unique words. Approximately 400 of the most frequently occurring words cover 50 % of the corpus, and approximately 3100 of the most frequently occurring words are needed to cover 90 % of the corpus.

Frequencies of the Most Common Two- and Three-grams in the Corpus

The plot below shows the frequencies of the 20 most common 2-grams in the corpus. ‘right now’, ‘look like’ and ‘last night’ are the three most common 2-grams, each with a frequency over 1500.

# Flatten the cleaned corpus into a single string for the ngram package.
# NOTE(review): `"[", 1` keeps only the first element of each document;
# presumably that element holds the whole document text - confirm.
corpus2 <- concatenate(lapply(corpus, "[", 1))

# Tokenise into 2-grams and tabulate how often each one occurs.
two_grams <- ngram(corpus2, n = 2)

phrase_table <- get.phrasetable(two_grams)

# Take the 20 most frequent 2-grams by rank rather than by a fixed frequency
# cut-off: frequencies depend on the random sample, so a hard-coded threshold
# does not reliably yield the "top 20" promised by the plot title.
phrase_table %>%
  arrange(desc(freq)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(ngrams, freq), y = freq)) +
  geom_col(fill = "blue", width = 0.7, alpha = 0.4) +
  coord_flip() +
  labs(
    x = "2-gram", y = "total count",
    title = "Total Counts of Top 20 2-Grams in Corpora"
  )

The plot below shows the frequencies of the 21 most common 3-grams in the corpus. The most common 3-gram is ‘happi mother day’, with a frequency over 340. The next two most common 3-grams are ‘let us know’ and ‘happi new year’, with frequencies over 240.

# Tokenise the flattened corpus into 3-grams and tabulate their frequencies.
three_grams <- ngram(corpus2, n = 3)

phrase_table_2 <- get.phrasetable(three_grams)

# Take the 21 most frequent 3-grams by rank rather than by a fixed frequency
# cut-off: frequencies depend on the random sample, so a hard-coded threshold
# does not reliably yield the "top 21" promised by the plot title.
phrase_table_2 %>%
  arrange(desc(freq)) %>%
  head(21) %>%
  ggplot(aes(x = reorder(ngrams, freq), y = freq)) +
  geom_col(fill = "blue", width = 0.7, alpha = 0.4) +
  coord_flip() +
  labs(
    x = "3-gram", y = "total count",
    title = "Total Counts of Top 21 3-Grams in Corpora"
  )