library(R.utils)
library(stringi)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(ggplot2)
library(knitr)
In this document, we present an exploratory analysis of the following files: 1) en_US.blogs.txt, 2) en_US.news.txt, and 3) en_US.twitter.txt. We provide summary statistics for each file (e.g. line count, word count), describe the steps taken to clean the data, visualize word frequencies, and share some observations and thoughts on the final prediction task.
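The files are read from a local folder below; a minimal sketch of how they could be obtained is shown first (the download URL and local layout are assumptions, and the zip extracts into a final/en_US/ folder, so the files may need to be moved to match the paths used here).
# Download and unpack the capstone dataset if the files are not already present
# (URL and paths are assumptions; adjust to your own setup)
data_dir <- path.expand("~/datasciencecoursera")
if (!file.exists(file.path(data_dir, "en_US", "en_US.twitter.txt"))) {
  zip_file <- file.path(data_dir, "Coursera-SwiftKey.zip")
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = zip_file, mode = "wb")
  unzip(zip_file, exdir = data_dir)
}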
twitter <- readLines("~/datasciencecoursera/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("~/datasciencecoursera/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("~/datasciencecoursera/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
# Initialize a data frame to hold the summary statistics
df <- data.frame(matrix(ncol = 5, nrow = 3))
row.names(df) <- c("blogs","news","twitter")
colnames(df) <- c("size","lines","longest_line","words","avg_words")
# Get file sizes
df$size[1] <- file.info("~/datasciencecoursera/en_US/en_US.blogs.txt")$size / 1024 ^ 2
df$size[2] <- file.info("~/datasciencecoursera/en_US/en_US.news.txt")$size / 1024 ^ 2
df$size[3] <- file.info("~/datasciencecoursera/en_US/en_US.twitter.txt")$size / 1024 ^ 2
# Get word counts
df$words[1] <- sum(stri_count_words(blogs))
df$words[2] <- sum(stri_count_words(news))
df$words[3] <- sum(stri_count_words(twitter))
# Get average words per line
df$avg_words[1] <- mean(stri_count_words(blogs))
df$avg_words[2] <- mean(stri_count_words(news))
df$avg_words[3] <- mean(stri_count_words(twitter))
# Get Line Counts
df$lines[1] <- length(blogs)
df$lines[2] <- length(news)
df$lines[3] <- length(twitter)
# Get length of longest line
df$longest_line[1] <- max(nchar(blogs))
df$longest_line[2] <- max(nchar(news))
df$longest_line[3] <- max(nchar(twitter))
Information related to each of the three files is provided in the following table: the size of the file in MB, the number of lines, the number of characters in the longest line, the total number of words, and the average number of words per post/segment.
kable(df)
|         |     size |   lines | longest_line |   words | avg_words |
|---------|----------|---------|--------------|---------|-----------|
| blogs   | 200.4242 |  899288 |        40833 |  899288 |  41.75108 |
| news    | 196.2775 | 1010242 |        11384 | 1010242 |  34.40997 |
| twitter | 159.3641 | 2360148 |          140 | 2360148 |  12.75065 |
# First, we remove all non-ASCII characters
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")
#Then we take a 5% sample of each file
set.seed(123)
sampleData <- c(sample(blogs, length(blogs) * 0.05),
sample(news, length(news) * 0.05),
sample(twitter, length(twitter) * 0.05))
# Next, we create a corpus
corpus <- VCorpus(VectorSource(sampleData))
#Finally, we clean the data
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
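As a quick sanity check on the cleaning pipeline, an original sample line can be compared with its cleaned counterpart (the index 1 here is arbitrary):
# Compare a raw sample line with the corresponding cleaned document
sampleData[1]
writeLines(as.character(corpus[[1]]))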
In this section, we will find the most frequently occurring words in the data. Here we present the most common unigrams, bigrams, and trigrams.
getFreq <- function(x) {
freq <- sort(rowSums(as.matrix(x)), decreasing = TRUE)
return(data.frame(word = names(freq), freq = freq))
}
trigram <- function(x)
unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
bigram <- function(x)
unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
# Get frequencies of most common n-grams in data sample
my_unigram <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
my_bigram <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
my_trigram <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
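The resulting frequency tables can be inspected directly before plotting, for example:
# Preview the most frequent terms in each table
head(my_unigram, 10)
head(my_bigram, 10)
head(my_trigram, 10)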
# Plot a word cloud of the most frequent unigrams
palette <- brewer.pal(8, "Dark2")
wordcloud(my_unigram[, 1], my_unigram[,2], min.freq = 1,
random.order = F, ordered.colors = F, colors = palette)
text(x = 0.5, y = 0, "1-Gram cloud")
Below we visualize the 25 most frequent unigrams, bigrams, and trigrams.
#Histogram of N-Grams
barplot(my_unigram[1:25, 2],
cex.names = 0.6, names.arg = my_unigram[1:25, 1], col = "red",
main = "Histogram: 1-Grams", las = 2)
barplot(my_bigram[1:25, 2],
cex.names = 0.6, names.arg = my_bigram[1:25, 1], col = "blue",
main = "Histogram: 2-Grams", las = 2)
barplot(my_trigram[1:25, 2],
cex.names = 0.6, names.arg = my_trigram[1:25, 1], col = "green",
main = "Histogram: 3-Grams", las = 2)
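Since ggplot2 is already loaded, the same top-25 frequencies could also be drawn with it; a sketch for the unigram plot follows (the other n-grams would work the same way).
# Equivalent unigram plot with ggplot2 (sketch, top 25 terms only)
top_uni <- my_unigram[1:25, ]
ggplot(top_uni, aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "red") +
  labs(title = "Top 25 Unigrams", x = NULL, y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))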
Looking at the histogram of unigrams, we notice that the most common terms are short, one-syllable words; the top three are “will”, “just”, and “said”.
The histogram of bigrams shows that people talk a lot about New York and use many time-related phrases such as “right now”, “last year”, and “last night”.
Among the trigrams, “New York City” is again one of the most popular terms. We also notice that people frequently use greetings such as “Happy Mothers Day”, “Happy New Years”, and “Happy Valentines Day”.
To build the prediction model, I plan to use n-gram models (up to 5-grams) with a frequency look-up table combined with a back-off technique. I will also measure the memory and time required to predict the next word. If the resulting model is not fast enough, I will explore several ways to speed it up and to increase prediction accuracy. The user interface of the Shiny app has to be simple and functional: it will allow the user to input text and will return a list of the words most likely to follow.
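As a rough illustration of the idea, a minimal back-off look-up over the frequency tables built above might look like the sketch below; predict_next_word is a hypothetical helper, not the final model.
# Minimal sketch of a frequency look-up with back-off (predict_next_word is a
# hypothetical helper): try the last two words against the trigram table, then
# the last word against the bigram table, and finally fall back to the most
# frequent unigrams. The tables are already sorted by decreasing frequency.
predict_next_word <- function(input, n = 3) {
  tokens <- tolower(unlist(strsplit(input, "\\s+")))
  tokens <- tokens[tokens != ""]
  if (length(tokens) >= 2) {
    prefix <- paste(tail(tokens, 2), collapse = " ")
    hits <- my_trigram[startsWith(as.character(my_trigram$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0)
      return(head(sapply(strsplit(as.character(hits$word), " "), `[`, 3), n))
  }
  if (length(tokens) >= 1) {
    hits <- my_bigram[startsWith(as.character(my_bigram$word), paste0(tail(tokens, 1), " ")), ]
    if (nrow(hits) > 0)
      return(head(sapply(strsplit(as.character(hits$word), " "), `[`, 2), n))
  }
  head(as.character(my_unigram$word), n)
}
# Example call with an arbitrary phrase
predict_next_word("happy new")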