This report contains preliminary analysis for the Capstone Project of the Data Science Specialization course sequence. The Capstone Project involves the development of a word prediction application based on provided datasets of blog posts, news articles, and Twitter posts. The intent of the application is to predict potential upcoming words based on the words currently being typed by the user.
This interim report provides an assessment of various aspects of the datasets, including file size, number of documents in each dataset, number of words per dataset, average number of words per document, most common words using various weighting schemes, and most common n-grams. The intent of this analysis is to help guide the design of the word prediction application, which must balance memory footprint, processing load, and responsiveness to user input.
The first step of the analysis is to read the datasets and capture basic information about each.
suppressMessages(suppressWarnings(library(dplyr, warn.conflicts = FALSE, quietly = TRUE)))
suppressMessages(suppressWarnings(library(tm, warn.conflicts = FALSE, quietly = TRUE)))
blogsFile <- "../data/en_US/en_US.blogs.txt"
newsFile <- "../data/en_US/en_US.news.txt"
twitterFile <- "../data/en_US/en_US.twitter.txt"
# Capture file sizes
blogsSize <- file.size(blogsFile) / 2^20
newsSize <- file.size(newsFile) / 2^20
twitterSize <- file.size(twitterFile) / 2^20
sampleStats <- tibble(
  source = c("blogs", "news", "twitter"),
  fileSize_MB = c(blogsSize, newsSize, twitterSize)
)
# For large data sets, it may be beneficial to sample from the data sets
set.seed(1234)
partitionPct <- 1.0
# Read files line-by-line. Each line contains a single document
blogs <- readLines(blogsFile, skipNul = TRUE)
blogsLength <- length(blogs)
blogsPartition <-
  tibble(text = sample(blogs, round(blogsLength * partitionPct, 0), replace = FALSE))
rm("blogs") # Clean up unneeded data
news <- readLines(newsFile, skipNul = TRUE)
newsLength <- length(news)
newsPartition <-
  tibble(text = sample(news, round(newsLength * partitionPct, 0), replace = FALSE))
rm("news") # Clean up unneeded data
twitter <- readLines(twitterFile, skipNul = TRUE)
twitterLength <- length(twitter)
twitterPartition <-
  tibble(text = sample(twitter, round(twitterLength * partitionPct, 0), replace = FALSE))
rm("twitter") # Clean up unneeded data
sampleStats <- bind_cols(sampleStats,
                         tibble(lines = c(blogsLength, newsLength, twitterLength)))
sampleDataSet <- bind_rows(mutate(blogsPartition, source = "blogs"),
                           mutate(newsPartition, source = "news"),
                           mutate(twitterPartition, source = "twitter"))
sampleDataSet$source <- as.factor(sampleDataSet$source)
# Clean up large data that is no longer needed
rm("blogsPartition", "newsPartition", "twitterPartition")
sampleStats
## # A tibble: 3 × 3
## source fileSize_MB lines
## <chr> <dbl> <int>
## 1 blogs 200. 899288
## 2 news 196. 1010242
## 3 twitter 159. 2360148
The table above summarizes the file size and the number of lines (one document per line) for each source.
The next step is to clean the data by removing numbers and punctuation. The text is also converted to lower case so that word counts are not split across capitalizations. The text is then tokenized, and counts of total words and total unique words are captured.
suppressMessages(suppressWarnings(library(tidytext, warn.conflicts = FALSE, quietly = TRUE)))
# Clean up words by removing numbers, punctuation, stop words, etc.
removeSymbols <- function(x) {
  x <- gsub("[^A-Za-z ]*", "", x)  # Remove anything except letters and spaces
  gsub(" +", " ", x)               # Replace multiple spaces with a single space
}
# gsub() and tolower() are vectorized, so they can be applied to the text column directly
sampleDataSet$text <- removeSymbols(sampleDataSet$text)
sampleDataSet$text <- tolower(sampleDataSet$text)
# Tokenize
sampleTokens <- sampleDataSet %>%
  unnest_tokens(word, text)
# Remove stop words
data("stop_words")
sampleTokens <- sampleTokens %>%
  anti_join(stop_words, by = "word")
rm("stop_words")
blogsTokenCount <- sampleTokens %>%
  filter(source == "blogs") %>%
  with(length(word))
newsTokenCount <- sampleTokens %>%
  filter(source == "news") %>%
  with(length(word))
twitterTokenCount <- sampleTokens %>%
  filter(source == "twitter") %>%
  with(length(word))
sampleStats <- bind_cols(sampleStats,
                         tibble(totalTokens = c(blogsTokenCount, newsTokenCount, twitterTokenCount)))
# Count the number of occurrences of each unique word for each source
sampleTokens <- sampleTokens %>%
  count(word, source, sort = TRUE)
blogsTokenCount <- sampleTokens %>%
  filter(source == "blogs") %>%
  with(length(word))
newsTokenCount <- sampleTokens %>%
  filter(source == "news") %>%
  with(length(word))
twitterTokenCount <- sampleTokens %>%
  filter(source == "twitter") %>%
  with(length(word))
sampleStats <- bind_cols(sampleStats,
                         tibble(uniqueTokens = c(blogsTokenCount, newsTokenCount, twitterTokenCount)))
sampleStats
## # A tibble: 3 × 5
## source fileSize_MB lines totalTokens uniqueTokens
## <chr> <dbl> <int> <int> <int>
## 1 blogs 200. 899288 14562338 319282
## 2 news 196. 1010242 16359379 283812
## 3 twitter 159. 2360148 12486499 369663
We can look at the most common words for each document type and the most common words overall as presented in the following graphs.
suppressMessages(suppressWarnings(library(ggplot2, warn.conflicts = FALSE, quietly = TRUE)))
suppressMessages(suppressWarnings(library(gridExtra, warn.conflicts = FALSE, quietly = TRUE)))
suppressMessages(suppressWarnings(library(tidyr, warn.conflicts = FALSE, quietly = TRUE)))
# Create a term-frequency matrix by document type
sampleTF <- sampleTokens %>%
  spread(source, n)
# Find the total for each term
sampleTF$total <- apply(sampleTF[ ,2:4], 1, "sum", na.rm = TRUE)
nTerms <- 20
p1 <- sampleTF %>%
  top_n(nTerms, blogs) %>%
  mutate(word = reorder(word, blogs)) %>%
  ggplot(aes(blogs, word)) +
  geom_col(fill = "blue") +
  labs(y = NULL)
p2 <- sampleTF %>%
  top_n(nTerms, news) %>%
  mutate(word = reorder(word, news)) %>%
  ggplot(aes(news, word)) +
  geom_col(fill = "red") +
  labs(y = NULL)
p3 <- sampleTF %>%
  top_n(nTerms, twitter) %>%
  mutate(word = reorder(word, twitter)) %>%
  ggplot(aes(twitter, word)) +
  geom_col(fill = "green") +
  labs(y = NULL)
p4 <- sampleTF %>%
  top_n(nTerms, total) %>%
  mutate(word = reorder(word, total)) %>%
  ggplot(aes(total, word)) +
  geom_col(fill = "purple") +
  labs(y = NULL)
grid.arrange(p1, p2, p3, p4, nrow = 2)
The graphs above and the word cloud below highlight the top words used across all documents.
suppressMessages(suppressWarnings(library(wordcloud, warn.conflicts = FALSE, quietly = TRUE)))
sampleTF %>%
  mutate(word = reorder(word, total)) %>%
  with(wordcloud(word, total, max.words = 50, colors = brewer.pal(8, "Spectral")))
In the section below, bigrams (2-word tokens) are similarly explored. For a prediction algorithm, the use of n-grams will be evaluated, with consideration of n equal to 2, 3, and perhaps 4. Such an approach allows the prediction algorithm to be based on n-gram probability distributions. For example, with a trigram, the probability of word i is based on the frequency of the trigram relative to other trigrams with the same two prior words. In other words, we can evaluate: \[ P(w_i \mid w_{i-1}, w_{i-2}) \] which is the probability of the next word being \(w_i\) given the two prior words designated as \(w_{i-1}\) and \(w_{i-2}\).
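As a rough illustration of how such conditional probabilities could be computed, the sketch below assumes a hypothetical trigramCounts tibble with one row per trigram, split into columns word1, word2, word3, plus a count column n (a sketch of how such counts could be built follows the next paragraph). This is a minimal sketch, not the final prediction algorithm.
# Minimal sketch: conditional trigram probabilities, assuming a hypothetical
# trigramCounts tibble with columns word1, word2, word3, n
trigramProbs <- trigramCounts %>%
  group_by(word1, word2) %>%      # group trigrams sharing the same two prior words
  mutate(prob = n / sum(n)) %>%   # probability of word3 given word1 and word2
  ungroup() %>%
  arrange(word1, word2, desc(prob))
# Example use: the most likely words following "happy new" would then be
# trigramProbs %>% filter(word1 == "happy", word2 == "new") %>% top_n(3, prob)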
The following table and graphs provide similar exploratory analysis for a sample of possible bigrams. This analysis can be extended to trigrams as the algorithm is refined, as sketched below.
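As a minimal sketch of that extension, the hypothetical trigramCounts table referenced above could be produced with the same tidytext call used for the bigrams below, setting n = 3 and assuming the same sampling and cleaning steps apply.
# Minimal sketch: trigram counts, mirroring the bigram tokenization below
trigramCounts <- sampleDataSet %>%
  sample_frac(size = 0.25) %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(!is.na(word1), !is.na(word2), !is.na(word3)) %>%  # drop documents too short to form a trigram
  count(word1, word2, word3, sort = TRUE)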
suppressMessages(suppressWarnings(library(doParallel, warn.conflicts = FALSE, quietly = TRUE)))
# Set up for parallel operations to speed up processing
cluster <- makeCluster(detectCores() - 1)
registerDoParallel(cluster)
# Tokenize a sample of the data
sampleTokens <- sampleDataSet %>%
  sample_frac(size = 0.25) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
data("stop_words")
sampleTokens <- sampleTokens %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!grepl("[[:digit:][:punct:]]+", word1)) %>%
  filter(!grepl("[[:digit:][:punct:]]+", word2)) %>%
  unite(bigram, c(word1, word2), sep = " ")
rm("stop_words")
# Count the number of occurrences of each unique bigram for each source
sampleTokens <- sampleTokens %>%
  count(bigram, source, sort = TRUE)
blogsTokenCount <- sampleTokens %>%
  filter(source == "blogs") %>%
  with(length(bigram))
newsTokenCount <- sampleTokens %>%
  filter(source == "news") %>%
  with(length(bigram))
twitterTokenCount <- sampleTokens %>%
  filter(source == "twitter") %>%
  with(length(bigram))
sampleStats <- bind_cols(sampleStats,
                         tibble(uniqueBigrams = c(blogsTokenCount, newsTokenCount, twitterTokenCount)))
sampleStats
## # A tibble: 3 × 6
## source fileSize_MB lines totalTokens uniqueTokens uniqueBigrams
## <chr> <dbl> <int> <int> <int> <int>
## 1 blogs 200. 899288 14562338 319282 766310
## 2 news 196. 1010242 16359379 283812 893039
## 3 twitter 159. 2360148 12486499 369663 649090
# Create a term-frequency matrix by document type
sampleTF <- sampleTokens %>%
  spread(source, n)
# Find the total for each term
sampleTF$total <- apply(sampleTF[ ,2:4], 1, "sum", na.rm = TRUE)
stopCluster(cluster)
nTerms <- 20
p1 <- sampleTF %>%
  top_n(nTerms, blogs) %>%
  mutate(bigram = reorder(bigram, blogs)) %>%
  ggplot(aes(blogs, bigram)) +
  geom_col(fill = "blue") +
  labs(y = NULL)
p2 <- sampleTF %>%
  top_n(nTerms, news) %>%
  mutate(bigram = reorder(bigram, news)) %>%
  ggplot(aes(news, bigram)) +
  geom_col(fill = "red") +
  labs(y = NULL)
p3 <- sampleTF %>%
  top_n(nTerms, twitter) %>%
  mutate(bigram = reorder(bigram, twitter)) %>%
  ggplot(aes(twitter, bigram)) +
  geom_col(fill = "green") +
  labs(y = NULL)
p4 <- sampleTF %>%
  top_n(nTerms, total) %>%
  mutate(bigram = reorder(bigram, total)) %>%
  ggplot(aes(total, bigram)) +
  geom_col(fill = "purple") +
  labs(y = NULL)
grid.arrange(p1, p2, p3, p4, nrow = 2)
sampleTF %>%
  mutate(bigram = reorder(bigram, total)) %>%
  with(wordcloud(bigram, total, max.words = 50, colors = brewer.pal(8, "Spectral")))
The general approach I plan to take for this prediction problem is to use n-gram frequencies to estimate the probabilities of upcoming words given the prior words. The proposed approach for further analysis and development of the algorithm is outlined as follows: