Synopsis
This milestone report details the progress of the Coursera Data Science Specialization Capstone Project, which involves exploring datasets in four languages: German, English (US), Finnish, and Russian, with a focus on the English (US) data. The datasets, sourced from Twitter, blogs, and news articles, are analyzed to develop a predictive text model. The aim is to create a model that can predict subsequent words based on given text inputs, which will eventually be implemented as a Shiny application to improve user interaction and functionality.
Data
I will use the “quanteda” package for its efficiency, support for multithreading, and default use of sparse matrices, all of which enhance performance and optimize memory usage.
# Attach the text-analysis toolkit. quanteda itself is attached first so that
# quanteda_options() is available when it is called below.
library(quanteda)
library(quanteda.dictionaries) # predefined dictionaries for use with quanteda
library(quanteda.textstats)
library(quanteda.textplots)
library(ggplot2)
library(stopwords)
library(spacyr)
library(readtext)
library(tm)

# Configure quanteda to use 10 threads for parallel processing.
quanteda_options(threads = 10)
Load the data size
# File sizes in megabytes (bytes / 1024^2), rounded to one decimal place.
size_blogs <- round(file.size("en_US.blogs.txt") / (1024^2), 1)
size_news <- round(file.size("en_US.news.txt") / (1024^2), 1)
size_twitter <- round(file.size("en_US.twitter.txt") / (1024^2), 1)

# Create a named vector with the sizes of each file
sizes <- setNames(
  c(size_blogs, size_news, size_twitter),
  c("Blogs", "News", "Twitter")
)
sizes
Create simple summaries
# Read each source file as a character vector with one element per line.
# skipNul = TRUE drops embedded NUL bytes instead of truncating the line.
blogs_lines <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news_lines <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter_lines <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

source_labels <- c("Blogs", "News", "Twitter")

# Calculate the number of lines in each dataset
line_counts <- setNames(
  c(length(blogs_lines), length(news_lines), length(twitter_lines)),
  source_labels
)
line_counts

# Characters per line, computed once per source and reused for both the
# totals and the averages below.
chars_blogs <- nchar(blogs_lines)
chars_news <- nchar(news_lines)
chars_twitter <- nchar(twitter_lines)

# Total number of characters in each dataset
total_chars <- setNames(
  c(sum(chars_blogs), sum(chars_news), sum(chars_twitter)),
  source_labels
)
total_chars

# Average number of characters per line, rounded to the nearest integer
avg_chars <- setNames(
  c(
    round(mean(chars_blogs), 0),
    round(mean(chars_news), 0),
    round(mean(chars_twitter), 0)
  ),
  source_labels
)
avg_chars
# Count whitespace-delimited words on every line of a character vector.
# Empty lines count as 0 because gregexpr() reports "no match" as -1.
# NOTE(review): the report never shows how word_counts_* were originally
# computed; whitespace-delimited counting is assumed here - confirm.
count_words_per_line <- function(lines) {
  matches <- gregexpr("\\S+", lines, useBytes = TRUE)
  vapply(matches, function(m) sum(m > 0L), integer(1))
}

# Per-line word counts for each source (read earlier with readLines()).
word_counts_blogs <- count_words_per_line(blogs_lines)
word_counts_news <- count_words_per_line(news_lines)
word_counts_twitter <- count_words_per_line(twitter_lines)

# Total number of words in each dataset
total_word_counts <- setNames(
  c(sum(word_counts_blogs), sum(word_counts_news), sum(word_counts_twitter)),
  c("Blogs", "News", "Twitter")
)
total_word_counts

# Average number of words per line, rounded to the nearest integer
avg_words <- setNames(
  c(
    round(mean(word_counts_blogs), 0),
    round(mean(word_counts_news), 0),
    round(mean(word_counts_twitter), 0)
  ),
  c("Blogs", "News", "Twitter")
)
avg_words
Sample the data
To keep the analysis tractable, 8000 lines are sampled from each set (approx. 10% of total). The summary below includes the number of unique tokens (types) per source.
# Fix the RNG seed so the same lines are drawn on every run.
set.seed(3333)

# Draw 8000 lines without replacement from each source. The full texts are
# the character vectors created earlier with readLines().
blogs_sample <- sample(blogs_lines, size = 8000, replace = FALSE)
news_sample <- sample(news_lines, size = 8000, replace = FALSE)
twitter_sample <- sample(twitter_lines, size = 8000, replace = FALSE)

# Store each sample as plain text, one line per row. writeLines() writes the
# text only; write.table() would also add a header, row numbers and quoting,
# all of which would end up in the corpus and inflate its counts.
dir.create("Sampling", showWarnings = FALSE)
writeLines(blogs_sample, file.path("Sampling", "blogs_sample.txt"))
writeLines(news_sample, file.path("Sampling", "news_sample.txt"))
writeLines(twitter_sample, file.path("Sampling", "twitter_sample.txt"))

# Generate a corpus with the sampled data. DirSource() takes a regular
# expression, so match the ".txt" extension explicitly rather than as a glob.
tm_corpus <- VCorpus(DirSource(directory = "Sampling", pattern = "\\.txt$"))
q_corpus <- corpus(tm_corpus)

# Label the three documents. This relies on the files being listed in
# alphabetical order (blogs, news, twitter) - TODO confirm on your platform.
docvars(q_corpus, field = "TextId") <- c("Blogs", "News", "Twitter")

summary(q_corpus)[, c("TextId", "Types", "Tokens", "Sentences", "language")]
TextId   Types  Tokens  Sentences  language
Blogs    41346  416396      18258  en
News     41021  361075      14824  en
Twitter  25204  165669       8918  en
Exploratory analysis
Analyze word frequencies by generating a list of top terms, excluding punctuation, numbers, and stop words (e.g., articles, conjunctions, prepositions, pronouns, common verbs).
# Tokenize the corpus, dropping punctuation, symbols, numbers, URLs and
# separators, then lower-case the tokens and remove English stop words.
word_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
word_tokens <- tokens_tolower(word_tokens)
processed_tokens <- tokens_remove(word_tokens, pattern = stopwords("english"))

# Create a document-feature matrix from the processed tokens
dfm_terms <- dfm(processed_tokens)

# topfeatures() returns a named numeric vector (names = terms, values =
# counts); turn it into explicit Words / Frequency columns for plotting.
term_counts <- topfeatures(dfm_terms, n = 20)
top_terms <- data.frame(
  Words = names(term_counts),
  Frequency = unname(term_counts)
)

# Order the factor levels by frequency so the bars are drawn sorted.
top_terms$Words <- factor(
  top_terms$Words,
  levels = top_terms$Words[order(top_terms$Frequency)]
)

plot_top_terms <- ggplot(data = top_terms, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 Most Frequent Words")
plot_top_terms
Figure 1 – The top 20 highest-frequency words
# Draw a word cloud of the terms in dfm_terms, passing a minimum count of 100
# and a 20-colour topo.colors() palette.
textplot_wordcloud(dfm_terms, min_count = 100, color = topo.colors(20))
Figure 2 – The word cloud from the document
Generate N-Grams
N-grams facilitate next-word prediction by identifying sequences of words that appear together in our texts. Stopwords are retained here because they are crucial for accurate prediction.
# Tokenize the corpus (stop words are kept), lower-case the tokens and join
# each pair of adjacent tokens with a space to form 2-grams.
bigram_source_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
bigram_source_tokens <- tokens_tolower(bigram_source_tokens)
bigrams <- tokens_ngrams(bigram_source_tokens, n = 2, concatenator = " ")

# Create a document-feature matrix for the bigrams
dfm_bigrams <- dfm(bigrams)

# topfeatures() returns a named numeric vector (names = n-grams, values =
# counts); turn it into explicit Words / Frequency columns for plotting.
bigram_counts <- topfeatures(dfm_bigrams, n = 20)
top_bigrams <- data.frame(
  Words = names(bigram_counts),
  Frequency = unname(bigram_counts)
)

# Order the factor levels by frequency so the bars are drawn sorted.
top_bigrams$Words <- factor(
  top_bigrams$Words,
  levels = top_bigrams$Words[order(top_bigrams$Frequency)]
)

plot_bigrams <- ggplot(data = top_bigrams, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 Most Frequent Bigrams")
plot_bigrams
Figure 3 – The top 20 most common 2-grams
# Tokenize the corpus (stop words are kept), lower-case the tokens and join
# each run of three adjacent tokens with a space to form 3-grams.
trigram_source_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
trigram_source_tokens <- tokens_tolower(trigram_source_tokens)
trigrams <- tokens_ngrams(trigram_source_tokens, n = 3, concatenator = " ")

# Create a document-feature matrix for the trigrams
dfm_trigrams <- dfm(trigrams)

# topfeatures() returns a named numeric vector (names = n-grams, values =
# counts); turn it into explicit Words / Frequency columns for plotting.
trigram_counts <- topfeatures(dfm_trigrams, n = 20)
top_trigrams <- data.frame(
  Words = names(trigram_counts),
  Frequency = unname(trigram_counts)
)

# Order the factor levels by frequency so the bars are drawn sorted.
top_trigrams$Words <- factor(
  top_trigrams$Words,
  levels = top_trigrams$Words[order(top_trigrams$Frequency)]
)

plot_trigrams <- ggplot(data = top_trigrams, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 Most Frequent Trigrams")
plot_trigrams
Figure 4 – The top 20 most common 3-grams
# Tokenize the corpus (stop words are kept), lower-case the tokens and join
# each run of four adjacent tokens with a space to form 4-grams.
four_gram_source_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
four_gram_source_tokens <- tokens_tolower(four_gram_source_tokens)
four_grams <- tokens_ngrams(four_gram_source_tokens, n = 4, concatenator = " ")

# Create a document-feature matrix for the four-grams
dfm_four_grams <- dfm(four_grams)

# topfeatures() returns a named numeric vector (names = n-grams, values =
# counts); turn it into explicit Words / Frequency columns for plotting.
four_gram_counts <- topfeatures(dfm_four_grams, n = 20)
top_four_grams <- data.frame(
  Words = names(four_gram_counts),
  Frequency = unname(four_gram_counts)
)

# Order the factor levels by frequency so the bars are drawn sorted.
top_four_grams$Words <- factor(
  top_four_grams$Words,
  levels = top_four_grams$Words[order(top_four_grams$Frequency)]
)

plot_four_grams <- ggplot(data = top_four_grams, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 Most Frequent Four-Grams")
plot_four_grams
Conclusion
Figure 5 – The top 20 most common 4-grams
For this milestone, I sampled and tokenized roughly 10% of the dataset, identified the most frequent words, and generated n-grams. The top 20 most frequent words and n-grams (2-grams, 3-grams, and 4-grams) were visualized. The next steps are to develop a prediction algorithm and to build a Shiny app around it.