Synopsis

This milestone report details the progress of the Coursera Data Science Specialization Capstone Project, which involves exploring datasets in four languages: German, English (US), Finnish, and Russian, with a focus on the English (US) data. The datasets, sourced from Twitter, blogs, and news articles, are analyzed to develop a predictive text model. The aim is to create a model that can predict subsequent words based on given text inputs, which will eventually be implemented as a Shiny application to improve user interaction and functionality.

Data

I will use the “quanteda” package for its efficiency, support for multithreading, and default use of sparse matrices, all of which enhance performance and optimize memory usage.

library(quanteda.dictionaries) # Imports ‘quanteda.dictionaries’ for integrating predefined dictionaries with quanteda.

# Configure quanteda to use 10 threads for parallel processing.
# The call is namespace-qualified because library(quanteda) is only attached
# further down in this file.
quanteda::quanteda_options(threads = 10)

Imports ‘quanteda.textstats’ for calculating text statistics.

library(quanteda.textstats)

Imports ‘quanteda.textplots’ for visualizing text data.

library(quanteda.textplots)

Imports the ‘ggplot2’ package for creating graphical plots.

library(ggplot2)

Imports the ‘quanteda’ package for advanced text analysis.

library(quanteda)

Imports the ‘stopwords’ package to manage stopwords across different languages.

library(stopwords)

Imports the ‘spacyr’ package for text analysis with spaCy.

library(spacyr)

Imports the ‘readtext’ package for reading text documents.

library(readtext)

Imports the ‘tm’ package for text mining operations.

library(tm)

Load the data size

Determine the size of each dataset in megabytes (MB)

# Size of each raw data file in megabytes (bytes / 1024^2), rounded to one
# decimal place
size_blogs <- round(file.size("en_US.blogs.txt") / (1024^2), 1)
size_news <- round(file.size("en_US.news.txt") / (1024^2), 1)
size_twitter <- round(file.size("en_US.twitter.txt") / (1024^2), 1)

# Create a named vector with the sizes of each file
sizes <- setNames(
  c(size_blogs, size_news, size_twitter),
  c("Blogs", "News", "Twitter")
)

# Output the file sizes
sizes

##   Blogs    News Twitter
##   200.4   196.3   159.4

Create simple summaries

Read lines from the text files with UTF-8 encoding

# Read every line of each file; strings are marked as UTF-8 and embedded NUL
# bytes are skipped (skipNul = TRUE)
blogs_lines <- readLines(
  "en_US.blogs.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
news_lines <- readLines(
  "en_US.news.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
twitter_lines <- readLines(
  "en_US.twitter.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

# Calculate the number of lines in each dataset
line_counts <- setNames(
  c(length(blogs_lines), length(news_lines), length(twitter_lines)),
  c("Blogs", "News", "Twitter")
)

# Display the number of lines
line_counts

##   Blogs    News Twitter
##  899288 1010242 2360148

Calculate the total number of characters for each dataset

# Total number of characters per dataset: nchar() of every line, summed
total_chars <- setNames(
  c(
    sum(nchar(blogs_lines)),
    sum(nchar(news_lines)),
    sum(nchar(twitter_lines))
  ),
  c("Blogs", "News", "Twitter")
)

# Display the total character counts
total_chars

##     Blogs      News   Twitter
## 206824505 203223159 162096241

Compute the average number of characters per line for each dataset

# Mean number of characters per line for each dataset, rounded to the nearest
# whole number
avg_chars <- setNames(
  c(
    round(mean(nchar(blogs_lines)), 0),
    round(mean(nchar(news_lines)), 0),
    round(mean(nchar(twitter_lines)), 0)
  ),
  c("Blogs", "News", "Twitter")
)

# Display the average character counts
avg_chars

##   Blogs    News Twitter
##     230     201      69

# Count the number of words in each line of the datasets.
# A "word" here is any piece produced by splitting the line on a single space.
word_counts_blogs <- lengths(strsplit(blogs_lines, " "))
word_counts_news <- lengths(strsplit(news_lines, " "))
word_counts_twitter <- lengths(strsplit(twitter_lines, " "))

# Calculate the total number of words for each dataset
total_word_counts <- setNames(
  c(
    sum(word_counts_blogs),
    sum(word_counts_news),
    sum(word_counts_twitter)
  ),
  c("Blogs", "News", "Twitter")
)

# Display the total word counts
total_word_counts

##    Blogs     News  Twitter
## 37334131 34372530 30373583

Compute the average number of words per line for each dataset

# Mean number of words per line for each dataset, rounded to the nearest
# whole number
avg_words <- setNames(
  c(
    round(mean(word_counts_blogs), 0),
    round(mean(word_counts_news), 0),
    round(mean(word_counts_twitter), 0)
  ),
  c("Blogs", "News", "Twitter")
)

# Display the average word counts
avg_words

##   Blogs    News Twitter
##      42      34      13

Sample the data

We randomly sample 8,000 lines from each dataset for analysis (less than 1% of the lines in each file). The corpus summary below includes the number of unique tokens (types).

# Fix the RNG seed so the same lines are drawn on every run
set.seed(3333)

# Draw 8000 lines without replacement from each dataset. The line vectors
# read earlier in this file are named *_lines, so sample from those.
blogs_sample <- sample(blogs_lines, size = 8000, replace = FALSE)
news_sample <- sample(news_lines, size = 8000, replace = FALSE)
twitter_sample <- sample(twitter_lines, size = 8000, replace = FALSE)

# write.table() does not create missing directories, so make sure the
# output folder exists before writing the samples
if (!dir.exists("Sampling")) {
  dir.create("Sampling")
}
write.table(blogs_sample, file = "Sampling/blogs_sample.txt", sep = " ")
write.table(news_sample, file = "Sampling/news_sample.txt", sep = " ")
write.table(twitter_sample, file = "Sampling/twitter_sample.txt", sep = " ")

# Generate a corpus with the sampled data. DirSource() takes a regular
# expression, so match the ".txt" extension explicitly instead of a glob.
tm_corpus <- VCorpus(DirSource(directory = "Sampling", pattern = "\\.txt$"))
q_corpus <- corpus(tm_corpus)

# Label the documents by source.
# NOTE(review): assumes the files are listed alphabetically
# (blogs, news, twitter) - confirm.
docvars(q_corpus, field = "TextId") <- c("Blogs", "News", "Twitter")

summary(q_corpus)[, c("TextId", "Types", "Tokens", "Sentences", "language")]

##    TextId Types Tokens Sentences language
## 1   Blogs 41346 416396     18258       en
## 2    News 41021 361075     14824       en
## 3 Twitter 25204 165669      8918       en

Exploratory analysis

Analyze word frequencies by generating a list of top terms, excluding punctuation, numbers, and stop words (e.g., articles, conjunctions, prepositions, pronouns, common verbs).

Tokenize the corpus, convert to lowercase, and remove punctuation, symbols, numbers, URLs, and separators

# Tokenize the corpus, dropping punctuation, symbols, numbers, URLs and
# separators
processed_tokens <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Lowercase the tokens, then drop English stop words. Plain function calls
# are used instead of %>% so this step does not depend on a package that
# provides the pipe being attached.
processed_tokens <- tokens_tolower(processed_tokens)
processed_tokens <- tokens_remove(
  processed_tokens,
  pattern = stopwords("english")
)

# Create a document-feature matrix from the processed tokens
dfm_terms <- dfm(processed_tokens)

# Extract the top 20 most frequent terms
top_terms <- as.data.frame(topfeatures(dfm_terms, n = 20))

# Prepare the data for plotting: the terms arrive as row names, so copy them
# into a column and give both columns readable names
top_terms$Words <- rownames(top_terms)
colnames(top_terms) <- c("Frequency", "Words")

# Order the factor levels by frequency so the bars are drawn in sorted order
top_terms$Words <- factor(
  top_terms$Words,
  levels = top_terms$Words[order(top_terms$Frequency)]
)

# Create a bar plot for the top 20 most frequent words
plot_top_terms <- ggplot(data = top_terms, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 Most Frequent Words")

# Display the plot
plot_top_terms

Figure 1 – The top 20 highest-frequency words

Generate a word cloud from the document-feature matrix with a minimum count of 100

# Draw a word cloud of the document-feature matrix: only features with a
# count of at least 100 are shown, coloured from a 20-step topo.colors palette
textplot_wordcloud(dfm_terms, min_count = 100, color = topo.colors(20))

Figure 2 – The word cloud from the document

Generate N-Grams

N-grams support next-word prediction by identifying sequences of words that appear together in our texts. Stop words are retained here because they are crucial for accurate prediction.

Tokenize the corpus, convert to lowercase, and remove unwanted characters, then create bigrams

# Tokenize the corpus, dropping punctuation, symbols, numbers, URLs and
# separators (stop words are kept)
bigrams <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Lowercase the tokens, then join each run of 2 adjacent tokens with a space.
# Plain function calls are used instead of %>% so this step does not depend
# on a package that provides the pipe being attached.
bigrams <- tokens_tolower(bigrams)
bigrams <- tokens_ngrams(bigrams, n = 2, concatenator = " ")

# Create a document-feature matrix for the bigrams
dfm_bigrams <- dfm(bigrams)

# Extract the top 20 most frequent bigrams
top_bigrams <- as.data.frame(topfeatures(dfm_bigrams, n = 20))

# Prepare the data for plotting: the bigrams arrive as row names, so copy
# them into a column and give both columns readable names
top_bigrams$Words <- rownames(top_bigrams)
colnames(top_bigrams) <- c("Frequency", "Words")

# Order the factor levels by frequency so the bars are drawn in sorted order
top_bigrams$Words <- factor(
  top_bigrams$Words,
  levels = top_bigrams$Words[order(top_bigrams$Frequency)]
)

# Create a bar plot for the top 20 most frequent bigrams
plot_bigrams <- ggplot(data = top_bigrams, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 Most Frequent Bigrams")

# Display the plot
plot_bigrams

Figure 3 – The top 20 most common 2-grams

Tokenize the corpus, convert to lowercase, and remove unwanted characters, then create trigrams

# Tokenize the corpus, dropping punctuation, symbols, numbers, URLs and
# separators (stop words are kept)
trigrams <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Lowercase the tokens, then join each run of 3 adjacent tokens with a space.
# Plain function calls are used instead of %>% so this step does not depend
# on a package that provides the pipe being attached.
trigrams <- tokens_tolower(trigrams)
trigrams <- tokens_ngrams(trigrams, n = 3, concatenator = " ")

# Create a document-feature matrix for the trigrams
dfm_trigrams <- dfm(trigrams)

# Extract the top 20 most frequent trigrams
top_trigrams <- as.data.frame(topfeatures(dfm_trigrams, n = 20))

# Prepare the data for plotting: the trigrams arrive as row names, so copy
# them into a column and give both columns readable names
top_trigrams$Words <- rownames(top_trigrams)
colnames(top_trigrams) <- c("Frequency", "Words")

# Order the factor levels by frequency so the bars are drawn in sorted order
top_trigrams$Words <- factor(
  top_trigrams$Words,
  levels = top_trigrams$Words[order(top_trigrams$Frequency)]
)

# Create a bar plot for the top 20 most frequent trigrams
plot_trigrams <- ggplot(data = top_trigrams, aes(x = Frequency, y = Words)) +
  geom_col() +
  ggtitle("Top 20 Most Frequent Trigrams")

# Display the plot
plot_trigrams

Figure 4 – The top 20 most common 3-grams

Tokenize the corpus, convert to lowercase, and remove unwanted characters, then create four-grams

# Tokenize the corpus, dropping punctuation, symbols, numbers, URLs and
# separators (stop words are kept)
four_grams <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Lowercase the tokens, then join each run of 4 adjacent tokens with a space.
# Plain function calls are used instead of %>% so this step does not depend
# on a package that provides the pipe being attached.
four_grams <- tokens_tolower(four_grams)
four_grams <- tokens_ngrams(four_grams, n = 4, concatenator = " ")

# Create a document-feature matrix for the four-grams
dfm_four_grams <- dfm(four_grams)

# Extract the top 20 most frequent four-grams
top_four_grams <- as.data.frame(topfeatures(dfm_four_grams, n = 20))

# Prepare the data for plotting: the four-grams arrive as row names, so copy
# them into a column and give both columns readable names
top_four_grams$Words <- rownames(top_four_grams)
colnames(top_four_grams) <- c("Frequency", "Words")

# Order the factor levels by frequency so the bars are drawn in sorted order
top_four_grams$Words <- factor(
  top_four_grams$Words,
  levels = top_four_grams$Words[order(top_four_grams$Frequency)]
)

# Create a bar plot for the top 20 most frequent four-grams
plot_four_grams <- ggplot(
  data = top_four_grams,
  aes(x = Frequency, y = Words)
) +
  geom_col() +
  ggtitle("Top 20 Most Frequent Four-Grams")

# Display the plot
plot_four_grams

Conclusion

Figure 5 – The top 20 most common 4-grams

We sampled 8,000 lines from each source (less than 1% of each dataset) and tokenized them for analysis, focusing on identifying the top words and generating n-grams. We visualized the 20 most frequent words and n-grams (2-grams, 3-grams, and 4-grams). Next steps include developing a prediction algorithm and creating a Shiny app.