The goal of this capstone project is to build a predictive text model using a large text corpus. We’ll use Natural Language Processing (NLP) and statistical text mining techniques to analyze language patterns and construct n-gram models for text prediction.
library(tidyverse)
library(readr)
library(stringi)
# Directory that holds the en_US corpus files (update if necessary).
# The files are addressed through explicit paths rather than setwd(), so the
# session's working directory is left untouched.
data_dir <- "C:/Users/ADMIN/Documents/FPT_subject/2025-FALL/DSR/R lab/coursera_mock_10_week2/Coursera-SwiftKey/final/en_US"
corpus_files <- c(
  Blogs = file.path(data_dir, "en_US.blogs.txt"),
  News = file.path(data_dir, "en_US.news.txt"),
  Twitter = file.path(data_dir, "en_US.twitter.txt")
)

# Fail early, with a readable message, if any corpus file cannot be found
missing_files <- corpus_files[!file.exists(corpus_files)]
if (length(missing_files) > 0) {
  stop(
    "Corpus file(s) not found: ", paste(missing_files, collapse = ", "),
    call. = FALSE
  )
}

# Read text files (one element per non-empty line)
blogs <- read_lines(corpus_files[["Blogs"]], skip_empty_rows = TRUE)
news <- read_lines(corpus_files[["News"]], skip_empty_rows = TRUE)
twitter <- read_lines(corpus_files[["Twitter"]], skip_empty_rows = TRUE)

# Quick summary of files: size on disk, number of lines read, total word count
file_stats <- tibble(
  source = names(corpus_files),
  size_MB = unname(file.size(corpus_files)) / 1024^2,
  lines = c(length(blogs), length(news), length(twitter)),
  words = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news)),
    sum(stri_count_words(twitter))
  )
)
file_stats
## # A tibble: 3 × 4
## source size_MB lines words
## <chr> <dbl> <int> <int>
## 1 Blogs 200. 899288 37546806
## 2 News 196. 1010242 34762658
## 3 Twitter 159. 2360148 30096649
To keep computation manageable, we draw a random 1% sample of lines from each dataset and strip non-ASCII characters from the sampled text.
# Fixed seed so the same lines are drawn on every run
set.seed(2025)

# Drop every character that cannot be represented in ASCII.
# NOTE(review): the input encoding is declared as "latin1"; readr normally
# returns UTF-8 strings. Non-ASCII bytes are removed either way, but confirm
# that dropping them outright (rather than transliterating) is intended.
clean_text <- function(text) iconv(text, "latin1", "ASCII", sub = "")

# Draw a random fraction of `lines`, then clean only the lines that were kept.
# sample() selects by index, so sampling first returns exactly the same lines
# as cleaning the full vector first, while converting ~1% as much text.
# floor() makes explicit the truncation sample() applies to a fractional size.
sample_lines <- function(lines, fraction = 0.01) {
  n_keep <- floor(length(lines) * fraction)
  clean_text(sample(lines, n_keep))
}

# Sampling order (blogs, news, twitter) is kept so the seed yields the same draw
sample_data <- c(
  sample_lines(blogs),
  sample_lines(news),
  sample_lines(twitter)
)
length(sample_data)
## [1] 42695
We’ll use the quanteda package for faster corpus handling and cleaning.
library(quanteda)
library(quanteda.textplots)
library(quanteda.textstats)
# Wrap the sampled lines in a quanteda corpus (one document per line)
corpus_sample <- corpus(sample_data)

# Tokenise, dropping punctuation, symbols and numbers
tokens_clean <- tokens(
  corpus_sample,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
)
# Normalise case, then remove English stop words
tokens_clean <- tokens_tolower(tokens_clean)
tokens_clean <- tokens_remove(tokens_clean, stopwords("en"))

# Peek at the first five tokenised documents
tokens_clean[1:5]
## Tokens consisting of 5 documents.
## text1 :
## [1] "cookies" "actually" "pretty" "good" "reminded" "us"
## [7] "lot" "raspberry" "strippers" "glaze" "lemon" "peel"
## [ ... and 28 more ]
##
## text2 :
## [1] "want" "miss"
##
## text3 :
## [1] "stir" "tofu" "salt" "turmeric"
## [5] "keep" "stiring" "spices" "distributed"
## [9] "throughout.cover" "cook" "4-5" "min"
## [ ... and 2 more ]
##
## text4 :
## [1] "silence" "thing"
##
## text5 :
## [1] "steve" "mcmahon" "director" "london" "pain"
## [6] "consortium" "said" "group" "several" "others"
## [11] "europe" "now"
## [ ... and 7 more ]
# Document-feature matrix of the cleaned tokens and its 15 most frequent words
dfm_tokens <- dfm(tokens_clean)
top_words <- textstat_frequency(dfm_tokens, n = 15)

# Horizontal bar chart, bars ordered by frequency
ggplot(
  data = top_words,
  mapping = aes(
    x = reorder(feature, frequency),
    y = frequency,
    fill = frequency
  )
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(title = "Top 15 Most Frequent Words", x = "Word", y = "Frequency") +
  theme_minimal()
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.4.3
## Loading required package: RColorBrewer
# Word cloud of up to 100 features, coloured with the 8-colour "Dark2" palette
textplot_wordcloud(
  dfm_tokens,
  max_words = 100,
  color = RColorBrewer::brewer.pal(n = 8, name = "Dark2")
)
Let’s explore unigrams, bigrams, and trigrams to capture sequential word relationships.
# Combined 1- to 3-gram tokens and their document-feature matrix
tokens_ngrams_all <- tokens_ngrams(tokens_clean, n = 1:3)
dfm_ngrams <- dfm(tokens_ngrams_all)

# Frequency table of the `keep` most frequent n-grams of a single size,
# built from the cleaned tokens
top_ngrams <- function(size, keep = 10) {
  grams <- tokens_ngrams(tokens_clean, n = size)
  textstat_frequency(dfm(grams), n = keep)
}

# Get top features by n-gram level
top_1gram <- top_ngrams(1)
top_2gram <- top_ngrams(2)
top_3gram <- top_ngrams(3)
# Horizontal bar chart of n-gram frequencies.
#
# data:  a table with `feature` and `frequency` columns (as used by the
#        aesthetics below)
# title: plot title
# Returns the ggplot object.
plot_ngram <- function(data, title) {
  bar_mapping <- aes(
    x = reorder(feature, frequency),
    y = frequency,
    fill = frequency
  )
  axis_labels <- labs(title = title, x = "N-gram", y = "Frequency")

  ggplot(data = data, mapping = bar_mapping) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    axis_labels +
    theme_minimal()
}
# One chart per n-gram size
plot_ngram(data = top_1gram, title = "Top 10 Unigrams")
plot_ngram(data = top_2gram, title = "Top 10 Bigrams")
plot_ngram(data = top_3gram, title = "Top 10 Trigrams")
Summary: