Data Science Capstone: Milestone Report

Introduction

The goal of this capstone project is to build a predictive text model using a large text corpus. We’ll use Natural Language Processing (NLP) and statistical text mining techniques to analyze language patterns and construct n-gram models for text prediction.

Load Data

library(tidyverse)
library(readr)
library(stringi)

# Directory holding the en_US corpus files (update if necessary).
# The path is kept in one variable instead of calling setwd(), so the
# session's working directory is left untouched.
data_dir <- "C:/Users/ADMIN/Documents/FPT_subject/2025-FALL/DSR/R lab/coursera_mock_10_week2/Coursera-SwiftKey/final/en_US"

# Full paths to the three corpus files, in the order blogs, news, twitter.
corpus_files <- file.path(
  data_dir,
  c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
)

# Fail early with a clear message if any file is missing.
stopifnot(file.exists(corpus_files))

# Read text files (one element per line, empty rows skipped)
blogs   <- read_lines(corpus_files[1], skip_empty_rows = TRUE)
news    <- read_lines(corpus_files[2], skip_empty_rows = TRUE)
twitter <- read_lines(corpus_files[3], skip_empty_rows = TRUE)

# Quick summary of files: size on disk (MB), line count and word count
file_stats <- tibble(
  source = c("Blogs", "News", "Twitter"),
  # file.size() is vectorised over paths and returns bytes
  size_MB = file.size(corpus_files) / 1024^2,
  lines = c(length(blogs), length(news), length(twitter)),
  words = c(sum(stri_count_words(blogs)),
            sum(stri_count_words(news)),
            sum(stri_count_words(twitter)))
)

file_stats

## # A tibble: 3 × 4
##   source  size_MB   lines    words
##   <chr>     <dbl>   <int>    <int>
## 1 Blogs      200.  899288 37546806
## 2 News       196. 1010242 34762658
## 3 Twitter    159. 2360148 30096649

Sampling

To keep computation manageable, we draw a 1% random sample of lines from each dataset and strip non-ASCII characters from the sampled text.

# Fix the RNG seed so the random sample drawn below is reproducible.
set.seed(2025)

# Convert a character vector to plain ASCII.
#
# The input is interpreted as latin1; any byte that has no ASCII
# representation is replaced by the empty string (i.e. dropped).
# Returns a character vector of the same length as `text`.
clean_text <- function(text) {
  iconv(x = text, from = "latin1", to = "ASCII", sub = "")
}

# Fraction of lines to keep from each source.
sample_fraction <- 0.01

# Draw a random subset of `lines` and clean only that subset.
#
# Sampling happens before cleaning so that clean_text() runs on the sampled
# lines rather than on the full corpus. The selected lines are the same
# either way, because the random draw depends only on the vector length and
# the sample size, and clean_text() works element by element.
# floor() makes explicit the truncation that sample() would otherwise apply
# to a fractional size.
draw_sample <- function(lines, fraction = sample_fraction) {
  n_keep <- floor(length(lines) * fraction)
  clean_text(sample(lines, n_keep))
}

# Sources are sampled in the order blogs, news, twitter.
sample_data <- c(
  draw_sample(blogs),
  draw_sample(news),
  draw_sample(twitter)
)

length(sample_data)

## [1] 42695

Text Preprocessing

We’ll use the quanteda package for faster corpus handling and cleaning.

library(quanteda)
library(quanteda.textplots)
library(quanteda.textstats)

# Wrap the sampled lines in a quanteda corpus (one document per line).
corpus_sample <- corpus(sample_data)

# Step 1: tokenise, discarding punctuation, symbols and numbers.
tokens_raw <- tokens(
  corpus_sample,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
)

# Step 2: lower-case every token.
tokens_lower <- tokens_tolower(tokens_raw)

# Step 3: drop English stop words.
tokens_clean <- tokens_remove(tokens_lower, stopwords("en"))

# Show the first five tokenised documents.
tokens_clean[seq_len(5)]

## Tokens consisting of 5 documents.
## text1 :
##  [1] "cookies"   "actually"  "pretty"    "good"      "reminded"  "us"       
##  [7] "lot"       "raspberry" "strippers" "glaze"     "lemon"     "peel"     
## [ ... and 28 more ]
## 
## text2 :
## [1] "want" "miss"
## 
## text3 :
##  [1] "stir"             "tofu"             "salt"             "turmeric"        
##  [5] "keep"             "stiring"          "spices"           "distributed"     
##  [9] "throughout.cover" "cook"             "4-5"              "min"             
## [ ... and 2 more ]
## 
## text4 :
## [1] "silence" "thing"  
## 
## text5 :
##  [1] "steve"      "mcmahon"    "director"   "london"     "pain"      
##  [6] "consortium" "said"       "group"      "several"    "others"    
## [11] "europe"     "now"       
## [ ... and 7 more ]

Exploratory Data Analysis (EDA)

Token Frequency

# Document-feature matrix of the cleaned tokens.
dfm_tokens <- dfm(tokens_clean)

# The 15 most frequent features.
top_words <- textstat_frequency(dfm_tokens, n = 15)

# Horizontal bar chart, bars ordered by frequency.
ggplot(
  data = top_words,
  mapping = aes(
    x = reorder(feature, frequency),
    y = frequency,
    fill = frequency
  )
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Top 15 Most Frequent Words",
    x = "Word",
    y = "Frequency"
  ) +
  theme_minimal()

Word Cloud

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.4.3

## Loading required package: RColorBrewer

# Word cloud of at most 100 words, coloured with the 8-colour "Dark2" palette.
dark2_palette <- RColorBrewer::brewer.pal(8, "Dark2")
textplot_wordcloud(
  dfm_tokens,
  max_words = 100,
  color = dark2_palette
)

Building N-Grams

Let’s explore unigrams, bigrams, and trigrams to capture sequential word relationships.

# Combined 1- to 3-gram tokens and their document-feature matrix.
tokens_ngrams_all <- tokens_clean %>%
  tokens_ngrams(n = 1:3)
dfm_ngrams <- tokens_ngrams_all %>%
  dfm()

# Ten most frequent features at each n-gram level, each computed from its
# own n-gram tokens.
top_1gram <- tokens_clean %>%
  tokens_ngrams(n = 1) %>%
  dfm() %>%
  textstat_frequency(n = 10)

top_2gram <- tokens_clean %>%
  tokens_ngrams(n = 2) %>%
  dfm() %>%
  textstat_frequency(n = 10)

top_3gram <- tokens_clean %>%
  tokens_ngrams(n = 3) %>%
  dfm() %>%
  textstat_frequency(n = 10)

Visualize Top N-Grams

# Draw a horizontal bar chart of n-gram frequencies.
#
# data:  a data frame with `feature` and `frequency` columns
#        (as returned by textstat_frequency()).
# title: plot title.
# Returns the ggplot object; bars are ordered and filled by frequency.
plot_ngram <- function(data, title) {
  bar_mapping <- aes(
    x = reorder(feature, frequency),
    y = frequency,
    fill = frequency
  )
  bars <- ggplot(data, bar_mapping) +
    geom_col(show.legend = FALSE)

  bars +
    coord_flip() +
    labs(title = title, x = "N-gram", y = "Frequency") +
    theme_minimal()
}

# One chart per n-gram level: unigrams, bigrams, then trigrams.
plot_ngram(data = top_1gram, title = "Top 10 Unigrams")

plot_ngram(data = top_2gram, title = "Top 10 Bigrams")

plot_ngram(data = top_3gram, title = "Top 10 Trigrams")

Summary & Next Steps

Summary:

Text data from blogs, news, and Twitter were successfully loaded, and a 1% random sample of each source was cleaned of non-ASCII characters.
After stop-word removal, the most common unigrams, bigrams, and trigrams were identified.
The quanteda package enabled efficient tokenization and exploration.