📘 1. Introduction

This report is part of the JHU/Coursera Data Science Capstone, where we build a predictive text model based on a large text corpus (blogs, news, and Twitter).

🎯 Project Goals:

  • Explore and summarize text data.
  • Build n-gram models.
  • Create a prediction algorithm.
  • Deploy it as a Shiny app.

📂 2. Loading the Dataset

# Packages used throughout the report
library(stringi); library(dplyr); library(tidyr); library(ggplot2)
library(knitr); library(tm); library(tidytext); library(tibble)

blogs_file <- "./final/en_US/en_US.blogs.txt"
news_file <- "./final/en_US/en_US.news.txt"
twitter_file <- "./final/en_US/en_US.twitter.txt"

# Read the raw files as UTF-8, skipping embedded nulls
blogs <- readLines(blogs_file, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_file, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)
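
Before working with the data, it can help to confirm how much is being loaded. The quick check below uses base R's file.info() to report each file's size in megabytes; it is an illustrative addition, not part of the original report.

# Approximate size of each raw file on disk, in MB (illustrative check)
raw_files <- c(blogs_file, news_file, twitter_file)
round(file.info(raw_files)$size / 1024^2, 1)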

📊 3. Basic Summary Statistics

data_summary <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(sum(stri_count_words(blogs)),
            sum(stri_count_words(news)),
            sum(stri_count_words(twitter))),
  Characters = c(sum(nchar(blogs)),
                 sum(nchar(news)),
                 sum(nchar(twitter)))
)

knitr::kable(data_summary, caption = "Basic statistics for the datasets")
| Source  |   Lines |    Words | Characters |
|:--------|--------:|---------:|-----------:|
| Blogs   |  899288 | 37546250 |  206824505 |
| News    | 1010242 | 34762395 |  203223159 |
| Twitter | 2360148 | 30093413 |  162096241 |

Table: Basic statistics for the datasets

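A per-line average derived from the same summary table makes the line-length comparison explicit; AvgCharsPerLine is an added column for illustration, not part of the original summary.

# Average characters per line for each source (derived, illustrative column)
data_summary$AvgCharsPerLine <- round(data_summary$Characters / data_summary$Lines, 1)
data_summary[, c("Source", "AvgCharsPerLine")]
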
data_long <- data_summary %>%
  pivot_longer(cols = c(Lines, Words, Characters),
               names_to = "Metric",
               values_to = "Count")

ggplot(data_long, aes(x = Source, y = Count, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Distribution of Lines, Words, and Characters by Source",
       x = "Text Source", y = "Count") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2")

📌 Takeaway: Blogs contain the most characters and the longest average lines (about 230 characters), while Twitter has the most lines but the shortest (about 69 characters on average), reflecting the platform's character limit.
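
Because line, word, and character counts differ by roughly two orders of magnitude, the line counts are hard to read on a linear axis. One option, sketched here rather than used above, is a log-scaled y-axis:

# Same comparison on a log10 y-axis so the smaller line counts stay visible
ggplot(data_long, aes(x = Source, y = Count, fill = Metric)) +
  geom_col(position = "dodge") +
  scale_y_log10() +
  labs(title = "Lines, Words, and Characters by Source (log scale)",
       x = "Text Source", y = "Count (log10)") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2")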

🔎 4. Data Sampling & Preprocessing

set.seed(2025)
sample_size <- 5000
text_sample <- c(sample(blogs, sample_size), sample(news, sample_size), sample(twitter, sample_size))

# Build a corpus from the sample and normalize the text
corpus <- VCorpus(VectorSource(text_sample))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)   # built-in tm transformation; no wrapper needed
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
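
Depending on the source text, a few extra cleaning passes may be worthwhile. The sketch below strips URLs and non-ASCII characters; removeURL is a helper defined here for illustration, not a tm built-in.

# Optional extra cleaning: strip URLs and non-ASCII characters (illustrative)
removeURL <- content_transformer(function(x) gsub("http\\S+|www\\.\\S+", " ", x))
corpus <- tm_map(corpus, removeURL)
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, "UTF-8", "ASCII", sub = " ")))
corpus <- tm_map(corpus, stripWhitespace)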

🧼 5. Profanity Filtering

profanity <- readLines("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt")
corpus <- tm_map(corpus, removeWords, profanity)
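
Fetching the word list over the network on every knit is fragile; a small sketch that caches it locally (badwords.txt is an illustrative file name) could look like this:

# Download the profanity list once and reuse the local copy on later knits
if (!file.exists("badwords.txt")) {
  download.file("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt",
                destfile = "badwords.txt", mode = "wb")
}
profanity <- readLines("badwords.txt", skipNul = TRUE)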

🧮 6. Word Frequency Analysis (Unigrams)

tdm <- TermDocumentMatrix(corpus)
tdm_m <- as.matrix(tdm)
word_freq <- sort(rowSums(tdm_m), decreasing = TRUE)
freq_df <- data.frame(word = names(word_freq), freq = word_freq)
top_words <- head(freq_df, 15)

ggplot(top_words, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 15 Frequent Words", x = "Words", y = "Frequency")

📊 7. N-gram Visualization

sample_df <- tibble(text = sapply(corpus, as.character))

get_ngrams <- function(data, n) {
  data %>%
    unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
    count(ngram, sort = TRUE)
}

bigram_df <- get_ngrams(sample_df, 2)
trigram_df <- get_ngrams(sample_df, 3)

top_trigrams_no_na <- trigram_df %>%
  filter(!is.na(ngram)) %>%
  slice_max(n, n = 15)

ggplot(top_trigrams_no_na, aes(x = reorder(ngram, n), y = n)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 15 Trigrams (NA Removed)", x = "Trigram", y = "Frequency")

🔮 8. Simple Next Word Prediction

predict_next_word <- function(input, ngram_df) {
  input <- tolower(input)
  # Keep only the last two words of the input as the trigram prefix
  input <- tail(strsplit(input, " ")[[1]], 2)
  match_str <- paste(input, collapse = " ")
  # Match trigrams whose first two words equal the prefix
  # (trailing space prevents partial-word matches such as "i lovely")
  filtered <- ngram_df[grepl(paste0("^", match_str, " "), ngram_df$ngram), ]
  head(filtered[order(-filtered$n), ], 3)
}

predict_next_word("i love", trigram_df)
## # A tibble: 0 × 2
## # ℹ 2 variables: ngram <chr>, n <int>
predict_next_word("thanks for", trigram_df)
## # A tibble: 0 × 2
## # ℹ 2 variables: ngram <chr>, n <int>
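
A natural refinement, anticipating the algorithm plan in Section 10, is to back off to the bigram table when the trigram lookup comes up empty. predict_with_backoff below is an illustrative sketch built on the existing predict_next_word() and bigram_df:

# If no trigram matches, retry the last word of the input against the bigrams
predict_with_backoff <- function(input, trigram_df, bigram_df) {
  res <- predict_next_word(input, trigram_df)
  if (nrow(res) == 0) {
    last_word <- tail(strsplit(tolower(input), " ")[[1]], 1)
    res <- bigram_df[grepl(paste0("^", last_word, " "), bigram_df$ngram), ]
    res <- head(res[order(-res$n), ], 3)
  }
  res
}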

📈 9. Line Length Distribution

line_lengths <- nchar(text_sample)

ggplot(data.frame(lengths = line_lengths), aes(x = lengths)) +
  geom_histogram(bins = 50, fill = "tomato", color = "white") +
  labs(title = "Distribution of Line Lengths", x = "Line Length (chars)", y = "Frequency")

🧠 10. Plan for Prediction Algorithm

💻 11. Plan for Shiny App

🔍 12. Key Findings