# Load required packages
library(stringi)             # word counts
library(knitr)               # kable tables
library(dplyr)               # data manipulation and %>%
library(quanteda)            # corpus, tokens, dfm
library(quanteda.textstats)  # textstat_frequency()
library(ggplot2)             # plots

# Define file paths
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
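# Optional: if the raw files are not yet present, they could be fetched first.
# The URL below is the SwiftKey archive distributed with the course materials
# (an assumption; verify it, and the in-zip paths, before running).
if (!all(file.exists(files))) {
  zip_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(zip_url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip", files = paste0("final/en_US/", files), junkpaths = TRUE)
}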
# Load data with error handling
if (all(file.exists(files))) {
  blogs   <- readLines("en_US.blogs.txt", encoding = "UTF-8", warn = FALSE)
  news    <- readLines("en_US.news.txt", encoding = "UTF-8", warn = FALSE)
  twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
} else {
  stop("Data files not found. Please ensure files are in the working directory.")
}
# Function to calculate summary statistics for a single file
calc_stats <- function(text_lines, filename) {
  n_lines <- length(text_lines)
  n_words <- sum(stri_count_words(text_lines))
  n_chars <- sum(nchar(text_lines))
  size_mb <- round(as.numeric(object.size(text_lines)) / (1024^2), 1)  # drop the object_size class so kable prints a plain number
  return(data.frame(File = filename, Lines = n_lines, Words = n_words,
                    Chars = n_chars, Size_MB = size_mb))
}
summary_df <- rbind(
  calc_stats(blogs, "Blogs"),
  calc_stats(news, "News"),
  calc_stats(twitter, "Twitter")
)
kable(summary_df, caption = "Summary Statistics of Input Data", format.args = list(big.mark = ","))
| File    | Lines     | Words      | Chars       | Size_MB |
|---------|-----------|------------|-------------|---------|
| Blogs   | 899,288   | 37,546,806 | 206,824,505 | 255.4   |
| News    | 1,010,206 | 34,761,151 | 203,214,543 | 257.3   |
| Twitter | 2,360,148 | 30,096,690 | 162,096,241 | 319.0   |
#3 Data Sampling and Preprocessing
set.seed(123)
sample_pct <- 0.05  # sample 5% of each source

# Draw a random sample of lines from each source
sample_text <- c(
  sample(blogs, floor(length(blogs) * sample_pct)),
  sample(news, floor(length(news) * sample_pct)),
  sample(twitter, floor(length(twitter) * sample_pct))
)
# Free up memory immediately after sampling
rm(blogs, news, twitter); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2643732 141.2 8064720 430.8 6784801 362.4
## Vcells 12647931 96.5 91663959 699.4 114558574 874.1
# Clean and Tokenize
tokens_clean <- tokens(
  corpus(sample_text),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE
) %>%
  tokens_tolower()
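As a quick sanity check, the size of the cleaned sample can be inspected before building the n-gram tables (the exact counts depend on the random sample, so no output is shown here):

# Number of sampled documents and total token count after cleaning
ndoc(tokens_clean)
sum(ntoken(tokens_clean))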
#4 Building N-Gram Models
get_freq_table <- function(tokens, n) {
  if (n > 1) {
    toks <- tokens_ngrams(tokens, n = n, concatenator = " ")
  } else {
    toks <- tokens
  }
  dfm_obj <- dfm(toks)
  freq <- textstat_frequency(dfm_obj) %>%
    select(feature, frequency) %>%
    filter(frequency > 1)  # remove singletons to save memory
  if (n > 1) {
    # Split each n-gram into its context (first n-1 words) and last word for fast lookup
    freq$context <- sub(" [^ ]+$", "", freq$feature)
    freq$prediction <- sub("^.* ", "", freq$feature)
  }
  return(freq)
}
unigrams <- get_freq_table(tokens_clean, 1)
bigrams <- get_freq_table(tokens_clean, 2)
trigrams <- get_freq_table(tokens_clean, 3)
fourgrams <- get_freq_table(tokens_clean, 4)
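The prediction step below repeatedly filters these tables on the `context` column. For larger samples it can be worth converting them to keyed `data.table` objects so that context lookups are indexed rather than scanned; the snippet below is an optional optimization sketch (it assumes the `data.table` package is available) and is not required by the prediction function that follows.

library(data.table)
# Keyed tables make repeated context lookups much faster
bigrams_dt   <- as.data.table(bigrams);   setkey(bigrams_dt, context)
trigrams_dt  <- as.data.table(trigrams);  setkey(trigrams_dt, context)
fourgrams_dt <- as.data.table(fourgrams); setkey(fourgrams_dt, context)
# Example lookup: all observed continuations of the context "one of the"
fourgrams_dt["one of the", nomatch = 0L]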
#5 Prediction Logic: Stupid Backoff
predict_next_word <- function(input_text, n_results = 3) {
  # Standardize input cleaning to match the training tokenization
  input_tokens <- tokens(tolower(input_text), remove_punct = TRUE) %>%
    as.character()
  len <- length(input_tokens)

  # Step 1: 4-gram lookup
  if (len >= 3) {
    ctx <- paste(tail(input_tokens, 3), collapse = " ")
    res <- fourgrams %>% filter(context == ctx) %>% head(n_results)
    if (nrow(res) >= n_results) return(res$prediction)
  }

  # Step 2: 3-gram lookup (backoff)
  if (len >= 2) {
    ctx <- paste(tail(input_tokens, 2), collapse = " ")
    res <- trigrams %>% filter(context == ctx) %>% head(n_results)
    if (nrow(res) > 0) {
      preds <- res$prediction
      if (length(preds) < n_results) {
        # Recursive call on a shorter context for additional candidates
        preds <- unique(c(preds, predict_next_word(tail(input_tokens, 1), n_results)))
      }
      return(head(preds, n_results))  # head() avoids padding with NA when fewer candidates exist
    }
  }

  # Step 3: 2-gram lookup (backoff)
  if (len >= 1) {
    ctx <- tail(input_tokens, 1)
    res <- bigrams %>% filter(context == ctx) %>% head(n_results)
    if (nrow(res) > 0) return(head(res$prediction, n_results))
  }

  # Step 4: unigram default (most frequent words overall)
  return(head(unigrams$feature, n_results))
}
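The function above returns the most frequent continuations at the highest order that matches, without scoring candidates. In the original Stupid Backoff formulation, lower-order candidates are instead kept and given a relative-frequency score multiplied by 0.4 per backoff step. The sketch below illustrates that scoring on the tables built in section 4; `score_candidates` is a hypothetical helper, "one of the" is just an example context, and the relative frequencies are normalised over the observed (non-singleton) continuations, which only approximates the true context count.

# Sketch: Stupid Backoff-style scoring over one frequency table.
# lambda = 1 for the highest order, then 0.4, 0.4^2, ... for each backoff step.
score_candidates <- function(tbl, ctx, lambda = 1) {
  hits <- tbl %>% filter(context == ctx)
  if (nrow(hits) == 0) return(NULL)
  data.frame(prediction = hits$prediction,
             score = lambda * hits$frequency / sum(hits$frequency))
}

# Example: merge 4-gram and discounted 3-gram candidates for the context "one of the"
cand <- bind_rows(
  score_candidates(fourgrams, "one of the", lambda = 1),
  score_candidates(trigrams,  "of the",     lambda = 0.4)
)
cand %>%
  group_by(prediction) %>%
  summarise(score = max(score), .groups = "drop") %>%
  arrange(desc(score)) %>%
  head(3)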
#6 Testing and Visualizations
ggplot(head(bigrams, 15), aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Most Frequent Bigrams", x = "Bigram", y = "Frequency") +
  theme_minimal()
#7 Model Prediction Test
test_phrases <- c("How are", "I want to", "The end of", "See you")
results <- lapply(test_phrases, function(p) {
  preds <- predict_next_word(p, 3)
  data.frame(Input = p, Predictions = paste(preds, collapse = ", "))
})
kable(bind_rows(results))
| Input | Predictions |
|---|---|
| How are | you, u, ya |
| I want to | be, do, go |
| The end of | the, this, my |
| See you | there, in, at |