Data Science Capstone: Milestone Report

# Chunk 1: Load libraries (Hidden from report)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.3

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.1     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidytext)
library(stringi)

# Locations of the three English corpus files, keyed by source name.
# The paths are relative, so the files must sit in the working directory.
files <- setNames(
  c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  c("blogs", "news", "twitter")
)

# Basic Summary Table (Required for Task 1)

# Collect the summary statistics for one corpus file, reading it only once.
#
# path:        path to a text file.
# words_lines: number of leading lines used for the word count.
#
# Returns a list with:
#   Size_MB - file size on disk in mebibytes,
#   Lines   - total number of lines in the file,
#   Words   - number of words in the first `words_lines` lines ONLY. This is
#             a partial count that keeps the report fast; it is not the
#             word total for the whole file.
summarise_file <- function(path, words_lines = 5000) {
  lines <- readLines(path, warn = FALSE)
  list(
    Size_MB = file.size(path) / 1024^2,
    Lines = length(lines),
    Words = sum(stri_count_words(head(lines, words_lines)))
  )
}

file_stats <- lapply(files, summarise_file)

# vapply() pins the type of every column, and the names carried over from
# `files` become the row names of the table.
summary_table <- data.frame(
  File = names(files),
  Size_MB = vapply(file_stats, `[[`, numeric(1), "Size_MB"),
  Lines = vapply(file_stats, `[[`, integer(1), "Lines"),
  Words = vapply(file_stats, `[[`, integer(1), "Words")
)
print(summary_table)

##            File  Size_MB   Lines  Words
## blogs     blogs 200.4242  899288 206723
## news       news 196.2775 1010242 173090
## twitter twitter 159.3641 2360148  63132

# Fix the RNG seed so the sampled corpus is the same on every run.
set.seed(123)
sample_size <- 0.01 # 1% sample

# Draw a random subset of the lines of a text file, without replacement.
#
# path:     path to the text file to sample from.
# fraction: share of lines to keep, a single number in [0, 1]. Defaults to
#           the global `sample_size`, so existing one-argument calls behave
#           exactly as before.
#
# Returns a character vector holding floor(number_of_lines * fraction) lines
# in random order. Stops with an error if `fraction` is not a single number
# in [0, 1].
get_sample <- function(path, fraction = sample_size) {
  stopifnot(
    is.numeric(fraction),
    length(fraction) == 1L,
    !is.na(fraction),
    fraction >= 0,
    fraction <= 1
  )

  con <- file(path, "r")
  # Close the connection even when readLines() fails part-way through.
  on.exit(close(con), add = TRUE)

  lines <- readLines(con, warn = FALSE)
  # sample() truncates a fractional size; floor() makes that explicit.
  sample(lines, floor(length(lines) * fraction))
}

# Sample each source in turn (blogs, then news, then twitter) and pool the
# sampled lines into a single character vector.
raw_sample <- unlist(
  lapply(files[c("blogs", "news", "twitter")], get_sample),
  use.names = FALSE
)

# Normalise the pooled text: lower-case it, then delete every character that
# is not a letter, whitespace, or an apostrophe (digits, punctuation, symbols).
clean_corpus <- tibble(text = raw_sample) %>%
  mutate(
    text = text %>%
      str_to_lower() %>%
      str_replace_all("[^[:alpha:][:space:]']", "")
  )

# Generate bigrams (2-word pairs) and keep the 15 most frequent ones.
bigrams <- clean_corpus %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # unnest_tokens() can return NA for lines with fewer than two words; drop
  # those rows so a missing value is never counted as a word pair.
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n(): the ranking column is named
  # explicitly (so no "Selecting by n" message) and ties at the cut-off are
  # still kept, as they were with top_n().
  slice_max(n, n = 15)

# Horizontal bar chart of the most frequent bigrams. Bars are ordered by
# count, and the axes are flipped so the word pairs read along the side.
ggplot(data = bigrams) +
  geom_col(
    mapping = aes(x = reorder(bigram, n), y = n),
    fill = "steelblue"
  ) +
  coord_flip() +
  ggtitle("Top 15 Most Common Bigrams") +
  xlab("Word Pairs") +
  ylab("Frequency") +
  theme_minimal()

Data Science Capstone: Milestone Report

Chaithra

2026-03-22