# Chunk 1: Load libraries (Hidden from report)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.1 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(stringi)
# Define file paths (the three corpus files must be in the working directory)
files <- c(
  blogs = "en_US.blogs.txt",
  news = "en_US.news.txt",
  twitter = "en_US.twitter.txt"
)

# Fail early with a readable message: otherwise file.size() silently yields NA
# and readLines() stops with a low-level "cannot open file" error.
if (!all(file.exists(files))) {
  stop(
    "Corpus file(s) not found: ",
    paste(files[!file.exists(files)], collapse = ", "),
    call. = FALSE
  )
}

# Basic Summary Table (Required for Task 1)
# One row per corpus file. vapply() pins the type of every column, so an
# unexpected per-file result raises an error instead of changing the column
# type behind our back.
summary_table <- data.frame(
  File = names(files),
  # Size on disk, converted from bytes to mebibytes
  Size_MB = vapply(files, function(x) file.size(x) / 1024^2, numeric(1)),
  # Total number of lines in the file (reads the whole file)
  Lines = vapply(
    files,
    function(x) length(readLines(x, warn = FALSE)),
    integer(1)
  ),
  # NOTE: word count of the FIRST 5,000 lines only -- a cheap estimate, not the
  # total for the file, so it is not directly comparable with `Lines`.
  Words = vapply(
    files,
    function(x) sum(stri_count_words(readLines(x, n = 5000, warn = FALSE))),
    integer(1)
  )
)
print(summary_table)
## File Size_MB Lines Words
## blogs blogs 200.4242 899288 206723
## news news 196.2775 1010242 173090
## twitter twitter 159.3641 2360148 63132
set.seed(123) # Fixed seed so the random sample is reproducible
sample_size <- 0.01 # 1% sample

#' Draw a random sample of lines from a text file
#'
#' Reads every line of `path` and returns a random subset of them, sampled
#' without replacement.
#'
#' @param path Path to the text file to sample from.
#' @param fraction Share of lines to keep, between 0 and 1. Defaults to the
#'   script-level `sample_size`, so existing `get_sample(path)` calls behave
#'   as before.
#' @return A character vector holding `floor(n_lines * fraction)` lines.
get_sample <- function(path, fraction = sample_size) {
  stopifnot(
    is.numeric(fraction),
    length(fraction) == 1,
    !is.na(fraction),
    fraction >= 0,
    fraction <= 1
  )

  # Binary mode: a text-mode read can end early at a stray control character
  # (e.g. Ctrl-Z on Windows). readLines() accepts LF, CRLF and CR line endings
  # whatever mode the connection is opened in.
  con <- file(path, "rb")
  # Release the connection even when readLines() or sample() fails.
  on.exit(close(con), add = TRUE)

  data <- readLines(con, warn = FALSE)
  # sample() truncates a fractional size; floor() makes that explicit.
  sample(data, floor(length(data) * fraction))
}
# Build the working corpus: sample each source in turn (blogs, news, twitter),
# pool the sampled lines into one character vector, then normalise the text.
raw_sample <- unlist(
  lapply(files[c("blogs", "news", "twitter")], get_sample),
  use.names = FALSE
)

# Lower-case every line, then strip every character that is not a letter,
# whitespace or an apostrophe (i.e. digits, punctuation, symbols).
clean_corpus <- tibble(
  text = str_replace_all(
    str_to_lower(raw_sample),
    "[^[:alpha:][:space:]']",
    ""
  )
)
# Generate bigrams (2-word pairs) and keep the 15 most frequent ones
bigrams <- clean_corpus %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # A line with fewer than two words may yield an NA bigram; drop those so a
  # missing value can never be counted and ranked as a "word pair".
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE) %>%
  # slice_max() names the ranking column explicitly (top_n() guessed it and
  # printed "Selecting by n"); with_ties = FALSE caps the result at 15 rows.
  slice_max(order_by = n, n = 15, with_ties = FALSE)
# Horizontal bar chart of the selected bigrams, ordered by frequency so the
# most common pair ends up at the top after the axes are flipped.
bigrams %>%
  ggplot(mapping = aes(y = n, x = reorder(bigram, n))) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(
    y = "Frequency",
    x = "Word Pairs",
    title = "Top 15 Most Common Bigrams"
  ) +
  coord_flip()
