Introduction

The goal of this project is to analyze the HC Corpora dataset. This report provides an exploratory analysis of three data sources: Twitter, Blogs, and News. Due to the large size of the data, a random 1% sample of the lines from each source was used.

Loading and Sampling

Load the raw text files and extract a random 1% sample from each source.

# Relative paths to the three en_US text files (Twitter, blogs, news).
# All three live in the same directory, so it is spelled out only once.
corpus_dir   <- file.path("data", "final", "en_US")
path_twitter <- file.path(corpus_dir, "en_US.twitter.txt")
path_blogs   <- file.path(corpus_dir, "en_US.blogs.txt")
path_news    <- file.path(corpus_dir, "en_US.news.txt")

#' Read a text file and return a reproducible random sample of its lines.
#'
#' @param path Path to the text file to read.
#' @param p Fraction of lines to keep, between 0 and 1 (default 0.01, i.e. 1%).
#' @return Character vector holding `floor(n * p)` lines drawn without
#'   replacement, where `n` is the number of lines read from `path`.
sample_data <- function(path, p = 0.01) {
  stopifnot(is.numeric(p), length(p) == 1L, !is.na(p), p >= 0, p <= 1)

  # Open in binary mode: a text-mode read on Windows can stop early at an
  # embedded Ctrl-Z (0x1A) byte. readLines() accepts LF, CRLF and CR line
  # endings regardless of the connection mode.
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)

  lines <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
  n_lines <- length(lines)

  # Fixed seed keeps the sample reproducible between runs. Note that this
  # resets the global RNG state on every call.
  set.seed(123)
  keep <- sample.int(n_lines, floor(n_lines * p))

  lines[keep]
}

# Draw the default 1% sample from each source file. The results are then
# bound to individual names because the later chunks refer to them directly.
sampled_sources <- lapply(
  list(twitter = path_twitter, blogs = path_blogs, news = path_news),
  sample_data
)
twitter_s <- sampled_sources[["twitter"]]
blogs_s   <- sampled_sources[["blogs"]]
news_s    <- sampled_sources[["news"]]

Summary

# Per-source overview table: line counts extrapolated from the 1% samples
# (sample length x 100) and the total number of words in each sample.
# The list is deliberately unnamed so the data frame keeps default row names.
source_samples <- list(twitter_s, blogs_s, news_s)

summary_stats <- data.frame(
  Source = c("Twitter", "Blogs", "News"),
  Estimated_Lines = vapply(source_samples, length, integer(1)) * 100,
  Sample_Word_Count = vapply(
    source_samples,
    function(txt) sum(stri_count_words(txt)),
    integer(1)
  )
)
knitr::kable(summary_stats, caption = "Dataset Overview")
Dataset Overview
Source Estimated_Lines Sample_Word_Count
Twitter 2360100 303012
Blogs 899200 374837
News 1010200 347636

Cleaning and Tokenization

# Pool the three samples into a single corpus, one document per line.
all_data <- c(twitter_s, blogs_s, news_s)
corpus <- VCorpus(VectorSource(all_data))

# Cleaning steps, applied in this order: lower-case, strip punctuation,
# strip digits, collapse runs of whitespace.
cleaning_steps <- list(
  content_transformer(tolower),
  removePunctuation,
  removeNumbers,
  stripWhitespace
)
for (step in cleaning_steps) {
  corpus <- tm_map(corpus, step)
}

# Term-document matrix with default settings.
# NOTE(review): tm's default control appears to drop terms shorter than
# 3 characters, which would exclude words such as "to" or "of" from the
# frequency analysis; confirm this is intended.
tdm <- TermDocumentMatrix(corpus)

Exploratory Data Analysis

library(slam)

# Total number of occurrences of each term across all documents
# (row sums of the term-document matrix; row_sums() comes from slam).
word_frequencies <- row_sums(tdm)

# Keep the 15 most frequent terms. head() is used instead of [1:15] so that
# a matrix with fewer than 15 terms yields a shorter vector rather than one
# padded with NA entries.
top_words <- head(sort(word_frequencies, decreasing = TRUE), 15)

# Two-column data frame (term, count) in the shape ggplot2 expects.
df_freq <- data.frame(
  word = names(top_words),
  freq = as.numeric(top_words)
)

library(ggplot2)
# Horizontal bar chart of the most frequent terms, with bars ordered by
# frequency via reorder().
top_words_plot <- ggplot(
  data = df_freq,
  mapping = aes(x = reorder(word, freq), y = freq)
)
top_words_plot <- top_words_plot +
  geom_bar(fill = "steelblue", stat = "identity") +
  coord_flip() +
  labs(
    x = "Words",
    y = "Frequency",
    title = "Top 15 Most Frequent Words"
  ) +
  theme_minimal()

# Evaluate the plot object at top level so it is rendered.
top_words_plot