The goal of this project is to analyze the HC Corpora dataset. This report presents an exploratory analysis of its three English-language sources: Twitter, Blogs, and News. Because the raw files are large, a random 1% sample of each source was used.
Load the raw text files and extract a random 1% sample from each source.
path_twitter <- "data/final/en_US/en_US.twitter.txt"
path_blogs <- "data/final/en_US/en_US.blogs.txt"
path_news <- "data/final/en_US/en_US.news.txt"
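Before sampling, it is worth confirming just how large the raw files actually are. A quick check using base R's file.info() (a small sketch added here for context; sizes are reported in megabytes):

# On-disk size of each source file, in megabytes
paths <- c(Twitter = path_twitter, Blogs = path_blogs, News = path_news)
round(setNames(file.info(paths)$size, names(paths)) / 1024^2, 1)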
# Read a text file and return a random p-fraction sample of its lines
sample_data <- function(path, p = 0.01) {
  con <- file(path, open = "r")
  data <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  set.seed(123)  # fixed seed so the sample is reproducible
  sample_idx <- sample(length(data), floor(length(data) * p))
  sampled <- data[sample_idx]
  rm(data)  # release the full file before returning
  gc()
  sampled
}
twitter_s <- sample_data(path_twitter)
blogs_s <- sample_data(path_blogs)
news_s <- sample_data(path_news)
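Note that sample_data() still reads each entire file into memory before sampling. If memory were a constraint, an alternative (a sketch only; sample_stream is a hypothetical helper, not what this report used) is to stream each file in chunks and keep every line with probability p:

# Hypothetical streaming sampler: never holds the whole file in memory
sample_stream <- function(path, p = 0.01, chunk = 10000) {
  con <- file(path, open = "r")
  on.exit(close(con))
  kept <- character(0)
  repeat {
    lines <- readLines(con, n = chunk, warn = FALSE, skipNul = TRUE)
    if (length(lines) == 0) break
    keep <- rbinom(length(lines), 1, p) == 1  # independent coin flip per line
    kept <- c(kept, lines[keep])
  }
  kept
}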
library(stringi)

# Total lines are estimated by scaling the 1% sample back up by a factor of 100
summary_stats <- data.frame(
  Source = c("Twitter", "Blogs", "News"),
  Estimated_Lines = c(length(twitter_s), length(blogs_s), length(news_s)) * 100,
  Sample_Word_Count = c(sum(stri_count_words(twitter_s)),
                        sum(stri_count_words(blogs_s)),
                        sum(stri_count_words(news_s)))
)
knitr::kable(summary_stats, caption = "Dataset Overview")
| Source  | Estimated_Lines | Sample_Word_Count |
|---------|-----------------|-------------------|
| Twitter | 2360100         | 303012            |
| Blogs   | 899200          | 374837            |
| News    | 1010200         | 347636            |
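Another quick characteristic worth noting is line length. Tweets were historically capped at 140 characters, so Twitter should stand out from the other two sources (a small sketch over the samples, not part of the table above):

# Length, in characters, of the longest line in each sampled source
sapply(list(Twitter = twitter_s, Blogs = blogs_s, News = news_s),
       function(x) max(nchar(x)))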
library(tm)

# Combine the three samples into one corpus and normalize the text
all_data <- c(twitter_s, blogs_s, news_s)
corpus <- VCorpus(VectorSource(all_data))
corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase all text
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

tdm <- TermDocumentMatrix(corpus)
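The term-document matrix is large but extremely sparse, so converting it to a dense matrix with as.matrix() would exhaust memory at this scale. If only common terms are of interest, one option is tm's removeSparseTerms() (a sketch; tdm_common is a hypothetical name and the 0.999 threshold is an arbitrary illustration):

# Drop terms absent from more than 99.9% of documents (illustrative threshold)
tdm_common <- removeSparseTerms(tdm, 0.999)
dim(tdm_common)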
library(slam)

# row_sums() operates on the sparse matrix directly, avoiding as.matrix()
word_frequencies <- row_sums(tdm)
top_words <- sort(word_frequencies, decreasing = TRUE)[1:15]

df_freq <- data.frame(
  word = names(top_words),
  freq = as.numeric(top_words)
)
library(ggplot2)

ggplot(df_freq, aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "steelblue") +  # geom_col() is geom_bar(stat = "identity")
  coord_flip() +                  # horizontal bars keep word labels readable
  labs(title = "Top 15 Most Frequent Words",
       x = "Words",
       y = "Frequency") +
  theme_minimal()
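A natural follow-up for a predictive text model is how many unique words are needed to cover a given share of all word instances in the sample. A minimal sketch using the word_frequencies computed above (coverage() is a hypothetical helper added for illustration):

# Smallest number of unique words covering a given share of word instances
coverage <- function(freqs, share) {
  sorted <- sort(freqs, decreasing = TRUE)
  which(cumsum(sorted) / sum(sorted) >= share)[1]
}
coverage(word_frequencies, 0.5)  # words needed for 50% coverage
coverage(word_frequencies, 0.9)  # words needed for 90% coverage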