This report summarizes the exploratory analysis of three English-language text datasets:
en_US.blogs.txt)en_US.news.txt)en_US.twitter.txt)library(data.table)
library(stringr)
library(ggplot2)
library(gridExtra)
# File names
blogs_file <- "en_US.blogs.txt"
news_file <- "en_US.news.txt"
twitter_file <- "en_US.twitter.txt"

# Number of lines drawn from each corpus. Each file is still read in full;
# sampling only keeps the downstream summaries, plots and counts small.
sample_size <- 10000
set.seed(123)

# Read every line of `path` and draw a random sample without replacement.
#
# path: path to a text file readable by readLines().
# size: requested number of lines; capped at the number of lines in the file,
#       because sample() raises an error when asked for more elements than
#       exist (replace = FALSE).
# Returns a character vector of at most `size` lines.
read_sample <- function(path, size) {
  corpus_lines <- readLines(path, warn = FALSE)
  sample(corpus_lines, min(size, length(corpus_lines)))
}

# The three draws happen in this order so the seeded random stream is consumed
# blogs -> news -> twitter.
blogs_sample <- read_sample(blogs_file, sample_size)
news_sample <- read_sample(news_file, sample_size)
twitter_sample <- read_sample(twitter_file, sample_size)
# Summarise a character vector of text lines.
#
# lines: character vector, one element per line of text.
# Returns a list with four entries:
#   num_lines          - number of elements in `lines`
#   avg_words_per_line - mean count of runs of non-whitespace characters
#   avg_chars_per_line - mean of nchar() over the lines
#   max_line_length    - largest nchar() value among the lines
get_summary <- function(lines) {
  words_per_line <- str_count(lines, "\\S+")
  chars_per_line <- nchar(lines)

  list(
    num_lines = length(lines),
    avg_words_per_line = mean(words_per_line),
    avg_chars_per_line = mean(chars_per_line),
    max_line_length = max(chars_per_line)
  )
}
# Per-corpus summary statistics
blogs_summary <- get_summary(blogs_sample)
news_summary <- get_summary(news_sample)
twitter_summary <- get_summary(twitter_sample)

# Assemble one row per corpus. The helper pulls a single statistic out of each
# summary, in blogs / news / twitter order, and is kept local to this step.
summary_table <- local({
  summaries <- list(blogs_summary, news_summary, twitter_summary)
  stat_column <- function(stat) {
    unlist(lapply(summaries, `[[`, stat), use.names = FALSE)
  }

  data.table(
    Dataset = c("Blogs", "News", "Twitter"),
    Lines = stat_column("num_lines"),
    Avg_Words = stat_column("avg_words_per_line"),
    Avg_Chars = stat_column("avg_chars_per_line"),
    Max_Line = stat_column("max_line_length")
  )
})
summary_table
## Dataset Lines Avg_Words Avg_Chars Max_Line
## <char> <int> <num> <num> <int>
## 1: Blogs 10000 41.4435 229.7303 5054
## 2: News 10000 34.1001 201.5949 1840
## 3: Twitter 10000 12.9029 68.7301 140
# Histogram of line lengths.
#
# lines: character vector of text lines.
# title: plot title.
# Returns a ggplot object showing how many lines fall into each
# 10-character-wide length bin.
plot_line_lengths <- function(lines, title) {
  length_df <- data.frame(line_lengths = nchar(lines))
  bars <- geom_histogram(binwidth = 10, fill = "skyblue", color = "black")
  axis_text <- labs(
    title = title,
    x = "Line length (characters)",
    y = "Frequency"
  )

  ggplot(length_df, aes(x = line_lengths)) +
    bars +
    theme_minimal() +
    axis_text
}
# One histogram per corpus, drawn side by side in a single row
p1 <- plot_line_lengths(lines = blogs_sample, title = "Blogs Line Lengths")
p2 <- plot_line_lengths(lines = news_sample, title = "News Line Lengths")
p3 <- plot_line_lengths(lines = twitter_sample, title = "Twitter Line Lengths")
grid.arrange(p1, p2, p3, ncol = 3)
# Tokenization
#
# Lower-case the lines and split them on runs of whitespace.
#
# lines: character vector of text lines.
# Returns a single character vector with the tokens of all lines, in order.
# Punctuation is left attached to the tokens.
tokenize <- function(lines) {
  pieces <- str_split(tolower(lines), "\\s+")
  # as.character() keeps the result a character vector even when `lines` is
  # empty (unlist() of an empty list is NULL).
  tokens <- as.character(unlist(pieces))
  # Splitting produces "" for empty lines and for leading/trailing whitespace;
  # those are not words and must not show up in frequency counts.
  tokens[nzchar(tokens)]
}
# Flat vector of lower-cased, whitespace-delimited tokens for each sample
blogs_tokens <- tokenize(lines = blogs_sample)
news_tokens <- tokenize(lines = news_sample)
twitter_tokens <- tokenize(lines = twitter_sample)
# Top 10 words function
#
# Count how often each token occurs and return the most frequent ones.
#
# tokens: character vector of tokens.
# n:      maximum number of rows to return (default 10).
# Returns a data.table with columns `word` and `N` (occurrence count), sorted
# by decreasing `N`, with at most `n` rows.
top_words <- function(tokens, n = 10) {
  counts <- data.table(word = tokens)[, .N, by = word]
  ranked <- counts[order(-N)]
  # head() instead of ranked[1:n]: indexing past the last row pads the result
  # with NA rows, and 1:0 would select row 1 rather than nothing.
  head(ranked, n)
}
# Most frequent tokens per corpus (top_words() returns 10 rows by default)
blogs_top10 <- top_words(tokens = blogs_tokens)
news_top10 <- top_words(tokens = news_tokens)
twitter_top10 <- top_words(tokens = twitter_tokens)

# Show the blogs table
blogs_top10
## word N
## <char> <int>
## 1: the 20378
## 2: and 12010
## 3: to 11756
## 4: a 9731
## 5: of 9520
## 6: i 8378
## 7: in 6499
## 8: that 4927
## 9: is 4449
## 10: for 4002
# Show the most frequent tokens of the news sample
news_top10
## word N
## <char> <int>
## 1: the 19137
## 2: to 8986
## 3: and 8756
## 4: a 8561
## 5: of 7754
## 6: in 6561
## 7: for 3440
## 8: that 3300
## 9: is 2864
## 10: on 2585
# Show the most frequent tokens of the Twitter sample
twitter_top10
## word N
## <char> <int>
## 1: the 4042
## 2: to 3257
## 3: i 3048
## 4: a 2544
## 5: you 2054
## 6: and 1802
## 7: in 1620
## 8: for 1587
## 9: of 1498
## 10: is 1444