# Per-corpus totals (lines, characters, words), rendered as a table.
# The three corpora are gathered in one named list so each statistic is
# computed by a single vapply() call instead of three repeated expressions.
corpora <- list(Blogs = blogs, News = news, Twitter = twitter)
stats <- data.frame(
  File = names(corpora),
  # USE.NAMES = FALSE keeps the default integer row names on the data frame
  Lines = vapply(corpora, length, integer(1), USE.NAMES = FALSE),
  Chars = vapply(
    corpora,
    function(txt) sum(nchar(txt)),
    integer(1),
    USE.NAMES = FALSE
  ),
  Words = vapply(
    corpora,
    function(txt) sum(stri_count_words(txt)),
    integer(1),
    USE.NAMES = FALSE
  )
)
knitr::kable(stats, caption = "各文件基本统计量")
#> Table: 各文件基本统计量
#>
#> |File    |   Lines|     Chars|    Words|
#> |:-------|-------:|---------:|--------:|
#> |Blogs   |  899288| 206824505| 37546806|
#> |News    | 1010242| 203223159| 34762658|
#> |Twitter | 2360148| 162096031| 30096649|
# Sampling: draw a reproducible random subset of lines from every source
sample_frac <- 0.01
set.seed(123)
sample_lines <- data_list %>%
  map(function(lines) {
    total <- length(lines)
    # Pick floor(total * sample_frac) distinct line positions
    keep <- sample(total, size = floor(total * sample_frac))
    tibble(text = lines[keep])
  }) %>%
  # Stack the per-source tibbles; the element names/positions of data_list
  # end up in the "source" column
  bind_rows(.id = "source")
# Character length of each sampled line
sample_lines <- mutate(sample_lines, nchar = nchar(text))

# Histogram: overlaid line-length distributions, one fill colour per source
sample_lines %>%
  ggplot(aes(x = nchar, fill = source)) +
  # position = "identity" overlays the sources instead of stacking them;
  # the partial transparency keeps overlapping bars visible
  geom_histogram(position = "identity", alpha = 0.6, binwidth = 20) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  labs(
    title = "不同来源文本行长度分布对比",
    x = "每行字符数",
    y = "频数"
  )

# Word-frequency analysis on the sampled lines
# NOTE(review): nothing in this section draws random numbers; the seed is kept
# so the RNG state seen by any later code stays the same -- confirm whether it
# is still needed.
set.seed(456)
# Tokenise on ICU word boundaries, the same rule stri_count_words() applies
# for the summary table. Splitting on whitespace left punctuation glued to the
# tokens, so "the", "the," and "(the" were tallied as different words.
words <- sample_lines$text %>%
  str_to_lower() %>%
  stri_extract_all_words(omit_no_match = TRUE) %>%
  unlist()
# Drop missing tokens (from NA lines) and any empty strings
words <- words[!is.na(words) & words != ""]
# Tabulate, order by descending count and keep the 20 most frequent tokens
word_freq <- table(words) %>%
  sort(decreasing = TRUE) %>%
  head(20) %>%
  as.data.frame()
names(word_freq) <- c("word", "freq")

# Horizontal bar chart of the most frequent words, longest bar at the top
word_freq %>%
  # Order the factor levels by frequency so the bars are sorted
  mutate(word = fct_reorder(word, freq)) %>%
  ggplot(aes(x = word, y = freq)) +
  geom_col(fill = "steelblue") +
  # Flip the axes so the word labels are laid out horizontally
  coord_flip() +
  theme_minimal() +
  labs(
    title = "出现频率最高的20个单词(采样数据)",
    x = "单词",
    y = "频次"
  )