# Compute basic per-file statistics and render them as a table.
# One row per corpus: number of lines, total characters, total words.
stats <- local({
  corpora <- list(Blogs = blogs, News = news, Twitter = twitter)

  # Sum a per-line integer measure (e.g. nchar) over every line of each corpus.
  total_of <- function(measure) {
    vapply(
      corpora,
      function(txt) sum(measure(txt)),
      integer(1),
      USE.NAMES = FALSE
    )
  }

  data.frame(
    File = names(corpora),
    Lines = lengths(corpora, use.names = FALSE),
    Chars = total_of(nchar),
    Words = total_of(stri_count_words)
  )
})
knitr::kable(stats, caption = "各文件基本统计量")
Table: 各文件基本统计量

|File    |   Lines|     Chars|    Words|
|:-------|-------:|---------:|--------:|
|Blogs   |  899288| 206824505| 37546806|
|News    | 1010242| 203223159| 34762658|
|Twitter | 2360148| 162096031| 30096649|
# Sampling: draw a fixed fraction of lines, without replacement, from each
# element of data_list and stack the draws into one tibble.
sample_frac <- 0.01
set.seed(123)
sample_lines <- data_list %>%
  map_dfr(
    function(lines) {
      line_count <- length(lines)
      keep <- sample(line_count, size = floor(line_count * sample_frac))
      tibble(text = lines[keep])
    },
    # Identifies which data_list element each sampled row came from.
    .id = "source"
  ) %>%
  # Line length in characters, used by the plots that follow.
  mutate(nchar = nchar(text))
# Histogram: overlaid, semi-transparent distributions of line length
# (in characters), one colour per source.
ggplot(data = sample_lines, mapping = aes(x = nchar, fill = source)) +
  geom_histogram(position = "identity", alpha = 0.6, binwidth = 20) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    x = "每行字符数",
    y = "频数",
    title = "不同来源文本行长度分布对比"
  ) +
  theme_minimal()

# Word-frequency analysis on the sampled lines: find the 20 most frequent
# words and show them as a horizontal bar chart.

# Nothing in this block draws random numbers; the seed call is kept only so
# the RNG state seen by any later code stays the same as before.
set.seed(456)

# Tokenise on word boundaries rather than on whitespace. A plain "\\s+" split
# leaves punctuation attached, so "the", "the," and "the." would be counted as
# three different words. stri_extract_all_words() follows the same word
# definition as stri_count_words(), which produces the Words column of the
# statistics table, so the two analyses agree on what a word is.
words <- sample_lines$text %>%
  str_to_lower() %>%
  stri_extract_all_words(omit_no_match = TRUE) %>%
  unlist()

# Drop anything that is not a real token (missing input lines yield NA).
words <- words[!is.na(words) & words != ""]

# Count, order by descending frequency and keep the top 20.
word_freq <- table(words) %>%
  sort(decreasing = TRUE) %>%
  head(20) %>%
  as.data.frame()
names(word_freq) <- c("word", "freq")

# fct_reorder() orders the bars by frequency; coord_flip() makes them
# horizontal so the word labels stay readable.
ggplot(word_freq, aes(x = fct_reorder(word, freq), y = freq)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "出现频率最高的20个单词(采样数据)",
    x = "单词", y = "频次"
  ) +
  theme_minimal()
