library(jiebaR)
## Loading required package: jiebaRD
library(quanteda)
## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Package version: 3.2.1
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(readtext)
library(quanteda.textstats)
library(wordcloud2)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.7 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(tidyverse)
library(jiebaR)
library(dplyr)
library(tidytext)
library(stringr)
library(topicmodels)
setwd("G:/copaper/textm/text/text")
# https://quanteda.io/articles/pkgdown/examples/chinese.html
# https://quanteda.io/articles/pkgdown/quickstart_cn.html
tenyears <- readtext("*.txt",
docvarsfrom = "filenames",docvarnames = c( "PM","Year"),encoding ="UTF-8",dvsep = "_")
corp <- corpus(tenyears)
ch_stop <- stopwords("zh", source = "misc")
stopw<-c("性","化","1","5","9","0","中")
tokeninfo <- summary(corp)
if (require(ggplot2)) ggplot(data = tokeninfo, aes(x = Text, y = Tokens, group = 1)) +
geom_line() + geom_point() + theme_bw()#breaks = seq(1789, 2017, 12)

newword<-c("社会主义", "资本主义", "工商业","共产党","五年计划")
stopword1<-c("中","一个","走","道","占")
#segmenter <- worker(user="newword.txt",bylines = T, symbol=T)#stop_word = "stop.txt"
my_seg <- worker(user="newword",bylines = T)
book_wordsall <- corp %>%
tidy %>%mutate_all(funs(str_squish(.))) %>%
unnest_tokens(output = word,
input= text, drop = T,
token = function(x) segment(x, jiebar = my_seg)) %>%
filter(!word %in% ch_stop) %>% # remove stopwords
filter(!word %in% stopword1) %>%
filter(word %>% str_detect(pattern = "\\D+")) %>%
group_by(PM) %>%
#mutate(word_id = row_number()) %>% # create word index within each document
ungroup
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
book_words1 <- corp %>%
tidy %>%mutate_all(funs(str_squish(.))) %>%
unnest_tokens(output = word,
input= text, drop = T,
token = function(x) segment(x, jiebar = my_seg)) %>%
filter(!word %in% ch_stop) %>% # remove stopwords
filter(!word %in% stopword1) %>%
filter(word %>% str_detect(pattern = "\\D+")) %>%
group_by(PM) %>%
#mutate(word_id = row_number()) %>% # create word index within each document
ungroup
book_words2 <- corp %>%
tidy %>%mutate_all(funs(str_squish(.))) %>%
unnest_tokens(output = word,
input= text, drop = T,
token = function(x) segment(x, jiebar = my_seg)) %>%
filter(!word %in% ch_stop) %>% # remove stopwords
filter(!word %in% stopword1) %>%
filter(word %>% str_detect(pattern = "\\D+")) %>%
count(PM, Year,word, sort = TRUE)
book_words <- corp %>%
tidy %>%mutate_all(funs(str_squish(.))) %>%
unnest_tokens(output = word,
input= text, drop = T,
token = function(x) segment(x, jiebar = my_seg)) %>%
filter(!word %in% ch_stop) %>% # remove stopwords
filter(!word %in% stopword1) %>%
filter(word %>% str_detect(pattern = "\\D+")) %>%
count(PM,word, sort = TRUE)
#These words are, as measured by tf-idf, the most important to each novel and most readers would likely agree. What measuring tf-idf has done here is show us that Jane Austen used similar language across her six novels, and what distinguishes one novel from the rest within the collection of her works are the proper nouns, the names of people and places. This is the point of tf-idf; it identifies words that are important to one document within a collection of documents.
#mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn",
# "fig", "file", "cg", "cb", "cm",
# "ab", "_k", "_k_", "_x"))
#physics_words <- anti_join(physics_words, mystopwords, by = "word")
book_tf_idf <- book_words %>%
bind_tf_idf(word,PM,n)
book_tf_idf %>%
arrange(desc(tf_idf))
## # A tibble: 12,190 x 6
## PM word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 李克强 创新 197 0.00576 0.693 0.00399
## 2 周恩来 五年计划 209 0.00450 0.693 0.00312
## 3 李克强 深化 123 0.00360 0.693 0.00249
## 4 周恩来 资本主义 128 0.00276 0.693 0.00191
## 5 周恩来 基本建设 125 0.00269 0.693 0.00187
## 6 李克强 加大 84 0.00246 0.693 0.00170
## 7 周恩来 苏联 114 0.00246 0.693 0.00170
## 8 李克强 落实 76 0.00222 0.693 0.00154
## 9 李克强 持续 74 0.00216 0.693 0.00150
## 10 李克强 结构 68 0.00199 0.693 0.00138
## # ... with 12,180 more rows
book_tf_idf %>%
group_by(PM) %>%
slice_max(tf_idf, n = 15) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = PM)) +
geom_col(show.legend = FALSE) +
facet_wrap(~PM, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)

word_freq <- book_wordsall %>%
filter(!word %in% ch_stop) %>% # remove stopwords
filter(!word %in% stopword1) %>%
filter(word %>% str_detect(pattern = "\\D+")) %>% # remove words consisting of digits
count(word) %>%
arrange(desc(n))
word_freq %>%
filter(n > 50) %>%
filter(nchar(word) >=2) %>%
wordcloud2(shape = "star", size = 0.3)
usenet_words <- corp %>%
tidy %>%mutate_all(funs(str_squish(.))) %>%
unnest_tokens(output = word,
input= text, drop = T,
token = function(x) segment(x, jiebar = my_seg)) %>%
filter(!word %in% ch_stop) %>% # remove stopwords
filter(!word %in% stopword1) %>%
filter(word %>% str_detect(pattern = "\\D+"))
usenet_words %>%
count(word, sort = TRUE)
## # A tibble: 10,563 x 2
## word n
## <chr> <int>
## 1 发展 1034
## 2 建设 790
## 3 国家 652
## 4 人民 551
## 5 我国 539
## 6 经济 455
## 7 增长 441
## 8 社会主义 433
## 9 生产 431
## 10 工作 427
## # ... with 10,553 more rows
words_by_Year <- usenet_words %>%
count(Year, word, sort = TRUE) %>%
ungroup()
library(widyr)
newsgroup_cors <- words_by_Year %>%
pairwise_cor(Year, word, n, sort = TRUE)
library(ggraph)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
set.seed(2017)
newsgroup_cors %>%
filter(correlation > .5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(alpha = correlation, width = correlation)) +
geom_node_point(size = 6, color = "lightblue") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()

word_sci_newsgroups <- usenet_words %>%
#filter(str_detect(newsgroup, "^sci")) %>%
group_by(word) %>%
mutate(word_total = n()) %>%
ungroup() %>%
filter(word_total > 10)
sci_dtm <- word_sci_newsgroups %>%
unite(document, PM, Year) %>%
count(document, word) %>%
cast_dtm(document, word, n)
sci_lda <- LDA(sci_dtm, k = 2, control = list(seed = 2016))
sci_lda %>%
tidy() %>%
group_by(topic) %>%
slice_max(beta, n = 8) %>%
ungroup() %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()

sci_lda %>%
tidy(matrix = "gamma") %>%
separate(document, c("PM", "Year"), sep = "_") %>%
mutate(newsgroup = reorder(PM, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ PM) +
labs(x = "Topic",
y = "# of messages where this was the highest % topic")

library(readr)
hanyuvalue <- read_table("G:/copaper/textm/hanyuvalue.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## word = col_character(),
## value = col_double()
## )
contributions <- usenet_words %>%
inner_join(hanyuvalue, by = "word") %>%
group_by(word) %>%
summarize(occurences = n(),
contribution = sum(value))
contributions
## # A tibble: 2,262 x 3
## word occurences contribution
## <chr> <int> <dbl>
## 1 癌症 3 3
## 2 爱国 5 7
## 3 爱国主义 1 1.4
## 4 安不忘危 1 -0.285
## 5 安家 1 0.75
## 6 安居 1 0.9
## 7 安居梦 1 1
## 8 安全 85 86.9
## 9 鞍山 7 19.6
## 10 按期 1 0.675
## # ... with 2,252 more rows
contributions %>%
slice_max(abs(contribution), n = 25) %>%
mutate(word = reorder(word, contribution)) %>%
ggplot(aes(contribution, word, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
labs(y = NULL)

top_sentiment_words <- words_by_Year %>%
inner_join(hanyuvalue, by = "word") %>%
mutate(contribution = value * n / sum(n))
top_sentiment_words %>%
slice_max(abs(contribution), n =40) %>%
mutate(word = reorder(word, contribution)) %>%
ggplot(aes(contribution, word, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ Year)+
labs(y = NULL)

#quanteda first five
#docnames(ch_toks1) <- docnames(corp)
qcorp <- corpus_subset(corp, Year <=1970)
hcorp <- corpus_subset(corp, Year >=1970)
qch_toks <- qcorp %>%
segment(jiebar =my_seg)%>%
tokens(remove_punct = TRUE) %>%
tokens_remove(pattern = ch_stop)%>%
tokens_remove(pattern = stopw)
qch_dfm <- dfm(qch_toks)
topfeatures(qch_dfm,20) #词频排序
## 国家 建设 我国 人民 发展 工业 生产 社会主义
## 560 525 455 446 413 393 384 384
## 增加 增长 农业 工作 计划 必须 企业 完成
## 319 318 317 302 275 274 234 221
## 方面 已经 全国 五年计划
## 219 214 213 209
qfeatures_dfm_inaug <- textstat_frequency(qch_dfm)
qfeatures_dfm_inaug %>%
filter(frequency > 100) %>%
#filter(nchar(word) >=2) %>%
wordcloud2(shape = "star", size = 0.3)
wordcloud2(qfeatures_dfm_inaug, size = 1,shape = 'star')
qch_dfm%>%
textstat_frequency(n = 20) %>%
ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
geom_point() +
coord_flip() +
labs(x = NULL, y = "Frequency") +
theme_minimal()

hch_toks <- hcorp %>%
segment(jiebar =my_seg)%>%
tokens(remove_punct = TRUE) %>%
tokens_remove(pattern = ch_stop)%>%
tokens_remove(pattern = stopw)
hch_dfm <- dfm(hch_toks)
topfeatures(hch_dfm)
## 发展 改革 推进 建设 经济 加强 创新 企业 新 加快
## 621 329 288 265 249 241 197 191 186 170
hch_dfm%>%
textstat_frequency(n = 20) %>%
ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
geom_point() +
coord_flip() +
labs(x = NULL, y = "Frequency") +
theme_minimal()

hfeatures_dfm_inaug <- textstat_frequency(hch_dfm)
hfeatures_dfm_inaug %>%
filter(frequency > 50) %>%
#filter(nchar(word) >=2) %>%
wordcloud2(shape = "star", size = 0.3)
dfm_weight_pres1 <- corp %>%
#segment(jiebar =my_seg)%>%
tokens(remove_punct = TRUE) %>%
tokens_remove(pattern = ch_stop)%>%
tokens_remove(pattern = stopw)
dfm_weight_pres2 <- corp %>%
segment(jiebar =my_seg)%>%
tokens(remove_punct = TRUE) %>%
tokens_remove(pattern = ch_stop)%>%
tokens_remove(pattern = stopw)
dfm_weight_pres <- dfm(dfm_weight_pres1)
dfm_weight_pres3 <- dfm(dfm_weight_pres2)
docvars(dfm_weight_pres3) <- docvars(dfm_weight_pres)
freq_weight <- textstat_frequency(dfm_weight_pres3, n = 10, groups = PM)
ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_continuous(breaks = nrow(freq_weight):1,
labels = freq_weight$feature) +
labs(x = NULL, y = "Relative frequency")

ch_col <- textstat_collocations(qch_toks, size = 2, min_count = 20)
knitr::kable(head(ch_col, 10))
| 农业 生产 |
89 |
0 |
2 |
4.114043 |
29.88157 |
| 社会主义 改造 |
71 |
0 |
2 |
4.902053 |
28.33213 |
| 生活 水平 |
40 |
0 |
2 |
5.493943 |
24.72993 |
| 第一个 五年计划 |
116 |
0 |
2 |
8.717146 |
24.69897 |
| 占 比重 |
32 |
0 |
2 |
5.851792 |
22.99078 |
| 资本主义 工商业 |
32 |
0 |
2 |
6.504642 |
22.48245 |
| 人民 生活 |
48 |
0 |
2 |
3.810470 |
21.57885 |
| 全国 人民 |
51 |
0 |
2 |
3.610849 |
21.53231 |
| 国家 预算 |
48 |
0 |
2 |
3.956755 |
21.28164 |
| 生产 合作社 |
39 |
0 |
2 |
4.224305 |
20.82794 |
ch_col1 <- textstat_collocations(hch_toks, size = 2, min_count = 20)
knitr::kable(head(ch_col1 , 10))
| 人民 群众 |
37 |
0 |
2 |
5.793552 |
23.91550 |
| 特色 社会主义 |
29 |
0 |
2 |
7.657403 |
21.63584 |
| 深入 推进 |
39 |
0 |
2 |
4.874122 |
21.08252 |
| 地方 政府 |
31 |
0 |
2 |
5.135853 |
20.96735 |
| 核心 党中央 |
22 |
0 |
2 |
7.562209 |
19.79827 |
| 中国 特色 |
44 |
0 |
2 |
7.353838 |
19.06355 |
| 脱贫 攻坚 |
21 |
0 |
2 |
7.569948 |
18.40448 |
| 政府 工作 |
25 |
0 |
2 |
4.149822 |
17.46341 |
| 改革 完善 |
34 |
0 |
2 |
3.427815 |
17.11324 |
| 全面 建成 |
29 |
0 |
2 |
6.893374 |
16.50033 |