text.R

library(jiebaR)

## Loading required package: jiebaRD

library(quanteda)

## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.

## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.

## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.

## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.

## Package version: 3.2.1
## Unicode version: 13.0
## ICU version: 69.1

## Parallel computing: 8 of 8 threads used.

## See https://quanteda.io for tutorials and examples.

library(readtext)
library(quanteda.textstats)
library(wordcloud2)
library(ggplot2)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.7     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggplot2)
library(tidyverse)
library(jiebaR)  
library(dplyr)
library(tidytext)
library(stringr)
library(topicmodels)
setwd("G:/copaper/textm/text/text")

#   https://quanteda.io/articles/pkgdown/examples/chinese.html
#  https://quanteda.io/articles/pkgdown/quickstart_cn.html

tenyears <- readtext("*.txt",
                     docvarsfrom = "filenames",docvarnames = c( "PM","Year"),encoding ="UTF-8",dvsep = "_")
corp <- corpus(tenyears)
ch_stop <- stopwords("zh", source = "misc")
stopw<-c("性","化","１","５","９","０","中")

tokeninfo <- summary(corp)
if (require(ggplot2)) ggplot(data = tokeninfo, aes(x = Text, y = Tokens, group = 1)) + 
    geom_line() + geom_point() +  theme_bw()#breaks = seq(1789, 2017, 12)

newword<-c("社会主义", "资本主义", "工商业","共产党","五年计划")
stopword1<-c("中","一个","走","道","占")
#segmenter <- worker(user="newword.txt",bylines = T, symbol=T)#stop_word = "stop.txt"
my_seg <- worker(user="newword",bylines = T)


book_wordsall <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>%
    group_by(PM) %>%
    #mutate(word_id = row_number()) %>% # create word index within each document
    ungroup

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

book_words1 <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>%
    group_by(PM) %>%
    #mutate(word_id = row_number()) %>% # create word index within each document
    ungroup

book_words2 <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>%
    count(PM, Year,word, sort = TRUE)


book_words <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>%
    count(PM,word, sort = TRUE)


#These words are, as measured by tf-idf, the most important to each novel and most readers would likely agree. What measuring tf-idf has done here is show us that Jane Austen used similar language across her six novels, and what distinguishes one novel from the rest within the collection of her works are the proper nouns, the names of people and places. This is the point of tf-idf; it identifies words that are important to one document within a collection of documents.
#mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn", 
#                              "fig", "file", "cg", "cb", "cm",
#                               "ab", "_k", "_k_", "_x"))

#physics_words <- anti_join(physics_words, mystopwords, by = "word")
book_tf_idf <- book_words %>%
    bind_tf_idf(word,PM,n)

book_tf_idf %>%
    arrange(desc(tf_idf))

## # A tibble: 12,190 x 6
##    PM     word         n      tf   idf  tf_idf
##    <chr>  <chr>    <int>   <dbl> <dbl>   <dbl>
##  1 李克强 创新       197 0.00576 0.693 0.00399
##  2 周恩来 五年计划   209 0.00450 0.693 0.00312
##  3 李克强 深化       123 0.00360 0.693 0.00249
##  4 周恩来 资本主义   128 0.00276 0.693 0.00191
##  5 周恩来 基本建设   125 0.00269 0.693 0.00187
##  6 李克强 加大        84 0.00246 0.693 0.00170
##  7 周恩来 苏联       114 0.00246 0.693 0.00170
##  8 李克强 落实        76 0.00222 0.693 0.00154
##  9 李克强 持续        74 0.00216 0.693 0.00150
## 10 李克强 结构        68 0.00199 0.693 0.00138
## # ... with 12,180 more rows

book_tf_idf %>%
    group_by(PM) %>%
    slice_max(tf_idf, n = 15) %>%
    ungroup() %>%
    ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = PM)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~PM, ncol = 2, scales = "free") +
    labs(x = "tf-idf", y = NULL)

word_freq <- book_wordsall %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>% # remove words consisting of digits
    count(word) %>%
    arrange(desc(n)) 
word_freq %>%
    filter(n > 50) %>% 
    filter(nchar(word) >=2) %>%
    wordcloud2(shape = "star", size = 0.3)

usenet_words <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+"))

usenet_words %>%
    count(word, sort = TRUE)

## # A tibble: 10,563 x 2
##    word         n
##    <chr>    <int>
##  1 发展      1034
##  2 建设       790
##  3 国家       652
##  4 人民       551
##  5 我国       539
##  6 经济       455
##  7 增长       441
##  8 社会主义   433
##  9 生产       431
## 10 工作       427
## # ... with 10,553 more rows

words_by_Year <- usenet_words %>%
    count(Year, word, sort = TRUE) %>%
    ungroup()

library(widyr)

newsgroup_cors <- words_by_Year %>%
    pairwise_cor(Year, word, n, sort = TRUE)

library(ggraph)
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

set.seed(2017)

newsgroup_cors %>%
    filter(correlation > .5) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(alpha = correlation, width = correlation)) +
    geom_node_point(size = 6, color = "lightblue") +
    geom_node_text(aes(label = name), repel = TRUE) +
    theme_void()

word_sci_newsgroups <- usenet_words %>%
    #filter(str_detect(newsgroup, "^sci")) %>%
    group_by(word) %>%
    mutate(word_total = n()) %>%
    ungroup() %>%
    filter(word_total > 10)
sci_dtm <- word_sci_newsgroups %>%
    unite(document, PM, Year) %>%
    count(document, word) %>%
    cast_dtm(document, word, n)

sci_lda <- LDA(sci_dtm, k = 2, control = list(seed = 2016))
sci_lda %>%
    tidy() %>%
    group_by(topic) %>%
    slice_max(beta, n = 8) %>%
    ungroup() %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(beta, term, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    scale_y_reordered()

sci_lda %>%
    tidy(matrix = "gamma") %>%
    separate(document, c("PM", "Year"), sep = "_") %>%
    mutate(newsgroup = reorder(PM, gamma * topic)) %>%
    ggplot(aes(factor(topic), gamma)) +
    geom_boxplot() +
    facet_wrap(~ PM) +
    labs(x = "Topic",
         y = "# of messages where this was the highest % topic")

library(readr)
hanyuvalue <- read_table("G:/copaper/textm/hanyuvalue.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   word = col_character(),
##   value = col_double()
## )

contributions <- usenet_words %>%
    inner_join(hanyuvalue, by = "word") %>%
    group_by(word) %>%
    summarize(occurences = n(),
              contribution = sum(value))
contributions

## # A tibble: 2,262 x 3
##    word     occurences contribution
##    <chr>         <int>        <dbl>
##  1 癌症              3        3    
##  2 爱国              5        7    
##  3 爱国主义          1        1.4  
##  4 安不忘危          1       -0.285
##  5 安家              1        0.75 
##  6 安居              1        0.9  
##  7 安居梦            1        1    
##  8 安全             85       86.9  
##  9 鞍山              7       19.6  
## 10 按期              1        0.675
## # ... with 2,252 more rows

contributions %>%
    slice_max(abs(contribution), n = 25) %>%
    mutate(word = reorder(word, contribution)) %>%
    ggplot(aes(contribution, word, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    labs(y = NULL)

top_sentiment_words <- words_by_Year %>%
    inner_join(hanyuvalue, by = "word") %>%
    mutate(contribution = value * n / sum(n))
top_sentiment_words %>%
    slice_max(abs(contribution), n =40) %>%
    mutate(word = reorder(word, contribution)) %>%
    ggplot(aes(contribution, word, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ Year)+
    labs(y = NULL)

#quanteda first five   
#docnames(ch_toks1) <- docnames(corp)
qcorp <- corpus_subset(corp, Year <=1970)
hcorp <- corpus_subset(corp, Year >=1970)
qch_toks <- qcorp %>% 
  segment(jiebar =my_seg)%>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = ch_stop)%>%
  tokens_remove(pattern = stopw)
qch_dfm <- dfm(qch_toks)
topfeatures(qch_dfm,20) #词频排序

##     国家     建设     我国     人民     发展     工业     生产 社会主义 
##      560      525      455      446      413      393      384      384 
##     增加     增长     农业     工作     计划     必须     企业     完成 
##      319      318      317      302      275      274      234      221 
##     方面     已经     全国 五年计划 
##      219      214      213      209

qfeatures_dfm_inaug <- textstat_frequency(qch_dfm)
qfeatures_dfm_inaug %>%
    filter(frequency > 100) %>% 
    #filter(nchar(word) >=2) %>%
    wordcloud2(shape = "star", size = 0.3)

wordcloud2(qfeatures_dfm_inaug, size = 1,shape = 'star')

qch_dfm%>% 
  textstat_frequency(n = 20) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL,  y = "Frequency") +
  theme_minimal()

hch_toks <- hcorp %>%
    segment(jiebar =my_seg)%>%  
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = ch_stop)%>%
  tokens_remove(pattern = stopw)
hch_dfm <- dfm(hch_toks)
topfeatures(hch_dfm)

## 发展 改革 推进 建设 经济 加强 创新 企业   新 加快 
##  621  329  288  265  249  241  197  191  186  170

hch_dfm%>% 
  textstat_frequency(n = 20) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL,  y = "Frequency") +
  theme_minimal()

hfeatures_dfm_inaug <- textstat_frequency(hch_dfm)
hfeatures_dfm_inaug %>%
    filter(frequency > 50) %>% 
    #filter(nchar(word) >=2) %>%
    wordcloud2(shape = "star", size = 0.3)

dfm_weight_pres1 <- corp %>% 
    #segment(jiebar =my_seg)%>%
    tokens(remove_punct = TRUE) %>%
    tokens_remove(pattern = ch_stop)%>%
    tokens_remove(pattern = stopw)

dfm_weight_pres2 <- corp %>% 
  segment(jiebar =my_seg)%>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = ch_stop)%>%
  tokens_remove(pattern = stopw)


dfm_weight_pres <- dfm(dfm_weight_pres1)
dfm_weight_pres3 <- dfm(dfm_weight_pres2)
docvars(dfm_weight_pres3) <- docvars(dfm_weight_pres)
freq_weight <- textstat_frequency(dfm_weight_pres3, n = 10, groups = PM)
ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_continuous(breaks = nrow(freq_weight):1,
                     labels = freq_weight$feature) +
  labs(x = NULL, y = "Relative frequency")

ch_col <- textstat_collocations(qch_toks, size = 2, min_count = 20)
knitr::kable(head(ch_col, 10))

collocation	count	length	lambda	z
农业生产	89	2	4.114043	29.88157
社会主义改造	71	2	4.902053	28.33213
生活水平	40	2	5.493943	24.72993
第一个五年计划	116	2	8.717146	24.69897
占比重	32	2	5.851792	22.99078
资本主义工商业	32	2	6.504642	22.48245
人民生活	48	2	3.810470	21.57885
全国人民	51	2	3.610849	21.53231
国家预算	48	2	3.956755	21.28164
生产合作社	39	2	4.224305	20.82794

ch_col1 <- textstat_collocations(hch_toks, size = 2, min_count = 20)
knitr::kable(head(ch_col1 , 10))

collocation	count	length	lambda	z
人民群众	37	2	5.793552	23.91550
特色社会主义	29	2	7.657403	21.63584
深入推进	39	2	4.874122	21.08252
地方政府	31	2	5.135853	20.96735
核心党中央	22	2	7.562209	19.79827
中国特色	44	2	7.353838	19.06355
脱贫攻坚	21	2	7.569948	18.40448
政府工作	25	2	4.149822	17.46341
改革完善	34	2	3.427815	17.11324
全面建成	29	2	6.893374	16.50033

text.R

XY

2022-06-02