library(jiebaR)
## Loading required package: jiebaRD
library(quanteda)
## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Package version: 3.2.1
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(readtext)
library(quanteda.textstats)
library(wordcloud2)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.7     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(tidyverse)
library(jiebaR)  
library(dplyr)
library(tidytext)
library(stringr)
library(topicmodels)
setwd("G:/copaper/textm/text/text")

#   https://quanteda.io/articles/pkgdown/examples/chinese.html
#  https://quanteda.io/articles/pkgdown/quickstart_cn.html

tenyears <- readtext("*.txt",
                     docvarsfrom = "filenames",docvarnames = c( "PM","Year"),encoding ="UTF-8",dvsep = "_")
corp <- corpus(tenyears)
ch_stop <- stopwords("zh", source = "misc")
stopw<-c("性","化","1","5","9","0","中")

tokeninfo <- summary(corp)
if (require(ggplot2)) ggplot(data = tokeninfo, aes(x = Text, y = Tokens, group = 1)) + 
    geom_line() + geom_point() +  theme_bw()#breaks = seq(1789, 2017, 12)

newword<-c("社会主义", "资本主义", "工商业","共产党","五年计划")
stopword1<-c("中","一个","走","道","占")
#segmenter <- worker(user="newword.txt",bylines = T, symbol=T)#stop_word = "stop.txt"
my_seg <- worker(user="newword",bylines = T)


book_wordsall <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>%
    group_by(PM) %>%
    #mutate(word_id = row_number()) %>% # create word index within each document
    ungroup
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
book_words1 <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>%
    group_by(PM) %>%
    #mutate(word_id = row_number()) %>% # create word index within each document
    ungroup

book_words2 <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>%
    count(PM, Year,word, sort = TRUE)


book_words <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>%
    count(PM,word, sort = TRUE)


#These words are, as measured by tf-idf, the most important to each novel and most readers would likely agree. What measuring tf-idf has done here is show us that Jane Austen used similar language across her six novels, and what distinguishes one novel from the rest within the collection of her works are the proper nouns, the names of people and places. This is the point of tf-idf; it identifies words that are important to one document within a collection of documents.
#mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn", 
#                              "fig", "file", "cg", "cb", "cm",
#                               "ab", "_k", "_k_", "_x"))

#physics_words <- anti_join(physics_words, mystopwords, by = "word")
book_tf_idf <- book_words %>%
    bind_tf_idf(word,PM,n)

book_tf_idf %>%
    arrange(desc(tf_idf))
## # A tibble: 12,190 x 6
##    PM     word         n      tf   idf  tf_idf
##    <chr>  <chr>    <int>   <dbl> <dbl>   <dbl>
##  1 李克强 创新       197 0.00576 0.693 0.00399
##  2 周恩来 五年计划   209 0.00450 0.693 0.00312
##  3 李克强 深化       123 0.00360 0.693 0.00249
##  4 周恩来 资本主义   128 0.00276 0.693 0.00191
##  5 周恩来 基本建设   125 0.00269 0.693 0.00187
##  6 李克强 加大        84 0.00246 0.693 0.00170
##  7 周恩来 苏联       114 0.00246 0.693 0.00170
##  8 李克强 落实        76 0.00222 0.693 0.00154
##  9 李克强 持续        74 0.00216 0.693 0.00150
## 10 李克强 结构        68 0.00199 0.693 0.00138
## # ... with 12,180 more rows
book_tf_idf %>%
    group_by(PM) %>%
    slice_max(tf_idf, n = 15) %>%
    ungroup() %>%
    ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = PM)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~PM, ncol = 2, scales = "free") +
    labs(x = "tf-idf", y = NULL)

word_freq <- book_wordsall %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+")) %>% # remove words consisting of digits
    count(word) %>%
    arrange(desc(n)) 
word_freq %>%
    filter(n > 50) %>% 
    filter(nchar(word) >=2) %>%
    wordcloud2(shape = "star", size = 0.3)
usenet_words <- corp %>%
    tidy %>%mutate_all(funs(str_squish(.))) %>%
    unnest_tokens(output = word, 
                  input= text, drop = T,
                  token = function(x) segment(x, jiebar = my_seg)) %>%
    filter(!word %in% ch_stop) %>% # remove stopwords
    filter(!word %in% stopword1) %>%
    filter(word %>% str_detect(pattern = "\\D+"))

usenet_words %>%
    count(word, sort = TRUE)
## # A tibble: 10,563 x 2
##    word         n
##    <chr>    <int>
##  1 发展      1034
##  2 建设       790
##  3 国家       652
##  4 人民       551
##  5 我国       539
##  6 经济       455
##  7 增长       441
##  8 社会主义   433
##  9 生产       431
## 10 工作       427
## # ... with 10,553 more rows
words_by_Year <- usenet_words %>%
    count(Year, word, sort = TRUE) %>%
    ungroup()

library(widyr)

newsgroup_cors <- words_by_Year %>%
    pairwise_cor(Year, word, n, sort = TRUE)

library(ggraph)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
set.seed(2017)

newsgroup_cors %>%
    filter(correlation > .5) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(alpha = correlation, width = correlation)) +
    geom_node_point(size = 6, color = "lightblue") +
    geom_node_text(aes(label = name), repel = TRUE) +
    theme_void()

word_sci_newsgroups <- usenet_words %>%
    #filter(str_detect(newsgroup, "^sci")) %>%
    group_by(word) %>%
    mutate(word_total = n()) %>%
    ungroup() %>%
    filter(word_total > 10)
sci_dtm <- word_sci_newsgroups %>%
    unite(document, PM, Year) %>%
    count(document, word) %>%
    cast_dtm(document, word, n)

sci_lda <- LDA(sci_dtm, k = 2, control = list(seed = 2016))
sci_lda %>%
    tidy() %>%
    group_by(topic) %>%
    slice_max(beta, n = 8) %>%
    ungroup() %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(beta, term, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    scale_y_reordered()

sci_lda %>%
    tidy(matrix = "gamma") %>%
    separate(document, c("PM", "Year"), sep = "_") %>%
    mutate(newsgroup = reorder(PM, gamma * topic)) %>%
    ggplot(aes(factor(topic), gamma)) +
    geom_boxplot() +
    facet_wrap(~ PM) +
    labs(x = "Topic",
         y = "# of messages where this was the highest % topic")

library(readr)
hanyuvalue <- read_table("G:/copaper/textm/hanyuvalue.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   word = col_character(),
##   value = col_double()
## )
contributions <- usenet_words %>%
    inner_join(hanyuvalue, by = "word") %>%
    group_by(word) %>%
    summarize(occurences = n(),
              contribution = sum(value))
contributions
## # A tibble: 2,262 x 3
##    word     occurences contribution
##    <chr>         <int>        <dbl>
##  1 癌症              3        3    
##  2 爱国              5        7    
##  3 爱国主义          1        1.4  
##  4 安不忘危          1       -0.285
##  5 安家              1        0.75 
##  6 安居              1        0.9  
##  7 安居梦            1        1    
##  8 安全             85       86.9  
##  9 鞍山              7       19.6  
## 10 按期              1        0.675
## # ... with 2,252 more rows
contributions %>%
    slice_max(abs(contribution), n = 25) %>%
    mutate(word = reorder(word, contribution)) %>%
    ggplot(aes(contribution, word, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    labs(y = NULL)

top_sentiment_words <- words_by_Year %>%
    inner_join(hanyuvalue, by = "word") %>%
    mutate(contribution = value * n / sum(n))
top_sentiment_words %>%
    slice_max(abs(contribution), n =40) %>%
    mutate(word = reorder(word, contribution)) %>%
    ggplot(aes(contribution, word, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ Year)+
    labs(y = NULL)

#quanteda first five   
#docnames(ch_toks1) <- docnames(corp)
qcorp <- corpus_subset(corp, Year <=1970)
hcorp <- corpus_subset(corp, Year >=1970)
qch_toks <- qcorp %>% 
  segment(jiebar =my_seg)%>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = ch_stop)%>%
  tokens_remove(pattern = stopw)
qch_dfm <- dfm(qch_toks)
topfeatures(qch_dfm,20) #词频排序
##     国家     建设     我国     人民     发展     工业     生产 社会主义 
##      560      525      455      446      413      393      384      384 
##     增加     增长     农业     工作     计划     必须     企业     完成 
##      319      318      317      302      275      274      234      221 
##     方面     已经     全国 五年计划 
##      219      214      213      209
qfeatures_dfm_inaug <- textstat_frequency(qch_dfm)
qfeatures_dfm_inaug %>%
    filter(frequency > 100) %>% 
    #filter(nchar(word) >=2) %>%
    wordcloud2(shape = "star", size = 0.3)
wordcloud2(qfeatures_dfm_inaug, size = 1,shape = 'star')
qch_dfm%>% 
  textstat_frequency(n = 20) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL,  y = "Frequency") +
  theme_minimal()

hch_toks <- hcorp %>%
    segment(jiebar =my_seg)%>%  
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = ch_stop)%>%
  tokens_remove(pattern = stopw)
hch_dfm <- dfm(hch_toks)
topfeatures(hch_dfm)
## 发展 改革 推进 建设 经济 加强 创新 企业   新 加快 
##  621  329  288  265  249  241  197  191  186  170
hch_dfm%>% 
  textstat_frequency(n = 20) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL,  y = "Frequency") +
  theme_minimal()

hfeatures_dfm_inaug <- textstat_frequency(hch_dfm)
hfeatures_dfm_inaug %>%
    filter(frequency > 50) %>% 
    #filter(nchar(word) >=2) %>%
    wordcloud2(shape = "star", size = 0.3)
dfm_weight_pres1 <- corp %>% 
    #segment(jiebar =my_seg)%>%
    tokens(remove_punct = TRUE) %>%
    tokens_remove(pattern = ch_stop)%>%
    tokens_remove(pattern = stopw)

dfm_weight_pres2 <- corp %>% 
  segment(jiebar =my_seg)%>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = ch_stop)%>%
  tokens_remove(pattern = stopw)


dfm_weight_pres <- dfm(dfm_weight_pres1)
dfm_weight_pres3 <- dfm(dfm_weight_pres2)
docvars(dfm_weight_pres3) <- docvars(dfm_weight_pres)
freq_weight <- textstat_frequency(dfm_weight_pres3, n = 10, groups = PM)
ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_continuous(breaks = nrow(freq_weight):1,
                     labels = freq_weight$feature) +
  labs(x = NULL, y = "Relative frequency")

ch_col <- textstat_collocations(qch_toks, size = 2, min_count = 20)
knitr::kable(head(ch_col, 10))
collocation count count_nested length lambda z
农业 生产 89 0 2 4.114043 29.88157
社会主义 改造 71 0 2 4.902053 28.33213
生活 水平 40 0 2 5.493943 24.72993
第一个 五年计划 116 0 2 8.717146 24.69897
占 比重 32 0 2 5.851792 22.99078
资本主义 工商业 32 0 2 6.504642 22.48245
人民 生活 48 0 2 3.810470 21.57885
全国 人民 51 0 2 3.610849 21.53231
国家 预算 48 0 2 3.956755 21.28164
生产 合作社 39 0 2 4.224305 20.82794
ch_col1 <- textstat_collocations(hch_toks, size = 2, min_count = 20)
knitr::kable(head(ch_col1 , 10))
collocation count count_nested length lambda z
人民 群众 37 0 2 5.793552 23.91550
特色 社会主义 29 0 2 7.657403 21.63584
深入 推进 39 0 2 4.874122 21.08252
地方 政府 31 0 2 5.135853 20.96735
核心 党中央 22 0 2 7.562209 19.79827
中国 特色 44 0 2 7.353838 19.06355
脱贫 攻坚 21 0 2 7.569948 18.40448
政府 工作 25 0 2 4.149822 17.46341
改革 完善 34 0 2 3.427815 17.11324
全面 建成 29 0 2 6.893374 16.50033