#广州大学公共管理学院社会学系创新班课程:文本分析入门/政府工作报告
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 4.0.3
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.3
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.0.3
## Package version: 2.1.2
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(readtext)
## Warning: package 'readtext' was built under R version 4.0.3
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.0.3
library(ggplot2)
library(tidyverse)
## -- Attaching packages -------------------------------- tidyverse 1.3.0 --
## √ tibble  3.0.3     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## √ purrr   0.3.4
## -- Conflicts ----------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read every .txt file in the data folder into a corpus. The parts of each
# file name are mapped onto the document variables PM and Year.
# NOTE(review): the absolute setwd() path ties the script to one machine;
# consider passing a full path to readtext() instead - confirm nothing later
# in the script relies on the working directory first.
setwd("D:/text")
tenyears <- readtext(
  "*.txt",
  docvarsfrom = "filenames",
  docvarnames = c("PM", "Year"),
  encoding = "UTF-8"
)
corp <- corpus(tenyears)

# Two stopword lists: the packaged Chinese list ("misc" source) and a short
# custom list of single characters/digits removed in every analysis below.
ch_stop <- stopwords("zh", source = "misc")
stopw <- c("性", "化", "1", "5", "9", "0", "中")
# Split the corpus into two non-overlapping periods around 1970.
# The later subset uses a strict ">" so that a document dated exactly 1970
# cannot fall into both subsets and be counted twice.
# NOTE(review): assumes the Year docvar was parsed as a number - confirm.
qcorp <- corpus_subset(corp, Year <= 1970)
hcorp <- corpus_subset(corp, Year > 1970)
# Early period ("first five", qcorp): tokenise without punctuation, then
# strip the packaged stopword list followed by the custom one.
qch_toks <- tokens(qcorp, remove_punct = TRUE)
qch_toks <- tokens_remove(qch_toks, pattern = ch_stop)
qch_toks <- tokens_remove(qch_toks, pattern = stopw)
qch_dfm <- dfm(qch_toks)

# Print the 20 most frequent features.
topfeatures(qch_dfm, n = 20)
# Recorded output from an earlier run:
## 建设 工业 国家 主义 生产 人民 计划 社会 我国 发展 工作 经济 农业 增长 五年 增加 
##  730  711  685  654  585  557  552  492  455  424  384  370  360  335  329  302 
## 一个 百分 完成 必须 
##  283  283  275  274

# Word cloud built from the full frequency table.
qfeatures_dfm_inaug <- textstat_frequency(qch_dfm)
wordcloud2(qfeatures_dfm_inaug, size = 1, shape = "star")

# Dot plot of the 20 most frequent features, ordered by frequency.
ggplot(
  textstat_frequency(qch_dfm, n = 20),
  aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

# Recent period (hcorp): same preprocessing as the early period - tokenise
# without punctuation, then remove both stopword lists in turn.
hch_toks <- tokens(hcorp, remove_punct = TRUE)
hch_toks <- tokens_remove(hch_toks, pattern = ch_stop)
hch_toks <- tokens_remove(hch_toks, pattern = stopw)
hch_dfm <- dfm(hch_toks)

# Print the most frequent features (topfeatures() default count).
topfeatures(hch_dfm)
# Recorded output from an earlier run:
## 发展 改革 经济 推进 社会 建设 加强 企业 政策 创新 
##  661  406  346  292  285  284  244  237  228  220

# Word cloud built from the full frequency table.
hfeatures_dfm_inaug <- textstat_frequency(hch_dfm)
wordcloud2(hfeatures_dfm_inaug, size = 1, shape = "star")

# Dot plot of the 20 most frequent features, ordered by frequency.
ggplot(
  textstat_frequency(hch_dfm, n = 20),
  aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

## Group comparison: top 10 features for each value of the PM docvar.
# Build a dfm for the whole corpus with the same preprocessing as above and
# convert counts to within-document proportions. Without this weighting the
# plot below would show raw counts under a "Relative frequency" axis label.
dfm_weight_pres <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = ch_stop) %>%
  tokens_remove(pattern = stopw) %>%
  dfm() %>%
  dfm_weight(scheme = "prop")

# Top 10 features per PM group.
freq_weight <- textstat_frequency(dfm_weight_pres, n = 10, groups = "PM")

# One facet per group. Each row of freq_weight gets its own x position
# (counted down from nrow to 1) and is labelled with its feature, so every
# facet lists its own top features.
# rev(seq_len(n)) is used instead of n:1 so an empty table yields an empty
# sequence rather than c(0, 1).
ggplot(
  data = freq_weight,
  aes(x = rev(seq_len(nrow(freq_weight))), y = frequency)
) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_continuous(
    breaks = rev(seq_len(nrow(freq_weight))),
    labels = freq_weight$feature
  ) +
  labs(x = NULL, y = "Relative frequency")