#广州大学公共管理学院社会学系创新班课程:文本分析入门/政府工作报告
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 4.0.3
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.3
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.0.3
## Package version: 2.1.2
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(readtext)
## Warning: package 'readtext' was built under R version 4.0.3
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.0.3
library(ggplot2)
library(tidyverse)
## -- Attaching packages -------------------------------- tidyverse 1.3.0 --
## √ tibble 3.0.3 √ dplyr 1.0.2
## √ tidyr 1.1.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## √ purrr 0.3.4
## -- Conflicts ----------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Work from the folder that holds the report text files; the relative glob
# passed to readtext() below is resolved against this directory.
setwd("D:/text")

# Read every .txt file in the working directory as UTF-8. Document variables
# are taken from the file names and labelled PM and Year.
tenyears <- readtext(
  "*.txt",
  docvarsfrom = "filenames",
  docvarnames = c("PM", "Year"),
  encoding = "UTF-8"
)

# Build the quanteda corpus used by every later section.
corp <- corpus(tenyears)

# Two stopword lists: the "misc" Chinese list shipped with the stopwords
# source, plus a handful of extra single-character tokens to drop.
ch_stop <- stopwords("zh", source = "misc")
stopw <- c("性", "化", "1", "5", "9", "0", "中")
# Split the corpus into two disjoint periods on the Year document variable.
# qcorp holds documents dated up to and including 1970, hcorp holds everything
# after 1970. The boundary year belongs to the early period only; using
# `>= 1970` for the later period would place a 1970 document in both subsets.
# NOTE(review): assumes Year was parsed as a number from the file names -
# confirm.
qcorp <- corpus_subset(corp, Year <= 1970)
hcorp <- corpus_subset(corp, Year > 1970)
# Early period (qcorp) ----
# Tokenize without punctuation, then drop both stopword lists in turn.
qch_toks <- tokens(qcorp, remove_punct = TRUE)
qch_toks <- tokens_remove(qch_toks, pattern = ch_stop)
qch_toks <- tokens_remove(qch_toks, pattern = stopw)

# Document-feature matrix and its 20 most frequent features.
qch_dfm <- dfm(qch_toks)
topfeatures(qch_dfm, n = 20)
# Console output recorded in the original script:
## 建设 工业 国家 主义 生产 人民 计划 社会 我国 发展 工作 经济 农业 增长 五年 增加
## 730 711 685 654 585 557 552 492 455 424 384 370 360 335 329 302
## 一个 百分 完成 必须
## 283 283 275 274

# Word cloud built from the full frequency table.
qfeatures_dfm_inaug <- textstat_frequency(qch_dfm)
wordcloud2(qfeatures_dfm_inaug, size = 1, shape = "star")

# Dot plot of the 20 most frequent features, ordered by frequency and drawn
# horizontally.
ggplot(
  textstat_frequency(qch_dfm, n = 20),
  aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

# Later period (hcorp) ----
# Tokenize without punctuation, then drop both stopword lists in turn.
hch_toks <- tokens(hcorp, remove_punct = TRUE)
hch_toks <- tokens_remove(hch_toks, pattern = ch_stop)
hch_toks <- tokens_remove(hch_toks, pattern = stopw)

# Document-feature matrix and its most frequent features (topfeatures()
# default count).
hch_dfm <- dfm(hch_toks)
topfeatures(hch_dfm)
# Console output recorded in the original script:
## 发展 改革 经济 推进 社会 建设 加强 企业 政策 创新
## 661 406 346 292 285 284 244 237 228 220

# Word cloud built from the full frequency table.
hfeatures_dfm_inaug <- textstat_frequency(hch_dfm)
wordcloud2(hfeatures_dfm_inaug, size = 1, shape = "star")

# Dot plot of the 20 most frequent features, ordered by frequency and drawn
# horizontally.
ggplot(
  textstat_frequency(hch_dfm, n = 20),
  aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

## Top terms per PM (grouped frequency plot)
# Tokenize the whole corpus without punctuation and drop both stopword lists.
# The object names are kept as in the original script; note that
# dfm_weight_pres first holds the tokens and is then overwritten with the dfm.
dfm_weight_pres <- corp %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = ch_stop) %>%
  tokens_remove(pattern = stopw)
dfm_weight_pres <- dfm(dfm_weight_pres)

# Ten most frequent features within each value of the PM document variable.
freq_weight <- textstat_frequency(dfm_weight_pres, n = 10, groups = "PM")

# One facet per PM. Each row of freq_weight gets its own x position (counting
# down from the number of rows) so that every facet shows only its own terms;
# the positions are then relabelled with the feature names.
# rev(seq_len(n)) is used instead of n:1 so an empty table yields an empty
# sequence rather than c(0, 1).
# The dfm is not weighted, so the plotted values are raw counts and the axis
# is labelled "Frequency" (it previously read "Relative frequency").
ggplot(
  data = freq_weight,
  aes(x = rev(seq_len(nrow(freq_weight))), y = frequency)
) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_continuous(
    breaks = rev(seq_len(nrow(freq_weight))),
    labels = freq_weight$feature
  ) +
  labs(x = NULL, y = "Frequency")
