更多细节请参考 tidytext、quanteda、jiebaR 的官方教程以及课程分享链接。
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 4.0.3
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.3
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.0.3
## Package version: 2.1.2
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(readtext)
## Warning: package 'readtext' was built under R version 4.0.3
library(tidyverse)
## -- Attaching packages ------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2 √ purrr 0.3.4
## √ tibble 3.0.3 √ dplyr 1.0.2
## √ tidyr 1.1.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## -- Conflicts ---------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load every .txt file in the course folder into a quanteda corpus.
# NOTE(review): the working directory is hard-coded to one machine; it is kept
# here so that any later relative paths keep resolving as before.
setwd("E:/course/zf")
# File names supply two document variables, named PM and Year.
dat_txtmultiple2 <- readtext(
  "*.txt",
  docvarsfrom = "filenames",
  docvarnames = c("PM", "Year"),
  encoding = "UTF-8"
)
corp <- corpus(dat_txtmultiple2)
# Summarise the first 5 documents, reusing `corp` instead of building the
# corpus a second time.
summary(corp, 5)
## Corpus consisting of 17 documents, showing 5 documents:
##
## Text Types Tokens Sentences PM Year
## 华国锋_1978.txt 2965 19119 659 华国锋 1978
## 华国锋_1979.txt 2655 17330 504 华国锋 1979
## 华国锋_1980.txt 1454 8376 342 华国锋 1980
## 赵紫阳_1981.txt 2845 20135 671 赵紫阳 1981
## 赵紫阳_1982.txt 2728 18480 586 赵紫阳 1982
# Chinese stop-word list from the "misc" source.
ch_stop <- stopwords("zh", source = "misc")
# Tokenise the corpus with punctuation removed, then drop the stop words.
ch_toks <- tokens_remove(
  tokens(corp, remove_punct = TRUE),
  pattern = ch_stop
)
# Build the document-feature matrix and print its top features.
ch_dfm <- dfm(ch_toks)
topfeatures(ch_dfm)
## 主义 经济 人民 建设 发展 社会 国家 生产 我国 工业
## 1817 1562 1560 1518 1458 1456 1405 1338 1155 1143
# Feature frequency table computed from the dfm.
features_dfm_inaug <- textstat_frequency(ch_dfm)
# Word cloud drawn straight from the dfm: minimum count 100, at most 100
# words, no rotation, fixed (non-random) order, 8-colour "Dark2" palette.
textplot_wordcloud(
  ch_dfm,
  min_count = 100,
  max_words = 100,
  min_size = 0.5,
  max_size = 2.8,
  rotation = 0,
  random_order = FALSE,
  color = RColorBrewer::brewer.pal(8, "Dark2")
)
# Word-frequency statistics and plot: a dot plot of the 10 features requested
# from textstat_frequency(), ordered by frequency, with the axes flipped so
# the feature labels read horizontally.
ggplot(
  textstat_frequency(ch_dfm, n = 10),
  aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()
#词云
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.0.3
# Demo cloud from `demoFreq` (presumably the sample data shipped with
# wordcloud2 -- it is not defined in this script).
wordcloud2(demoFreq)
# Keep only the rows whose second column (the frequency) exceeds 400.
# The trailing comma makes this a row filter: a single-index `[` on a data
# frame selects columns, not rows.
ff <- features_dfm_inaug[features_dfm_inaug[, 2] > 400, ]
wordcloud2(demoFreq, size = 1, shape = "pentagon")
wordcloud2(ff, size = 1, shape = "star")
# See the wordcloud2 help pages for options to tune the appearance.
# jiebaR: create a segmentation worker with the default settings.
tt <- worker()
# Segment a passage with the default dictionary. The hard line breaks are part
# of the string literal, so continuation lines must stay at column 0.
segment("宝玉听了,喜不自禁,笑道:“待我放下书,帮你来收拾。”黛玉道:“什么
书?”宝玉见问,慌的藏了,便说道:“不过是《中庸》《大学》。”黛玉道:“你
又在我跟前弄鬼。趁早儿给我瞧瞧,好多着呢!”宝玉道:“妹妹,要论你我是不
怕的,你看了好歹别告诉人。真是好文章!你要看了,连饭也不想吃呢!”一面说,
一面递过去。黛玉把花具放下,接书来瞧,从头看去,越看越爱,不顿饭时,已看
了好几出了。但觉词句警人,馀香满口。一面看了,只管出神,心内还默默记诵。
宝玉笑道:“妹妹,你说好不好?”黛玉笑着点头儿。宝玉笑道:“我就是个‘多
愁多病的身’,你就是那‘倾国倾城的貌’。”黛玉听了,不觉带腮连耳的通红了,
登时竖起两道似蹙非蹙的眉,瞪了一双似睁非睁的眼,桃腮带怒,薄面含嗔,指着
宝玉道:“你这该死的,胡说了!好好儿的,把这些淫词艳曲弄了来,说这些混帐
话,欺负我。我告诉舅舅、舅母去!”说到“欺负”二字,就把眼圈儿红了,转身
就走。宝玉急了,忙向前拦住道:“好妹妹,千万饶我这一遭儿罢!要有心欺负你,
明儿我掉在池子里,叫个癞头鼋吃了去,变个大忘八,等你明儿做了‘一品夫人’
病老归西的时候儿,我往你坟上替你驼一辈子碑去。”说的黛玉“扑嗤”的一声笑
了,一面揉着眼,一面笑道:“一般唬的这么个样儿,还只管胡说。呸!原来也是
个‘银样蜡枪头’。”宝玉听了,笑道:“你说说,你这个呢?我也告诉去。”黛
玉笑道:“你说你会‘过目成诵’,难道我就不能‘一目十行’了?”宝玉一面收
书,一面笑道:“正经快把花儿埋了罢,别提那些个了。”二人便收拾落花。", tt)
## [1] "宝玉" "听" "了" "喜不自禁" "笑道" "待"
## [7] "我" "放下" "书" "帮" "你" "来"
## [13] "收拾" "黛" "玉" "道" "什么" "书"
## [19] "宝玉" "见" "问" "慌" "的" "藏"
## [25] "了" "便" "说道" "不过" "是" "中庸"
## [31] "大学" "黛" "玉" "道" "你" "又"
## [37] "在" "我" "跟前" "弄鬼" "趁早" "儿"
## [43] "给" "我" "瞧瞧" "好" "多着呢" "宝玉"
## [49] "道" "妹妹" "要论" "你" "我" "是"
## [55] "不" "怕" "的" "你" "看" "了"
## [61] "好歹" "别" "告诉" "人" "真是" "好"
## [67] "文章" "你" "要" "看" "了" "连饭"
## [73] "也" "不想" "吃呢" "一面" "说" "一面"
## [79] "递过去" "黛玉" "把" "花具" "放下" "接书来"
## [85] "瞧" "从头" "看去" "越" "看" "越"
## [91] "爱" "不" "顿饭" "时" "已" "看"
## [97] "了" "好几" "出" "了" "但觉" "词句"
## [103] "警人" "馀" "香" "满口" "一面" "看"
## [109] "了" "只管" "出神" "心内" "还" "默默"
## [115] "记诵" "宝玉" "笑道" "妹妹" "你" "说"
## [121] "好不好" "黛" "玉" "笑" "着" "点头"
## [127] "儿" "宝玉" "笑道" "我" "就是" "个"
## [133] "多" "愁多" "病的身" "你" "就是" "那"
## [139] "倾国倾城" "的貌" "黛" "玉" "听" "了"
## [145] "不觉" "带" "腮" "连" "耳" "的"
## [151] "通红" "了" "登时" "竖起" "两道" "似"
## [157] "蹙" "非" "蹙" "的" "眉" "瞪"
## [163] "了" "一双" "似睁非" "睁" "的" "眼"
## [169] "桃腮带" "怒" "薄面" "含" "嗔" "指着"
## [175] "宝玉" "道" "你" "这" "该死" "的"
## [181] "胡说" "了" "好好儿" "的" "把" "这些"
## [187] "淫词艳曲" "弄" "了" "来" "说" "这些"
## [193] "混帐" "话" "欺负" "我" "我" "告诉"
## [199] "舅舅" "舅母" "去" "说" "到" "欺负"
## [205] "二" "字" "就" "把" "眼圈儿" "红"
## [211] "了" "转身" "就" "走" "宝玉" "急"
## [217] "了" "忙" "向前" "拦住" "道" "好"
## [223] "妹妹" "千万" "饶" "我" "这" "一遭"
## [229] "儿" "罢" "要" "有心" "欺负" "你"
## [235] "明儿" "我" "掉" "在" "池子" "里"
## [241] "叫个" "癞头" "鼋" "吃" "了" "去"
## [247] "变个" "大忘八" "等" "你" "明儿" "做了"
## [253] "一品夫人" "病老" "归西" "的" "时候" "儿"
## [259] "我往" "你" "坟" "上" "替" "你"
## [265] "驼" "一辈子" "碑" "去" "说" "的"
## [271] "黛" "玉" "扑嗤" "的" "一声" "笑"
## [277] "了" "一面" "揉" "着眼" "一面" "笑道"
## [283] "一般" "唬" "的" "这么" "个" "样儿"
## [289] "还" "只管" "胡说" "呸" "原来" "也"
## [295] "是" "个" "银" "样" "蜡" "枪头"
## [301] "宝玉" "听" "了" "笑道" "你" "说"
## [307] "说" "你" "这个" "呢" "我" "也"
## [313] "告诉" "去" "黛" "玉笑道" "你" "说"
## [319] "你" "会" "过目成诵" "难道" "我" "就"
## [325] "不能" "一目十行" "了" "宝玉" "一面" "收"
## [331] "书" "一面" "笑道" "正经" "快" "把"
## [337] "花儿" "埋" "了" "罢" "别提" "那些"
## [343] "个" "了" "二人" "便" "收拾" "落花"
# Register two four-character expressions as user words on the worker so that
# the next segmentation run keeps each of them as a single token.
newword <- c("似蹙非蹙", "似睁非睁")
new_user_word(tt, newword)
## [1] TRUE
# Segment the same passage again, now that the user words have been added to
# the worker. The hard line breaks are part of the string literal.
segment(code = "宝玉听了,喜不自禁,笑道:“待我放下书,帮你来收拾。”黛玉道:“什么
书?”宝玉见问,慌的藏了,便说道:“不过是《中庸》《大学》。”黛玉道:“你
又在我跟前弄鬼。趁早儿给我瞧瞧,好多着呢!”宝玉道:“妹妹,要论你我是不
怕的,你看了好歹别告诉人。真是好文章!你要看了,连饭也不想吃呢!”一面说,
一面递过去。黛玉把花具放下,接书来瞧,从头看去,越看越爱,不顿饭时,已看
了好几出了。但觉词句警人,馀香满口。一面看了,只管出神,心内还默默记诵。
宝玉笑道:“妹妹,你说好不好?”黛玉笑着点头儿。宝玉笑道:“我就是个‘多
愁多病的身’,你就是那‘倾国倾城的貌’。”黛玉听了,不觉带腮连耳的通红了,
登时竖起两道似蹙非蹙的眉,瞪了一双似睁非睁的眼,桃腮带怒,薄面含嗔,指着
宝玉道:“你这该死的,胡说了!好好儿的,把这些淫词艳曲弄了来,说这些混帐
话,欺负我。我告诉舅舅、舅母去!”说到“欺负”二字,就把眼圈儿红了,转身
就走。宝玉急了,忙向前拦住道:“好妹妹,千万饶我这一遭儿罢!要有心欺负你,
明儿我掉在池子里,叫个癞头鼋吃了去,变个大忘八,等你明儿做了‘一品夫人’
病老归西的时候儿,我往你坟上替你驼一辈子碑去。”说的黛玉“扑嗤”的一声笑
了,一面揉着眼,一面笑道:“一般唬的这么个样儿,还只管胡说。呸!原来也是
个‘银样蜡枪头’。”宝玉听了,笑道:“你说说,你这个呢?我也告诉去。”黛
玉笑道:“你说你会‘过目成诵’,难道我就不能‘一目十行’了?”宝玉一面收
书,一面笑道:“正经快把花儿埋了罢,别提那些个了。”二人便收拾落花。", jiebar = tt)
## [1] "宝玉" "听" "了" "喜不自禁" "笑道" "待"
## [7] "我" "放下" "书" "帮" "你" "来"
## [13] "收拾" "黛" "玉" "道" "什么" "书"
## [19] "宝玉" "见" "问" "慌" "的" "藏"
## [25] "了" "便" "说道" "不过" "是" "中庸"
## [31] "大学" "黛" "玉" "道" "你" "又"
## [37] "在" "我" "跟前" "弄鬼" "趁早" "儿"
## [43] "给" "我" "瞧瞧" "好" "多着呢" "宝玉"
## [49] "道" "妹妹" "要论" "你" "我" "是"
## [55] "不" "怕" "的" "你" "看" "了"
## [61] "好歹" "别" "告诉" "人" "真是" "好"
## [67] "文章" "你" "要" "看" "了" "连饭"
## [73] "也" "不想" "吃呢" "一面" "说" "一面"
## [79] "递过去" "黛玉" "把" "花具" "放下" "接书来"
## [85] "瞧" "从头" "看去" "越" "看" "越"
## [91] "爱" "不" "顿饭" "时" "已" "看"
## [97] "了" "好几" "出" "了" "但觉" "词句"
## [103] "警人" "馀" "香" "满口" "一面" "看"
## [109] "了" "只管" "出神" "心内" "还" "默默"
## [115] "记诵" "宝玉" "笑道" "妹妹" "你" "说"
## [121] "好不好" "黛" "玉" "笑" "着" "点头"
## [127] "儿" "宝玉" "笑道" "我" "就是" "个"
## [133] "多" "愁多" "病的身" "你" "就是" "那"
## [139] "倾国倾城" "的貌" "黛" "玉" "听" "了"
## [145] "不觉" "带" "腮" "连" "耳" "的"
## [151] "通红" "了" "登时" "竖起" "两道" "似蹙非蹙"
## [157] "的" "眉" "瞪" "了" "一双" "似睁非睁"
## [163] "的" "眼" "桃腮带" "怒" "薄面" "含"
## [169] "嗔" "指着" "宝玉" "道" "你" "这"
## [175] "该死" "的" "胡说" "了" "好好儿" "的"
## [181] "把" "这些" "淫词艳曲" "弄" "了" "来"
## [187] "说" "这些" "混帐" "话" "欺负" "我"
## [193] "我" "告诉" "舅舅" "舅母" "去" "说"
## [199] "到" "欺负" "二" "字" "就" "把"
## [205] "眼圈儿" "红" "了" "转身" "就" "走"
## [211] "宝玉" "急" "了" "忙" "向前" "拦住"
## [217] "道" "好" "妹妹" "千万" "饶" "我"
## [223] "这" "一遭" "儿" "罢" "要" "有心"
## [229] "欺负" "你" "明儿" "我" "掉" "在"
## [235] "池子" "里" "叫个" "癞头" "鼋" "吃"
## [241] "了" "去" "变个" "大忘八" "等" "你"
## [247] "明儿" "做了" "一品夫人" "病老" "归西" "的"
## [253] "时候" "儿" "我往" "你" "坟" "上"
## [259] "替" "你" "驼" "一辈子" "碑" "去"
## [265] "说" "的" "黛" "玉" "扑嗤" "的"
## [271] "一声" "笑" "了" "一面" "揉" "着眼"
## [277] "一面" "笑道" "一般" "唬" "的" "这么"
## [283] "个" "样儿" "还" "只管" "胡说" "呸"
## [289] "原来" "也" "是" "个" "银" "样"
## [295] "蜡" "枪头" "宝玉" "听" "了" "笑道"
## [301] "你" "说" "说" "你" "这个" "呢"
## [307] "我" "也" "告诉" "去" "黛" "玉笑道"
## [313] "你" "说" "你" "会" "过目成诵" "难道"
## [319] "我" "就" "不能" "一目十行" "了" "宝玉"
## [325] "一面" "收" "书" "一面" "笑道" "正经"
## [331] "快" "把" "花儿" "埋" "了" "罢"
## [337] "别提" "那些" "个" "了" "二人" "便"
## [343] "收拾" "落花"
# Segment a second sample text (space-separated phrases) with the same worker.
segment(code = "当赤道留住雪花眼泪融掉细沙你肯珍惜我吗 如浮云陪伴天马公演一个童话 当配乐遗下结他画布忘掉了画 请想起我如绿草当这地球没有花 ", jiebar = tt)
## [1] "当" "赤道" "留住" "雪花" "眼泪" "融掉" "细沙" "你" "肯" "珍惜"
## [11] "我" "吗" "如" "浮云" "陪伴" "天马" "公演" "一个" "童话" "当"
## [21] "配乐" "遗下" "结他" "画布" "忘掉" "了" "画" "请" "想起" "我"
## [31] "如" "绿草" "当" "这" "地球" "没有" "花"
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.