## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: tidytext
## Loading required package: jiebaR
## Loading required package: jiebaRD
## Loading required package: gutenbergr
## Loading required package: stringr
## Loading required package: wordcloud2
## Loading required package: ggplot2
## Loading required package: tidyr
## Loading required package: scales
本篇擷取中央社的報導
斷詞引擎建立完成後,可以使用不同方式進行斷詞
## [1] "近幾年來" "良心" "投資" "逐漸"
## [5] "形成" "一股" "風潮" "投資人"
## [9] "對於" "企業" "社會" "責任"
## [13] "愈來愈" "重視" "環境保護" "社會"
## [17] "責任" "與" "公司" "治理"
## [21] "加總" "的" "ESG" "概念"
## [25] "也" "成為" "當今" "華爾街"
## [29] "和" "法人" "投資" "圈"
## [33] "最" "熱門" "的熱" "搜"
## [37] "關鍵" "英文" "縮寫" "什麼"
## [41] "是" "ESG" "就是" "環境保護"
## [45] "Environmental" "社會" "責任" "Social"
## [49] "與" "公司" "治理" "Governance"
## [53] "這" "3" "個" "英文字"
## [57] "的" "縮寫" "從政" "府"
## [61] "推動" "限塑" "政策" "上市公司"
## [65] "需編" "製" "與" "申報"
## [69] "企業" "社會" "責任" "報告書"
## [73] "民間團體" "自動" "發起" "淨灘"
## [77] "活動" "等" "背後" "都"
## [81] "蘊含" "著" "ESG" "的"
## [85] "意涵" "ESG" "投資" "可以"
## [89] "回溯到" "2006" "年" "聯合國"
## [93] "責任" "投資" "原則" "UNPRI"
## [97] "的" "成立" "致力於" "推動"
## [101] "全球" "各大" "投資" "機構"
## [105] "在" "投資決策" "過程" "中"
## [109] "必須" "納入" "ESG" "因子"
## [113] "並" "協助" "PRI" "的"
## [117] "簽署" "國家" "全面" "提升"
## [121] "責任" "投資" "的" "能力"
## [125] "戮力" "經營" "本業" "的"
## [129] "企業" "若" "不" "重視"
## [133] "ESG" "就算" "有" "好"
## [137] "的" "本益比" "好" "的"
## [141] "成長" "也" "會" "影響"
## [145] "公司" "價值" "尤其" "是"
## [149] "近年來" "環境" "變遷" "等"
## [153] "無差別" "衝擊" "影響" "企業"
## [157] "經營" "表現" "投資人" "轉向"
## [161] "從" "ESG" "SDGs" "聯合國"
## [165] "永續" "發展" "目標" "等"
## [169] "評價" "模式" "尋找" "投資"
## [173] "標的"
## [1] TRUE
## [1] "近幾年來" "良心投資" "逐漸"
## [4] "形成" "一股" "風潮"
## [7] "投資人" "對於" "企業社會責任"
## [10] "愈來愈" "重視" "環境保護"
## [13] "社會責任" "與" "公司治理"
## [16] "加總" "的" "ESG"
## [19] "概念" "也" "成為"
## [22] "當今" "華爾街" "和"
## [25] "法人" "投資圈" "最"
## [28] "熱門" "的" "熱搜"
## [31] "關鍵" "英文" "縮寫"
## [34] "什麼" "是" "ESG"
## [37] "就是" "環境保護" "Environmental"
## [40] "社會責任" "Social" "與"
## [43] "公司治理" "Governance" "這"
## [46] "3" "個" "英文字"
## [49] "的" "縮寫" "從政"
## [52] "府" "推動" "限塑"
## [55] "政策" "上市公司" "需編"
## [58] "製" "與" "申報"
## [61] "企業社會責任報告書" "民間團體" "自動"
## [64] "發起" "淨灘" "活動"
## [67] "等" "背後" "都"
## [70] "蘊含" "著" "ESG"
## [73] "的" "意涵" "ESG"
## [76] "投資" "可以" "回溯到"
## [79] "2006" "年" "聯合國"
## [82] "責任" "投資" "原則"
## [85] "UNPRI" "的" "成立"
## [88] "致力於" "推動" "全球"
## [91] "各大" "投資" "機構"
## [94] "在" "投資決策" "過程"
## [97] "中" "必須" "納入"
## [100] "ESG" "因子" "並"
## [103] "協助" "PRI" "的"
## [106] "簽署" "國家" "全面"
## [109] "提升" "責任" "投資"
## [112] "的" "能力" "戮力"
## [115] "經營" "本業" "的"
## [118] "企業" "若" "不"
## [121] "重視" "ESG" "就算"
## [124] "有" "好" "的"
## [127] "本益比" "好" "的"
## [130] "成長" "也" "會"
## [133] "影響" "公司" "價值"
## [136] "尤其" "是" "近年來"
## [139] "環境" "變遷" "等"
## [142] "無差別" "衝擊" "影響"
## [145] "企業" "經營" "表現"
## [148] "投資人" "轉向" "從"
## [151] "ESG" "SDGs" "聯合國"
## [154] "永續" "發展" "目標"
## [157] "等" "評價" "模式"
## [160] "尋找" "投資" "標的"
## [1] "近幾年來" "良心" "投資" "逐漸"
## [5] "形成" "一股" "風潮" "投資人"
## [9] "對於" "企業" "社會" "責任"
## [13] "愈來愈" "重視" "環境保護" "社會"
## [17] "責任" "與" "公司治理" "加總"
## [21] "的" "ESG" "概念" "也"
## [25] "成為" "當今" "華爾街" "和"
## [29] "法人" "投資" "圈" "最"
## [33] "熱門" "的熱" "搜" "關鍵"
## [37] "英文" "縮寫" "什麼" "是"
## [41] "ESG" "就是" "環境保護" "Environmental"
## [45] "社會" "責任" "Social" "與"
## [49] "公司治理" "Governance" "這" "3"
## [53] "個" "英文字" "的" "縮寫"
## [57] "從政" "府" "推動" "限塑"
## [61] "政策" "上市公司" "需編" "製"
## [65] "與" "申報" "企業" "社會"
## [69] "責任" "報告書" "民間團體" "自動"
## [73] "發起" "淨灘" "活動" "等"
## [77] "背後" "都" "蘊含" "著"
## [81] "ESG" "的" "意涵" "ESG"
## [85] "投資" "可以" "回溯到" "2006"
## [89] "年" "聯合國" "責任" "投資"
## [93] "原則" "UNPRI" "的" "成立"
## [97] "致力於" "推動" "全球" "各大"
## [101] "投資" "機構" "在" "投資決策"
## [105] "過程" "中" "必須" "納入"
## [109] "ESG" "因子" "並" "協助"
## [113] "PRI" "的" "簽署" "國家"
## [117] "全面" "提升" "責任" "投資"
## [121] "的" "能力" "戮力" "經營"
## [125] "本業" "的" "企業" "若"
## [129] "不" "重視" "ESG" "就算"
## [133] "有" "好" "的" "本益比"
## [137] "好" "的" "成長" "也"
## [141] "會" "影響" "公司" "價值"
## [145] "尤其" "是" "近年來" "環境"
## [149] "變遷" "等" "無差別" "衝擊"
## [153] "影響" "企業" "經營" "表現"
## [157] "投資人" "轉向" "從" "ESG"
## [161] "SDGs" "聯合國" "永續" "發展"
## [165] "目標" "等" "評價" "模式"
## [169] "尋找" "投資" "標的"
# Segment the article with the jieba worker, then strip a hand-picked set of
# stop words from the resulting token vector and print what is left.
# NOTE(review): this global `stop_words` shadows the `stop_words` dataset that
# tidytext provides -- confirm nothing later relies on the tidytext one.
stop_words <- c("對於", "愈來愈", "就是", "什麼")
tokens <- segment(chinese_text, jieba_tokenizer)
result <- tokens %>%
  filter_segment(stop_words)
result
## [1] "近幾年來" "良心" "投資" "逐漸"
## [5] "形成" "一股" "風潮" "投資人"
## [9] "企業" "社會" "責任" "重視"
## [13] "環境保護" "社會" "責任" "與"
## [17] "公司治理" "加總" "的" "ESG"
## [21] "概念" "也" "成為" "當今"
## [25] "華爾街" "和" "法人" "投資"
## [29] "圈" "最" "熱門" "的熱"
## [33] "搜" "關鍵" "英文" "縮寫"
## [37] "是" "ESG" "環境保護" "Environmental"
## [41] "社會" "責任" "Social" "與"
## [45] "公司治理" "Governance" "這" "3"
## [49] "個" "英文字" "的" "縮寫"
## [53] "從政" "府" "推動" "限塑"
## [57] "政策" "上市公司" "需編" "製"
## [61] "與" "申報" "企業" "社會"
## [65] "責任" "報告書" "民間團體" "自動"
## [69] "發起" "淨灘" "活動" "等"
## [73] "背後" "都" "蘊含" "著"
## [77] "ESG" "的" "意涵" "ESG"
## [81] "投資" "可以" "回溯到" "2006"
## [85] "年" "聯合國" "責任" "投資"
## [89] "原則" "UNPRI" "的" "成立"
## [93] "致力於" "推動" "全球" "各大"
## [97] "投資" "機構" "在" "投資決策"
## [101] "過程" "中" "必須" "納入"
## [105] "ESG" "因子" "並" "協助"
## [109] "PRI" "的" "簽署" "國家"
## [113] "全面" "提升" "責任" "投資"
## [117] "的" "能力" "戮力" "經營"
## [121] "本業" "的" "企業" "若"
## [125] "不" "重視" "ESG" "就算"
## [129] "有" "好" "的" "本益比"
## [133] "好" "的" "成長" "也"
## [137] "會" "影響" "公司" "價值"
## [141] "尤其" "是" "近年來" "環境"
## [145] "變遷" "等" "無差別" "衝擊"
## [149] "影響" "企業" "經營" "表現"
## [153] "投資人" "轉向" "從" "ESG"
## [157] "SDGs" "聯合國" "永續" "發展"
## [161] "目標" "等" "評價" "模式"
## [165] "尋找" "投資" "標的"
## [1] "近幾年來" "良心" "投資" "逐漸"
## [5] "形成" "一股" "風潮" "投資人"
## [9] "對於" "企業" "社會" "責任"
## [13] "愈來愈" "重視" "環境保護" "社會"
## [17] "責任" "與" "公司" "治理"
## [21] "加總" "的" "ESG" "概念"
## [25] "也" "成為" "當今" "華爾街"
## [29] "和" "法人" "投資" "圈"
## [33] "最" "熱門" "的熱" "搜"
## [37] "關鍵" "英文" "縮寫" "什麼"
## [41] "是" "ESG" "就是" "環境保護"
## [45] "Environmental" "社會" "責任" "Social"
## [49] "與" "公司" "治理" "Governance"
## [53] "這" "3" "個" "英文字"
## [57] "的" "縮寫" "從政" "府"
## [61] "推動" "限塑" "政策" "上市公司"
## [65] "需編" "製" "與" "申報"
## [69] "企業" "社會" "責任" "報告書"
## [73] "民間團體" "自動" "發起" "淨灘"
## [77] "活動" "等" "背後" "都"
## [81] "蘊含" "著" "ESG" "的"
## [85] "意涵" "ESG" "投資" "可以"
## [89] "回溯到" "2006" "年" "聯合國"
## [93] "責任" "投資" "原則" "UNPRI"
## [97] "的" "成立" "致力於" "推動"
## [101] "全球" "各大" "投資" "機構"
## [105] "在" "投資決策" "過程" "中"
## [109] "必須" "納入" "ESG" "因子"
## [113] "並" "協助" "PRI" "的"
## [117] "簽署" "國家" "全面" "提升"
## [121] "責任" "投資" "的" "能力"
## [125] "戮力" "經營" "本業" "的"
## [129] "企業" "若" "不" "重視"
## [133] "ESG" "就算" "有" "好"
## [137] "的" "本益比" "好" "的"
## [141] "成長" "也" "會" "影響"
## [145] "公司" "價值" "尤其" "是"
## [149] "近年來" "環境" "變遷" "等"
## [153] "無差別" "衝擊" "影響" "企業"
## [157] "經營" "表現" "投資人" "轉向"
## [161] "從" "ESG" "SDGs" "聯合國"
## [165] "永續" "發展" "目標" "等"
## [169] "評價" "模式" "尋找" "投資"
## [173] "標的"
## [1] "近幾年來" "良心" "投資" "逐漸"
## [5] "形成" "一股" "風潮" "投資人"
## [9] "企業" "社會" "責任" "重視"
## [13] "環境保護" "社會" "責任" "公司治理"
## [17] "加總" "ESG" "概念" "成為"
## [21] "當今" "華爾街" "法人" "投資"
## [25] "熱門" "的熱" "關鍵" "英文"
## [29] "縮寫" "ESG" "環境保護" "Environmental"
## [33] "社會" "責任" "Social" "公司治理"
## [37] "Governance" "英文字" "縮寫" "從政"
## [41] "推動" "限塑" "政策" "上市公司"
## [45] "需編" "申報" "企業" "社會"
## [49] "責任" "報告書" "民間團體" "自動"
## [53] "發起" "淨灘" "活動" "背後"
## [57] "蘊含" "ESG" "意涵" "ESG"
## [61] "投資" "可以" "回溯到" "2006"
## [65] "聯合國" "責任" "投資" "原則"
## [69] "UNPRI" "成立" "致力於" "推動"
## [73] "全球" "各大" "投資" "機構"
## [77] "投資決策" "過程" "必須" "納入"
## [81] "ESG" "因子" "協助" "PRI"
## [85] "簽署" "國家" "全面" "提升"
## [89] "責任" "投資" "能力" "戮力"
## [93] "經營" "本業" "企業" "重視"
## [97] "ESG" "就算" "本益比" "成長"
## [101] "影響" "公司" "價值" "尤其"
## [105] "近年來" "環境" "變遷" "無差別"
## [109] "衝擊" "影響" "企業" "經營"
## [113] "表現" "投資人" "轉向" "ESG"
## [117] "SDGs" "聯合國" "永續" "發展"
## [121] "目標" "評價" "模式" "尋找"
## [125] "投資" "標的"
# Sample English lyrics, one line of text per vector element, used as the
# input for the English tokenization example.
english_text <- c(
  "Because of you",
  "I never stray too far from the sidewalk",
  "Because of you",
  "I learned to play on the safe side so I don’t get hurt"
)
english_text
## [1] "Because of you"
## [2] "I never stray too far from the sidewalk"
## [3] "Because of you"
## [4] "I learned to play on the safe side so I don’t get hurt"
## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 Because of you
## 2 2 I never stray too far from the sidewalk
## 3 3 Because of you
## 4 4 I learned to play on the safe side so I don’t get hurt
## # A tibble: 27 x 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 of
## 3 1 you
## 4 2 i
## 5 2 never
## 6 2 stray
## 7 2 too
## 8 2 far
## 9 2 from
## 10 2 the
## # … with 17 more rows
根據文件說明:“If a function, should take a character vector and return a list of character vectors of the same length”得知,我們可以自訂斷詞函式,其輸入(input)為字元向量(character vector),輸出(output)則為與輸入等長的 list
# Put the Chinese paragraphs into a tibble, one row per paragraph, so that
# tidytext can tokenize them.
chinese_text_df <- tibble(
  # seq_along() stays correct for empty input, where 1:length(x) gives c(1, 0).
  paragraph = seq_along(chinese_text),
  text = chinese_text
)
# `token` is given a function here: it receives the text column as a character
# vector and has to return a list of token vectors of the same length.
chinese_text_df %>%
  unnest_tokens(word, text, token = chinese_tokenizer)
## # A tibble: 206 x 2
## paragraph word
## <int> <chr>
## 1 1 近幾年來
## 2 1 良心
## 3 1 投資
## 4 1 逐漸
## 5 1 形成
## 6 1 一股
## 7 1 風潮
## 8 1 投資人
## 9 1 對於
## 10 1 企業
## # … with 196 more rows
## [[1]]
## [1] "近幾年來「良心投資」逐漸形成一股風潮,投資人對於企業社會責任愈來愈重視,環境保護、社會責任與公司治理加總的ESG概念,也成為當今華爾街和法人投資圈最熱門的熱搜關鍵英文縮寫"
## [2] "什麼是「ESG」?就是環境保護(Environmental)、社會責任(Social)與公司治理(Governance)這3個英文字的縮寫,從政府推動限塑政策、上市公司需編製與申報企業社會責任報告書、民間團體自動發起淨灘活動等,背後都蘊含著ESG的意涵"
## [3] "ESG投資可以回溯到2006年聯合國責任投資原則(UNPRI)的成立,致力於推動全球各大投資機構在投資決策過程中必須納入ESG因子,並協助PRI的簽署國家全面提升責任投資的能力"
## [4] "戮力經營本業的企業,若不重視ESG,就算有好的本益比、好的成長,也會影響公司價值,尤其是近年來環境變遷等「無差別衝擊」影響企業經營表現,投資人轉向從ESG、SDGs(聯合國永續發展目標)等評價模式尋找投資標的"
##
## [[2]]
## [1] "為什麼ESG對企業和投資人來說會麼這麼重要?施宜君指出,通常重視ESG的企業,信用評等較為良好,且公司財務較為穩健,往往較有機會可以帶來較佳報酬率以及穩定性,在後疫情時代,採用ESG因子來謹慎選擇股票、債券等標的,也可望提前避開問題企業、進一步防範投資風險"
## [2] "國泰投信認為,ESG就像是企業永續的KPI,ESG做得好的公司往往能贏得社會長期肯定,對公司永續經營、企業長線獲利等都能帶來挹注,連帶也吸引海內外資金持續流入ESG市場"
利用 base R 的 strsplit,以“。”進行斷句
## [[1]]
## [1] "近幾年來" "良心" "投資" "逐漸" "形成" "一股"
## [7] "風潮" "投資人" "對於" "企業" "社會" "責任"
## [13] "愈來愈" "重視" "環境保護" "社會" "責任" "公司治理"
## [19] "加總" "ESG" "概念" "成為" "當今" "華爾街"
## [25] "法人" "投資" "熱門" "的熱" "關鍵" "英文"
## [31] "縮寫"
##
## [[2]]
## [1] "ESG" "就是" "環境保護" "Environmental"
## [5] "社會" "責任" "Social" "公司治理"
## [9] "Governance" "英文字" "縮寫" "從政"
## [13] "推動" "限塑" "政策" "上市公司"
## [17] "需編" "申報" "企業" "社會"
## [21] "責任" "報告書" "民間團體" "自動"
## [25] "發起" "淨灘" "活動" "背後"
## [29] "蘊含" "ESG" "意涵"
##
## [[3]]
## [1] "ESG" "投資" "可以" "回溯到" "2006" "聯合國"
## [7] "責任" "投資" "原則" "UNPRI" "成立" "致力於"
## [13] "推動" "全球" "各大" "投資" "機構" "投資決策"
## [19] "過程" "必須" "納入" "ESG" "因子" "協助"
## [25] "PRI" "簽署" "國家" "全面" "提升" "責任"
## [31] "投資" "能力"
##
## [[4]]
## [1] "戮力" "經營" "本業" "企業" "重視" "ESG" "就算" "本益比"
## [9] "成長" "影響" "公司" "價值" "尤其" "近年來" "環境" "變遷"
## [17] "無差別" "衝擊" "影響" "企業" "經營" "表現" "投資人" "轉向"
## [25] "ESG" "SDGs" "聯合國" "永續" "發展" "目標" "評價" "模式"
## [33] "尋找" "投資" "標的"
Gutenberg free eBooks
Also, various Chinese books can be found at the link below:
紅樓夢 by Xueqin Cao:
# Download "Dream of the Red Chamber" (Project Gutenberg ID 24264), drop the
# rows whose text is empty, and remove repeated lines.
red <- gutenberg_download(24264) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
紅樓夢每章節的開頭都會有“第X回”的詞,但有些僅以“第X回”表示,有些則在“第X回”後面加上章節名稱,例如:「第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀」
根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
## # A tibble: 20 x 3
## gutenberg_id text chapter
## <int> <chr> <int>
## 1 24264 "第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀" 1
## 2 24264 "------------------------------------------------------… 1
## 3 24264 "此開卷第一回也.作者自云:因曾歷過一番夢幻之后,故將真事隱去,"… 1
## 4 24264 "而借\"通靈\"之說,撰此《石頭記》一書也.故曰\"甄士隱\"云云.但書中所記"… 1
## 5 24264 "何事何人?自又云:“今風塵碌碌,一事無成,忽念及當日所有之女子,一"… 1
## 6 24264 "一細考較去,覺其行止見識,皆出于我之上.何我堂堂須眉,誠不若彼裙釵"… 1
## 7 24264 "哉?實愧則有余,悔又無益之大無可如何之日也!當此,則自欲將已往所賴"… 1
## 8 24264 "天恩祖德,錦衣紈褲之時,飫甘饜肥之日,背父兄教育之恩,負師友規談之"… 1
## 9 24264 "德,以至今日一技無成,半生潦倒之罪,編述一集,以告天下人:我之罪固"… 1
## 10 24264 "不免,然閨閣中本自歷歷有人,万不可因我之不肖,自護己短,一并使其泯"… 1
## 11 24264 "滅也.雖今日之茅椽蓬牖,瓦灶繩床,其晨夕風露,階柳庭花,亦未有妨我"… 1
## 12 24264 "之襟怀筆墨者.雖我未學,下筆無文,又何妨用假語村言,敷演出一段故事"… 1
## 13 24264 "來,亦可使閨閣昭傳,复可悅世之目,破人愁悶,不亦宜乎?\"故曰\"賈雨村"… 1
## 14 24264 "\"云云." 1
## 15 24264 " 此回中凡用“夢”用“幻”等字,是提醒閱者眼目,亦是此書立意本旨"… 1
## 16 24264 "." 1
## 17 24264 " 列位看官:你道此書從何而來?說起根由雖近荒唐,細按則深有趣味."… 1
## 18 24264 "待在下將此來歷注明,方使閱者了然不惑." 1
## 19 24264 " 原來女媧氏煉石補天之時,于大荒山無稽崖練成高經十二丈,方經二十"… 1
## 20 24264 "四丈頑石三万六千五百零一塊.媧皇氏只用了三万六千五百塊,只單單剩了"… 1
下載下來的書已經完成斷句
# Build the jieba worker with a user dictionary for the novel and a stop-word
# list. Both are given as relative file paths.
jieba_tokenizer <- worker(
  stop_word = "stop_words.txt",
  user = "dream_of_the_red_chamber_lexicon.traditional.dict"
)
# Custom tokenizer function: takes a character vector `t` and returns a list
# holding one token vector per element, each segmented with the global
# `jieba_tokenizer` worker.
red_tokenizer <- function(t) {
  lapply(t, function(sentence) segment(sentence, jieba_tokenizer))
}
## tibble [462,936 × 3] (S3: tbl_df/tbl/data.frame)
## $ gutenberg_id: int [1:462936] 24264 24264 24264 24264 24264 24264 24264 24264 24264 24264 ...
## $ chapter : int [1:462936] 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr [1:462936] "第一回" " " "甄士隱夢幻識通靈" " " ...
## # A tibble: 20 x 3
## gutenberg_id chapter word
## <int> <int> <chr>
## 1 24264 1 第一回
## 2 24264 1
## 3 24264 1 甄士隱夢幻識通靈
## 4 24264 1
## 5 24264 1 賈雨村
## 6 24264 1 風塵
## 7 24264 1 怀
## 8 24264 1 閨秀
## 9 24264 1 此
## 10 24264 1 開卷
## 11 24264 1 第一回
## 12 24264 1 也
## 13 24264 1 作者
## 14 24264 1 自云
## 15 24264 1 因
## 16 24264 1 曾
## 17 24264 1 歷
## 18 24264 1 過
## 19 24264 1 一番
## 20 24264 1 夢幻
# Count how often each multi-character token occurs and keep the ones seen
# more than 10 times, most frequent first.
tokens_count <- tokens %>%
  # Use the column through the data mask instead of `.$word`; the latter reads
  # the whole piped data frame and fails as soon as the input is grouped.
  filter(nchar(word) > 1) %>%
  # count() returns an ungrouped table, so no regrouping message is emitted.
  count(word, name = "sum") %>%
  filter(sum > 10) %>%
  arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 寶玉 3751
## 2 笑道 1893
## 3 什么 1738
## 4 鳳姐 1652
## 5 賈母 1611
## 6 一個 1392
## 7 我們 1165
## 8 那里 1136
## 9 襲人 1126
## 10 黛玉 1055
## 11 寶釵 1018
## 12 王夫人 1005
## 13 如今 967
## 14 怎么 963
## 15 你們 952
## 16 知道 932
## 17 說道 927
## 18 起來 920
## 19 出來 907
## 20 賈政 904
以句子數量來計算
# Per-chapter row counts of `red` (labelled "sentences") and of `tokens`
# (labelled "words"), stacked into one long table and drawn as one line per
# series.
plot <-
  bind_rows(
    red %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "sentences"),
    tokens %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "words")
  ) %>%
  # Colour alone separates the two series. geom_line() has no fill aesthetic,
  # so no fill mapping is given, and ggplot() needs no dplyr grouping.
  ggplot(aes(x = chapter, y = count, color = factor(type))) +
  geom_line() +
  ggtitle("各章節的句子總數") +
  xlab("章節") +
  ylab("句子數量") +
  # A font with CJK glyphs is required for the Chinese title and axis labels.
  theme(text = element_text(family = "Heiti TC Light"))
plot
# Relative frequency of every word in the first 80 chapters versus the last
# 40. The result has one row per word, with the first-80 share in its own
# column and the last-40 share in `proportion`.
frequency <- tokens %>%
  mutate(part = ifelse(chapter <= 80, "First 80", "Last 40")) %>%
  # Use the column through the data mask instead of `.$word`, which reads the
  # whole piped data frame and fails on grouped input.
  filter(nchar(word) > 1) %>%
  # Keep the first run of characters that are not digits, lowercase ASCII
  # letters or apostrophes (NA when the word has none).
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  # Keep the leading run of characters that are not Chinese numerals; words
  # that start with a numeral become NA.
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>%
  # Drop the grouping explicitly once the per-part share is computed.
  ungroup() %>%
  select(-n) %>%
  spread(part, proportion) %>%
  gather(part, proportion, `Last 40`)
frequency
## # A tibble: 43,276 x 4
## word `First 80` part proportion
## <chr> <dbl> <chr> <dbl>
## 1 阿房宮 0.00000621 Last 40 NA
## 2 阿膠 0.00000621 Last 40 NA
## 3 阿彌陀 0.0000124 Last 40 NA
## 4 阿彌陀佛 0.000168 Last 40 0.000131
## 5 阿物儿 0.0000249 Last 40 NA
## 6 呵呵 0.0000373 Last 40 NA
## 7 呵呵大笑 0.00000621 Last 40 NA
## 8 呵叱 0.00000621 Last 40 NA
## 9 啊呀 NA Last 40 0.0000263
## 10 哎喲 0.0000124 Last 40 NA
## # … with 43,266 more rows
# Scatter plot of each word's share in the last 40 chapters (x) against its
# share in the first 80 chapters (y) on log-log percent axes. Points are
# coloured by the absolute difference between the two shares, and the dashed
# line is geom_abline()'s default reference line.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `First 80`,
    color = abs(`First 80` - proportion)
  )
) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(
    aes(label = word),
    check_overlap = TRUE,
    vjust = 1.5,
    family = "Heiti TC Light"
  ) +
  labs(x = "Last 40", y = "First 80") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  theme(legend.position = "none")