Ch.0:套件安裝及載入

系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""

安裝需要的packages

# Packages this tutorial depends on.
packages <- c(
  "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr",
  "wordcloud2", "ggplot2", "tidyr", "scales", "knitr"
)
# Names of the packages that are already installed.
existing <- as.character(installed.packages()[, 1])
# Install only the packages that are missing, one at a time.
for (pkg in setdiff(packages, existing)) {
  install.packages(pkg)
}

載入packages

require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4

Ch.1:使用Jieba套件進行中文斷詞

本篇擷取中央社的報導

基本斷詞

建立文本

chinese_text <- "近幾年來「良心投資」逐漸形成一股風潮,投資人對於企業社會責任愈來愈重視,環境保護、社會責任與公司治理加總的ESG概念,也成為當今華爾街和法人投資圈最熱門的熱搜關鍵英文縮寫。什麼是「ESG」?就是環境保護(Environmental)、社會責任(Social)與公司治理(Governance)這3個英文字的縮寫,從政府推動限塑政策、上市公司需編製與申報企業社會責任報告書、民間團體自動發起淨灘活動等,背後都蘊含著ESG的意涵。ESG投資可以回溯到2006年聯合國責任投資原則(UNPRI)的成立,致力於推動全球各大投資機構在投資決策過程中必須納入ESG因子,並協助PRI的簽署國家全面提升責任投資的能力。戮力經營本業的企業,若不重視ESG,就算有好的本益比、好的成長,也會影響公司價值,尤其是近年來環境變遷等「無差別衝擊」影響企業經營表現,投資人轉向從ESG、SDGs(聯合國永續發展目標)等評價模式尋找投資標的。"

初始化斷詞引擎

# Create a segmentation worker with jiebaR's default settings.
jieba_tokenizer <- worker()

斷詞

斷詞引擎建立完成後,可以使用不同方式進行斷詞

segment(chinese_text, jieba_tokenizer)
##   [1] "近幾年來"      "良心"          "投資"          "逐漸"         
##   [5] "形成"          "一股"          "風潮"          "投資人"       
##   [9] "對於"          "企業"          "社會"          "責任"         
##  [13] "愈來愈"        "重視"          "環境保護"      "社會"         
##  [17] "責任"          "與"            "公司"          "治理"         
##  [21] "加總"          "的"            "ESG"           "概念"         
##  [25] "也"            "成為"          "當今"          "華爾街"       
##  [29] "和"            "法人"          "投資"          "圈"           
##  [33] "最"            "熱門"          "的熱"          "搜"           
##  [37] "關鍵"          "英文"          "縮寫"          "什麼"         
##  [41] "是"            "ESG"           "就是"          "環境保護"     
##  [45] "Environmental" "社會"          "責任"          "Social"       
##  [49] "與"            "公司"          "治理"          "Governance"   
##  [53] "這"            "3"             "個"            "英文字"       
##  [57] "的"            "縮寫"          "從政"          "府"           
##  [61] "推動"          "限塑"          "政策"          "上市公司"     
##  [65] "需編"          "製"            "與"            "申報"         
##  [69] "企業"          "社會"          "責任"          "報告書"       
##  [73] "民間團體"      "自動"          "發起"          "淨灘"         
##  [77] "活動"          "等"            "背後"          "都"           
##  [81] "蘊含"          "著"            "ESG"           "的"           
##  [85] "意涵"          "ESG"           "投資"          "可以"         
##  [89] "回溯到"        "2006"          "年"            "聯合國"       
##  [93] "責任"          "投資"          "原則"          "UNPRI"        
##  [97] "的"            "成立"          "致力於"        "推動"         
## [101] "全球"          "各大"          "投資"          "機構"         
## [105] "在"            "投資決策"      "過程"          "中"           
## [109] "必須"          "納入"          "ESG"           "因子"         
## [113] "並"            "協助"          "PRI"           "的"           
## [117] "簽署"          "國家"          "全面"          "提升"         
## [121] "責任"          "投資"          "的"            "能力"         
## [125] "戮力"          "經營"          "本業"          "的"           
## [129] "企業"          "若"            "不"            "重視"         
## [133] "ESG"           "就算"          "有"            "好"           
## [137] "的"            "本益比"        "好"            "的"           
## [141] "成長"          "也"            "會"            "影響"         
## [145] "公司"          "價值"          "尤其"          "是"           
## [149] "近年來"        "環境"          "變遷"          "等"           
## [153] "無差別"        "衝擊"          "影響"          "企業"         
## [157] "經營"          "表現"          "投資人"        "轉向"         
## [161] "從"            "ESG"           "SDGs"          "聯合國"       
## [165] "永續"          "發展"          "目標"          "等"           
## [169] "評價"          "模式"          "尋找"          "投資"         
## [173] "標的"

加入自訂詞彙進行斷詞

以參數形式手動加入

# 動態新增自訂詞彙
new_user_word(jieba_tokenizer, c("社會責任","企業社會責任", "企業社會責任報告書","良心投資","投資圈","熱搜","公司治理"))
## [1] TRUE
segment(chinese_text, jieba_tokenizer)
##   [1] "近幾年來"           "良心投資"           "逐漸"              
##   [4] "形成"               "一股"               "風潮"              
##   [7] "投資人"             "對於"               "企業社會責任"      
##  [10] "愈來愈"             "重視"               "環境保護"          
##  [13] "社會責任"           "與"                 "公司治理"          
##  [16] "加總"               "的"                 "ESG"               
##  [19] "概念"               "也"                 "成為"              
##  [22] "當今"               "華爾街"             "和"                
##  [25] "法人"               "投資圈"             "最"                
##  [28] "熱門"               "的"                 "熱搜"              
##  [31] "關鍵"               "英文"               "縮寫"              
##  [34] "什麼"               "是"                 "ESG"               
##  [37] "就是"               "環境保護"           "Environmental"     
##  [40] "社會責任"           "Social"             "與"                
##  [43] "公司治理"           "Governance"         "這"                
##  [46] "3"                  "個"                 "英文字"            
##  [49] "的"                 "縮寫"               "從政"              
##  [52] "府"                 "推動"               "限塑"              
##  [55] "政策"               "上市公司"           "需編"              
##  [58] "製"                 "與"                 "申報"              
##  [61] "企業社會責任報告書" "民間團體"           "自動"              
##  [64] "發起"               "淨灘"               "活動"              
##  [67] "等"                 "背後"               "都"                
##  [70] "蘊含"               "著"                 "ESG"               
##  [73] "的"                 "意涵"               "ESG"               
##  [76] "投資"               "可以"               "回溯到"            
##  [79] "2006"               "年"                 "聯合國"            
##  [82] "責任"               "投資"               "原則"              
##  [85] "UNPRI"              "的"                 "成立"              
##  [88] "致力於"             "推動"               "全球"              
##  [91] "各大"               "投資"               "機構"              
##  [94] "在"                 "投資決策"           "過程"              
##  [97] "中"                 "必須"               "納入"              
## [100] "ESG"                "因子"               "並"                
## [103] "協助"               "PRI"                "的"                
## [106] "簽署"               "國家"               "全面"              
## [109] "提升"               "責任"               "投資"              
## [112] "的"                 "能力"               "戮力"              
## [115] "經營"               "本業"               "的"                
## [118] "企業"               "若"                 "不"                
## [121] "重視"               "ESG"                "就算"              
## [124] "有"                 "好"                 "的"                
## [127] "本益比"             "好"                 "的"                
## [130] "成長"               "也"                 "會"                
## [133] "影響"               "公司"               "價值"              
## [136] "尤其"               "是"                 "近年來"            
## [139] "環境"               "變遷"               "等"                
## [142] "無差別"             "衝擊"               "影響"              
## [145] "企業"               "經營"               "表現"              
## [148] "投資人"             "轉向"               "從"                
## [151] "ESG"                "SDGs"               "聯合國"            
## [154] "永續"               "發展"               "目標"              
## [157] "等"                 "評價"               "模式"              
## [160] "尋找"               "投資"               "標的"

以外部檔案形式加入

# 使用使用者自訂字典
jieba_tokenizer <- worker(user="user_dict.txt")
segment(chinese_text, jieba_tokenizer)
##   [1] "近幾年來"           "良心投資"           "逐漸"              
##   [4] "形成"               "一股"               "風潮"              
##   [7] "投資人"             "對於"               "企業社會責任"      
##  [10] "愈來愈"             "重視"               "環境保護"          
##  [13] "社會責任"           "與"                 "公司治理"          
##  [16] "加總"               "的"                 "ESG"               
##  [19] "概念"               "也"                 "成為"              
##  [22] "當今"               "華爾街"             "和"                
##  [25] "法人"               "投資圈"             "最"                
##  [28] "熱門"               "的"                 "熱搜"              
##  [31] "關鍵"               "英文"               "縮寫"              
##  [34] "什麼"               "是"                 "ESG"               
##  [37] "就是"               "環境保護"           "Environmental"     
##  [40] "社會責任"           "Social"             "與"                
##  [43] "公司治理"           "Governance"         "這"                
##  [46] "3"                  "個"                 "英文字"            
##  [49] "的"                 "縮寫"               "從政"              
##  [52] "府"                 "推動"               "限塑"              
##  [55] "政策"               "上市公司"           "需編"              
##  [58] "製"                 "與"                 "申報"              
##  [61] "企業社會責任報告書" "民間團體"           "自動"              
##  [64] "發起"               "淨灘"               "活動"              
##  [67] "等"                 "背後"               "都"                
##  [70] "蘊含"               "著"                 "ESG"               
##  [73] "的"                 "意涵"               "ESG"               
##  [76] "投資"               "可以"               "回溯到"            
##  [79] "2006"               "年"                 "聯合國"            
##  [82] "責任"               "投資"               "原則"              
##  [85] "UNPRI"              "的"                 "成立"              
##  [88] "致力於"             "推動"               "全球"              
##  [91] "各大"               "投資"               "機構"              
##  [94] "在"                 "投資決策"           "過程"              
##  [97] "中"                 "必須"               "納入"              
## [100] "ESG"                "因子"               "並"                
## [103] "協助"               "PRI"                "的"                
## [106] "簽署"               "國家"               "全面"              
## [109] "提升"               "責任"               "投資"              
## [112] "的"                 "能力"               "戮力"              
## [115] "經營"               "本業"               "的"                
## [118] "企業"               "若"                 "不"                
## [121] "重視"               "ESG"                "就算"              
## [124] "有"                 "好"                 "的"                
## [127] "本益比"             "好"                 "的"                
## [130] "成長"               "也"                 "會"                
## [133] "影響"               "公司"               "價值"              
## [136] "尤其"               "是"                 "近年來"            
## [139] "環境"               "變遷"               "等"                
## [142] "無差別"             "衝擊"               "影響"              
## [145] "企業"               "經營"               "表現"              
## [148] "投資人"             "轉向"               "從"                
## [151] "ESG"                "SDGs"               "聯合國"            
## [154] "永續"               "發展"               "目標"              
## [157] "等"                 "評價"               "模式"              
## [160] "尋找"               "投資"               "標的"

停用字詞使用

以手動方式加入

tokens <- segment(chinese_text, jieba_tokenizer)
stop_words <- c("對於", "愈來愈", "就是", "什麼")
result <- filter_segment(tokens, stop_words)
result
##   [1] "近幾年來"           "良心投資"           "逐漸"              
##   [4] "形成"               "一股"               "風潮"              
##   [7] "投資人"             "企業社會責任"       "重視"              
##  [10] "環境保護"           "社會責任"           "與"                
##  [13] "公司治理"           "加總"               "的"                
##  [16] "ESG"                "概念"               "也"                
##  [19] "成為"               "當今"               "華爾街"            
##  [22] "和"                 "法人"               "投資圈"            
##  [25] "最"                 "熱門"               "的"                
##  [28] "熱搜"               "關鍵"               "英文"              
##  [31] "縮寫"               "是"                 "ESG"               
##  [34] "環境保護"           "Environmental"      "社會責任"          
##  [37] "Social"             "與"                 "公司治理"          
##  [40] "Governance"         "這"                 "3"                 
##  [43] "個"                 "英文字"             "的"                
##  [46] "縮寫"               "從政"               "府"                
##  [49] "推動"               "限塑"               "政策"              
##  [52] "上市公司"           "需編"               "製"                
##  [55] "與"                 "申報"               "企業社會責任報告書"
##  [58] "民間團體"           "自動"               "發起"              
##  [61] "淨灘"               "活動"               "等"                
##  [64] "背後"               "都"                 "蘊含"              
##  [67] "著"                 "ESG"                "的"                
##  [70] "意涵"               "ESG"                "投資"              
##  [73] "可以"               "回溯到"             "2006"              
##  [76] "年"                 "聯合國"             "責任"              
##  [79] "投資"               "原則"               "UNPRI"             
##  [82] "的"                 "成立"               "致力於"            
##  [85] "推動"               "全球"               "各大"              
##  [88] "投資"               "機構"               "在"                
##  [91] "投資決策"           "過程"               "中"                
##  [94] "必須"               "納入"               "ESG"               
##  [97] "因子"               "並"                 "協助"              
## [100] "PRI"                "的"                 "簽署"              
## [103] "國家"               "全面"               "提升"              
## [106] "責任"               "投資"               "的"                
## [109] "能力"               "戮力"               "經營"              
## [112] "本業"               "的"                 "企業"              
## [115] "若"                 "不"                 "重視"              
## [118] "ESG"                "就算"               "有"                
## [121] "好"                 "的"                 "本益比"            
## [124] "好"                 "的"                 "成長"              
## [127] "也"                 "會"                 "影響"              
## [130] "公司"               "價值"               "尤其"              
## [133] "是"                 "近年來"             "環境"              
## [136] "變遷"               "等"                 "無差別"            
## [139] "衝擊"               "影響"               "企業"              
## [142] "經營"               "表現"               "投資人"            
## [145] "轉向"               "從"                 "ESG"               
## [148] "SDGs"               "聯合國"             "永續"              
## [151] "發展"               "目標"               "等"                
## [154] "評價"               "模式"               "尋找"              
## [157] "投資"               "標的"

以外部檔案形式加入

# Stop words must come from the stop-word list, not from the custom-word
# dictionary "user_dict.txt"; with the dictionary passed as `stop_word`, words
# such as "對於" and "就是" stayed in the segmentation result.
# NOTE(review): assumes "stop_words.txt" (the file used for `stop_word` later
# in this document) is the intended list — confirm, then re-knit so the
# printed output below reflects this worker.
jieba_tokenizer <- worker(stop_word = "stop_words.txt")
segment(chinese_text, jieba_tokenizer)
##   [1] "近幾年來"      "良心"          "投資"          "逐漸"         
##   [5] "形成"          "一股"          "風潮"          "投資人"       
##   [9] "對於"          "企業"          "社會"          "責任"         
##  [13] "愈來愈"        "重視"          "環境保護"      "社會"         
##  [17] "責任"          "與"            "公司"          "治理"         
##  [21] "加總"          "的"            "ESG"           "概念"         
##  [25] "也"            "成為"          "當今"          "華爾街"       
##  [29] "和"            "法人"          "投資"          "圈"           
##  [33] "最"            "熱門"          "的熱"          "搜"           
##  [37] "關鍵"          "英文"          "縮寫"          "什麼"         
##  [41] "是"            "ESG"           "就是"          "環境保護"     
##  [45] "Environmental" "社會"          "責任"          "Social"       
##  [49] "與"            "公司"          "治理"          "Governance"   
##  [53] "這"            "3"             "個"            "英文字"       
##  [57] "的"            "縮寫"          "從政"          "府"           
##  [61] "推動"          "限塑"          "政策"          "上市公司"     
##  [65] "需編"          "製"            "與"            "申報"         
##  [69] "企業"          "社會"          "責任"          "報告書"       
##  [73] "民間團體"      "自動"          "發起"          "淨灘"         
##  [77] "活動"          "等"            "背後"          "都"           
##  [81] "蘊含"          "著"            "ESG"           "的"           
##  [85] "意涵"          "ESG"           "投資"          "可以"         
##  [89] "回溯到"        "2006"          "年"            "聯合國"       
##  [93] "責任"          "投資"          "原則"          "UNPRI"        
##  [97] "的"            "成立"          "致力於"        "推動"         
## [101] "全球"          "各大"          "投資"          "機構"         
## [105] "在"            "投資決策"      "過程"          "中"           
## [109] "必須"          "納入"          "ESG"           "因子"         
## [113] "並"            "協助"          "PRI"           "的"           
## [117] "簽署"          "國家"          "全面"          "提升"         
## [121] "責任"          "投資"          "的"            "能力"         
## [125] "戮力"          "經營"          "本業"          "的"           
## [129] "企業"          "若"            "不"            "重視"         
## [133] "ESG"           "就算"          "有"            "好"           
## [137] "的"            "本益比"        "好"            "的"           
## [141] "成長"          "也"            "會"            "影響"         
## [145] "公司"          "價值"          "尤其"          "是"           
## [149] "近年來"        "環境"          "變遷"          "等"           
## [153] "無差別"        "衝擊"          "影響"          "企業"         
## [157] "經營"          "表現"          "投資人"        "轉向"         
## [161] "從"            "ESG"           "SDGs"          "聯合國"       
## [165] "永續"          "發展"          "目標"          "等"           
## [169] "評價"          "模式"          "尋找"          "投資"         
## [173] "標的"

篩選詞彙長度

將詞彙長度為1的詞清除

tokens <- result[nchar(result)>1]
tokens
##   [1] "近幾年來"           "良心投資"           "逐漸"              
##   [4] "形成"               "一股"               "風潮"              
##   [7] "投資人"             "企業社會責任"       "重視"              
##  [10] "環境保護"           "社會責任"           "公司治理"          
##  [13] "加總"               "ESG"                "概念"              
##  [16] "成為"               "當今"               "華爾街"            
##  [19] "法人"               "投資圈"             "熱門"              
##  [22] "熱搜"               "關鍵"               "英文"              
##  [25] "縮寫"               "ESG"                "環境保護"          
##  [28] "Environmental"      "社會責任"           "Social"            
##  [31] "公司治理"           "Governance"         "英文字"            
##  [34] "縮寫"               "從政"               "推動"              
##  [37] "限塑"               "政策"               "上市公司"          
##  [40] "需編"               "申報"               "企業社會責任報告書"
##  [43] "民間團體"           "自動"               "發起"              
##  [46] "淨灘"               "活動"               "背後"              
##  [49] "蘊含"               "ESG"                "意涵"              
##  [52] "ESG"                "投資"               "可以"              
##  [55] "回溯到"             "2006"               "聯合國"            
##  [58] "責任"               "投資"               "原則"              
##  [61] "UNPRI"              "成立"               "致力於"            
##  [64] "推動"               "全球"               "各大"              
##  [67] "投資"               "機構"               "投資決策"          
##  [70] "過程"               "必須"               "納入"              
##  [73] "ESG"                "因子"               "協助"              
##  [76] "PRI"                "簽署"               "國家"              
##  [79] "全面"               "提升"               "責任"              
##  [82] "投資"               "能力"               "戮力"              
##  [85] "經營"               "本業"               "企業"              
##  [88] "重視"               "ESG"                "就算"              
##  [91] "本益比"             "成長"               "影響"              
##  [94] "公司"               "價值"               "尤其"              
##  [97] "近年來"             "環境"               "變遷"              
## [100] "無差別"             "衝擊"               "影響"              
## [103] "企業"               "經營"               "表現"              
## [106] "投資人"             "轉向"               "ESG"               
## [109] "SDGs"               "聯合國"             "永續"              
## [112] "發展"               "目標"               "評價"              
## [115] "模式"               "尋找"               "投資"              
## [118] "標的"

Ch.2:使用Tidy Text套件進行英文斷詞,並建立DataFrame

英文斷詞

建立文本

english_text <- c("Because of you",
          "I never stray too far from the sidewalk",
          "Because of you",
          "I learned to play on the safe side so I don’t get hurt")

english_text
## [1] "Because of you"                                        
## [2] "I never stray too far from the sidewalk"               
## [3] "Because of you"                                        
## [4] "I learned to play on the safe side so I don’t get hurt"

建立英文句子斷詞後的tibble

# One row per lyric line, numbered in order; seq_along() also behaves
# correctly for an empty vector, where 1:length(x) would yield c(1, 0).
english_text_df <- tibble(line = seq_along(english_text), text = english_text)

english_text_df
## # A tibble: 4 x 2
##    line text                                                  
##   <int> <chr>                                                 
## 1     1 Because of you                                        
## 2     2 I never stray too far from the sidewalk               
## 3     3 Because of you                                        
## 4     4 I learned to play on the safe side so I don’t get hurt
english_text_df %>%
  unnest_tokens(word, text)
## # A tibble: 27 x 2
##     line word   
##    <int> <chr>  
##  1     1 because
##  2     1 of     
##  3     1 you    
##  4     2 i      
##  5     2 never  
##  6     2 stray  
##  7     2 too    
##  8     2 far    
##  9     2 from   
## 10     2 the    
## # ... with 17 more rows

ch.3:建立中文斷詞後的DataFrame

?unnest_tokens
## starting httpd help server ... done

根據文件說明:“If a function, should take a character vector and return a list of character vectors of the same length”得知,我們可以自定義斷詞函式,其input的型態為character vector,output則為長度相同的list

初始化斷詞引擎

jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")

自定義斷詞函式

# Custom tokenizer for unnest_tokens().
#
# t: a character vector of texts.
# Returns a list with one character vector per element of `t`, holding the
# tokens produced by the global `jieba_tokenizer` worker, with tokens of a
# single character dropped.
chinese_tokenizer <- function(t) {
  lapply(t, function(x) {
    words <- segment(x, jieba_tokenizer)
    words[nchar(words) > 1]
  })
}

建立文本

chinese_text <- c("近幾年來「良心投資」逐漸形成一股風潮,投資人對於企業社會責任愈來愈重視,環境保護、社會責任與公司治理加總的ESG概念,也成為當今華爾街和法人投資圈最熱門的熱搜關鍵英文縮寫。什麼是「ESG」?就是環境保護(Environmental)、社會責任(Social)與公司治理(Governance)這3個英文字的縮寫,從政府推動限塑政策、上市公司需編製與申報企業社會責任報告書、民間團體自動發起淨灘活動等,背後都蘊含著ESG的意涵。ESG投資可以回溯到2006年聯合國責任投資原則(UNPRI)的成立,致力於推動全球各大投資機構在投資決策過程中必須納入ESG因子,並協助PRI的簽署國家全面提升責任投資的能力。戮力經營本業的企業,若不重視ESG,就算有好的本益比、好的成長,也會影響公司價值,尤其是近年來環境變遷等「無差別衝擊」影響企業經營表現,投資人轉向從ESG、SDGs(聯合國永續發展目標)等評價模式尋找投資標的。", "為什麼ESG對企業和投資人來說會麼這麼重要?施宜君指出,通常重視ESG的企業,信用評等較為良好,且公司財務較為穩健,往往較有機會可以帶來較佳報酬率以及穩定性,在後疫情時代,採用ESG因子來謹慎選擇股票、債券等標的,也可望提前避開問題企業、進一步防範投資風險。國泰投信認為,ESG就像是企業永續的KPI,ESG做得好的公司往往能贏得社會長期肯定,對公司永續經營、企業長線獲利等都能帶來挹注,連帶也吸引海內外資金持續流入ESG市場。")

建立中文斷詞後的DataFrame

# One row per paragraph, numbered in order; seq_along() also behaves
# correctly for an empty vector, where 1:length(x) would yield c(1, 0).
chinese_text_df <- tibble(paragraph = seq_along(chinese_text), text = chinese_text)
# Tokenise with the custom jieba-based tokenizer instead of the default one.
chinese_text_df %>% unnest_tokens(word, text, token = chinese_tokenizer)
## # A tibble: 195 x 2
##    paragraph word        
##        <int> <chr>       
##  1         1 近幾年來    
##  2         1 良心投資    
##  3         1 逐漸        
##  4         1 形成        
##  5         1 一股        
##  6         1 風潮        
##  7         1 投資人      
##  8         1 企業社會責任
##  9         1 重視        
## 10         1 環境保護    
## # ... with 185 more rows

ch.4 中文斷句

中文斷句

chinese_sentences <- strsplit(chinese_text, "[。]")
chinese_sentences
## [[1]]
## [1] "近幾年來「良心投資」逐漸形成一股風潮,投資人對於企業社會責任愈來愈重視,環境保護、社會責任與公司治理加總的ESG概念,也成為當今華爾街和法人投資圈最熱門的熱搜關鍵英文縮寫"                                                 
## [2] "什麼是「ESG」?就是環境保護(Environmental)、社會責任(Social)與公司治理(Governance)這3個英文字的縮寫,從政府推動限塑政策、上市公司需編製與申報企業社會責任報告書、民間團體自動發起淨灘活動等,背後都蘊含著ESG的意涵"
## [3] "ESG投資可以回溯到2006年聯合國責任投資原則(UNPRI)的成立,致力於推動全球各大投資機構在投資決策過程中必須納入ESG因子,並協助PRI的簽署國家全面提升責任投資的能力"                                                          
## [4] "戮力經營本業的企業,若不重視ESG,就算有好的本益比、好的成長,也會影響公司價值,尤其是近年來環境變遷等「無差別衝擊」影響企業經營表現,投資人轉向從ESG、SDGs(聯合國永續發展目標)等評價模式尋找投資標的"                  
## 
## [[2]]
## [1] "為什麼ESG對企業和投資人來說會麼這麼重要?施宜君指出,通常重視ESG的企業,信用評等較為良好,且公司財務較為穩健,往往較有機會可以帶來較佳報酬率以及穩定性,在後疫情時代,採用ESG因子來謹慎選擇股票、債券等標的,也可望提前避開問題企業、進一步防範投資風險"
## [2] "國泰投信認為,ESG就像是企業永續的KPI,ESG做得好的公司往往能贏得社會長期肯定,對公司永續經營、企業長線獲利等都能帶來挹注,連帶也吸引海內外資金持續流入ESG市場"

利用base R內建的strsplit函式,以“。”進行斷句

將第一段的句子進行斷詞

chinese_sentences[[1]] %>% chinese_tokenizer()
## [[1]]
##  [1] "近幾年來"     "良心投資"     "逐漸"         "形成"         "一股"        
##  [6] "風潮"         "投資人"       "企業社會責任" "重視"         "環境保護"    
## [11] "社會責任"     "公司治理"     "加總"         "ESG"          "概念"        
## [16] "成為"         "當今"         "華爾街"       "法人"         "投資圈"      
## [21] "熱門"         "熱搜"         "關鍵"         "英文"         "縮寫"        
## 
## [[2]]
##  [1] "ESG"                "環境保護"           "Environmental"     
##  [4] "社會責任"           "Social"             "公司治理"          
##  [7] "Governance"         "英文字"             "縮寫"              
## [10] "從政"               "推動"               "限塑"              
## [13] "政策"               "上市公司"           "需編"              
## [16] "申報"               "企業社會責任報告書" "民間團體"          
## [19] "自動"               "發起"               "淨灘"              
## [22] "活動"               "背後"               "蘊含"              
## [25] "ESG"                "意涵"              
## 
## [[3]]
##  [1] "ESG"      "投資"     "可以"     "回溯到"   "2006"     "聯合國"  
##  [7] "責任"     "投資"     "原則"     "UNPRI"    "成立"     "致力於"  
## [13] "推動"     "全球"     "各大"     "投資"     "機構"     "投資決策"
## [19] "過程"     "必須"     "納入"     "ESG"      "因子"     "協助"    
## [25] "PRI"      "簽署"     "國家"     "全面"     "提升"     "責任"    
## [31] "投資"     "能力"    
## 
## [[4]]
##  [1] "戮力"   "經營"   "本業"   "企業"   "重視"   "ESG"    "就算"   "本益比"
##  [9] "成長"   "影響"   "公司"   "價值"   "尤其"   "近年來" "環境"   "變遷"  
## [17] "無差別" "衝擊"   "影響"   "企業"   "經營"   "表現"   "投資人" "轉向"  
## [25] "ESG"    "SDGs"   "聯合國" "永續"   "發展"   "目標"   "評價"   "模式"  
## [33] "尋找"   "投資"   "標的"

ch.5 Gutenberg free eBooks

Gutenberg free eBooks

https://www.gutenberg.org/

Also, various chinese books can be found in the link below:

https://www.gutenberg.org/browse/languages/zh

紅樓夢 by Xueqin Cao:

https://www.gutenberg.org/ebooks/24264

# Download "Dream of the Red Chamber" (Project Gutenberg book 24264), drop the
# rows whose text is empty, and drop duplicated lines.
red <- gutenbergr::gutenberg_download(24264) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org

紅樓夢每章節的開頭都會有“第X回”的詞,但有些僅以“第X回”表示,有些則在“第X回”後面加上章節名稱,EX.「第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀」

# Number the chapters: a line that starts with "第…回" followed by a space or
# the end of the line is treated as a chapter heading, so the running total of
# such lines is the chapter index of every row. The column is referenced
# through the data mask (`text`) rather than `red$text`, so the expression
# keeps working if the pipeline is filtered or grouped beforehand.
red <- red %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.*回( |$)"))))

根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節

head(red, 20)
## # A tibble: 20 x 3
##    gutenberg_id text                                                     chapter
##           <int> <chr>                                                      <int>
##  1        24264 "第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀"                   1
##  2        24264 "------------------------------------------------------~       1
##  3        24264 "此開卷第一回也.作者自云:因曾歷過一番夢幻之后,故將真事隱去,"~       1
##  4        24264 "而借\"通靈\"之說,撰此《石頭記》一書也.故曰\"甄士隱\"云云.但書中所記"~       1
##  5        24264 "何事何人?自又云:“今風塵碌碌,一事無成,忽念及當日所有之女子,一"~       1
##  6        24264 "一細考較去,覺其行止見識,皆出于我之上.何我堂堂須眉,誠不若彼裙釵"~       1
##  7        24264 "哉?實愧則有余,悔又無益之大無可如何之日也!當此,則自欲將已往所賴"~       1
##  8        24264 "天恩祖德,錦衣紈褲之時,飫甘饜肥之日,背父兄教育之恩,負師友規談之"~       1
##  9        24264 "德,以至今日一技無成,半生潦倒之罪,編述一集,以告天下人:我之罪固"~       1
## 10        24264 "不免,然閨閣中本自歷歷有人,万不可因我之不肖,自護己短,一并使其泯"~       1
## 11        24264 "滅也.雖今日之茅椽蓬牖,瓦灶繩床,其晨夕風露,階柳庭花,亦未有妨我"~       1
## 12        24264 "之襟怀筆墨者.雖我未學,下筆無文,又何妨用假語村言,敷演出一段故事"~       1
## 13        24264 "來,亦可使閨閣昭傳,复可悅世之目,破人愁悶,不亦宜乎?\"故曰\"賈雨村"~       1
## 14        24264 "\"云云."                                                     1
## 15        24264 "  此回中凡用“夢”用“幻”等字,是提醒閱者眼目,亦是此書立意本旨"~       1
## 16        24264 "."                                                           1
## 17        24264 "  列位看官:你道此書從何而來?說起根由雖近荒唐,細按則深有趣味."~       1
## 18        24264 "待在下將此來歷注明,方使閱者了然不惑."                       1
## 19        24264 "  原來女媧氏煉石補天之時,于大荒山無稽崖練成高經十二丈,方經二十"~       1
## 20        24264 "四丈頑石三万六千五百零一塊.媧皇氏只用了三万六千五百塊,只單單剩了"~       1

下載下來的書已經完成斷句

使用紅樓夢專有名詞字典

jieba_tokenizer <- worker(user="dream_of_the_red_chamber_lexicon.traditional.dict", stop_word = "stop_words.txt")

設定斷詞

# Tokenizer for unnest_tokens().
#
# t: a character vector of texts.
# Returns a list with one character vector of tokens per element of `t`.
# Segmentation uses the `jieba_tokenizer` worker created just above with the
# Dream of the Red Chamber lexicon, so the worker is not rebuilt here.
red_tokenizer <- function(t) {
  lapply(t, function(x) segment(x, jieba_tokenizer))
}
tokens <- red %>% unnest_tokens(word, text, token=red_tokenizer)
str(tokens)
## tibble [462,355 x 3] (S3: tbl_df/tbl/data.frame)
##  $ gutenberg_id: int [1:462355] 24264 24264 24264 24264 24264 24264 24264 24264 24264 24264 ...
##  $ chapter     : int [1:462355] 1 1 1 1 1 1 1 1 1 1 ...
##  $ word        : chr [1:462355] "第一回" " " "甄士隱夢幻識通靈" " " ...
head(tokens, 20)
## # A tibble: 20 x 3
##    gutenberg_id chapter word            
##           <int>   <int> <chr>           
##  1        24264       1 第一回          
##  2        24264       1                
##  3        24264       1 甄士隱夢幻識通靈
##  4        24264       1                
##  5        24264       1 賈雨村          
##  6        24264       1 風塵            
##  7        24264       1 怀              
##  8        24264       1 閨秀            
##  9        24264       1 此              
## 10        24264       1 開卷            
## 11        24264       1 第一回          
## 12        24264       1 也              
## 13        24264       1 作者            
## 14        24264       1 自云            
## 15        24264       1 因              
## 16        24264       1 曾              
## 17        24264       1 歷              
## 18        24264       1 過              
## 19        24264       1 一番            
## 20        24264       1 夢幻

ch.6 圖

文字雲

計算詞彙的出現次數,如果詞彙只有一個字則不列入計算,且只保留出現次數超過10次的詞彙

# Word frequencies: ignore single-character tokens, count the occurrences of
# each word (column `sum`), order from most to least frequent, and keep only
# the words that appear more than 10 times.
tokens_count <- tokens %>%
  filter(nchar(word) > 1) %>%
  count(word, name = "sum", sort = TRUE) %>%
  filter(sum > 10)

印出最常見的20個詞彙

head(tokens_count, 20)
## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 寶玉    3751
##  2 笑道    1893
##  3 什么    1738
##  4 鳳姐    1652
##  5 賈母    1611
##  6 一個    1392
##  7 我們    1165
##  8 那里    1136
##  9 襲人    1126
## 10 黛玉    1055
## 11 寶釵    1018
## 12 王夫人  1005
## 13 如今     967
## 14 怎么     963
## 15 你們     952
## 16 知道     932
## 17 說道     927
## 18 起來     920
## 19 出來     907
## 20 賈政     904
# Render the word/count table as a word cloud.
wordcloud2(tokens_count)

各章節長度

以句子數量與詞彙數量來計算

# Line chart of chapter length, measured two ways: number of rows of `red`
# per chapter (labelled "sentences") and number of rows of `tokens` per
# chapter (labelled "words"). One line is drawn per `type`.
#
# Changes from the previous version:
#   * dropped `fill = "type"`: it mapped a constant string, and geom_line()
#     has no fill aesthetic, so it had no effect;
#   * dropped the group_by(type) before ggplot(): the discrete colour mapping
#     already groups the lines;
#   * map colour to `type` directly so the legend is titled "type" rather
#     than "factor(type)".
#
# NOTE(review): the title and y-axis label mention only sentences although
# word counts are plotted as well; the label strings are left unchanged here.
# NOTE(review): the object name `plot` shadows the base plotting function's
# name; kept as-is in case later code refers to it.
plot <-
  bind_rows(
    red %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "sentences"),
    tokens %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "words")
  ) %>%
  ggplot(aes(x = chapter, y = count, color = type)) +
  geom_line() +
  ggtitle("各章節的句子總數") +
  xlab("章節") +
  ylab("句子數量") +
  # The font family must exist on the rendering machine; otherwise grid
  # emits "font family not found" warnings and falls back to a default.
  theme(text = element_text(family = "Heiti TC Light"))
plot
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

計算前80回和後40回的詞彙在全文中出現比率的差異

# Compare how often each word occurs in chapters 1-80 ("First 80") versus the
# remaining chapters ("Last 40").
#
# Steps:
#   1. tag every token with the part of the book it belongs to;
#   2. keep tokens longer than one character;
#   3. clean the token: keep the first run of characters that are not ASCII
#      digits, lowercase ASCII letters or apostrophes, then keep only the
#      leading run that contains no Chinese numeral (一..十);
#   4. count words per part and turn counts into within-part proportions;
#   5. reshape to one row per word with columns `First 80`, `part`
#      (always "Last 40") and `proportion` (the "Last 40" value).
#
# Fix: str_extract() returns NA when nothing matches (for example a token
# that begins with a Chinese numeral). count() used to pool all of those into
# a single NA "word" that then showed up as a row and a plotted point. The NA
# row is now removed after the proportions are computed, so the proportions
# of the remaining words are unchanged (the denominators still include the
# discarded tokens).
#
# spread()/gather() are kept on purpose: spread() returns rows ordered by
# `word`, and the row order matters for label placement in the plot below.
frequency <- tokens %>%
  mutate(part = ifelse(chapter <= 80, "First 80", "Last 40")) %>%
  filter(nchar(word) > 1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  filter(!is.na(word)) %>%
  select(-n) %>%
  spread(part, proportion) %>%
  gather(part, proportion, `Last 40`)
frequency
## # A tibble: 43,275 x 4
##    word      `First 80` part    proportion
##    <chr>          <dbl> <chr>        <dbl>
##  1 乙卯      0.00000623 Last 40 NA        
##  2 丁与些    0.00000623 Last 40 NA        
##  3 丁并家    0.00000623 Last 40 NA        
##  4 丁的      0.00000623 Last 40 NA        
##  5 丁是丁    0.0000125  Last 40 NA        
##  6 丁郎認父  0.00000623 Last 40 NA        
##  7 丁香      0.00000623 Last 40 NA        
##  8 丁憂     NA          Last 40  0.0000659
##  9 乃天      0.0000125  Last 40 NA        
## 10 乃木      0.00000623 Last 40 NA        
## # ... with 43,265 more rows
# Scatter plot of word proportions: "Last 40" on the x axis against
# "First 80" on the y axis, both on log10 scales formatted as percentages.
# The dashed line is geom_abline()'s default reference line; point colour
# encodes the absolute difference between the two proportions, and the
# legend is hidden.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `First 80`,
    colour = abs(`First 80` - proportion)
  )
) +
  geom_abline(colour = "gray40", linetype = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  # check_overlap drops labels that would overlap earlier ones.
  geom_text(
    aes(label = word),
    check_overlap = TRUE,
    vjust = 1.5,
    family = "Heiti TC Light"
  ) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_colour_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  labs(x = "Last 40", y = "First 80") +
  theme(legend.position = "none")
## Warning: Removed 35063 rows containing missing values (geom_point).
## Warning: Removed 35064 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database