###系統參數設定

## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"

###載入需要的packages

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
require(ggplot2)
## Loading required package: ggplot2
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
require(reshape2)
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
require(readr)
## Loading required package: readr
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
require(knitr)
## Loading required package: knitr
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
require(tm)
## Loading required package: tm
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate

分析題目:KTV熱門排行榜歌曲詞彙與風格分析

#資料來源:mymusic.net「KTV熱唱必備►國語K歌排行超精選」 https://www.mymusic.net.tw/ux/w/themeInfo/3354

###初始化斷詞引擎

# Create a segmentation engine with jiebaR's default settings.
jieba_tokenizer <- worker()

分析排名前50歌曲常用詞彙

###以預設參數進行斷詞

# Read the lyrics file into a single string and segment it with the
# tokenizer created above.
all_song <- read_file("Lyric_All.txt")
allsong_jieba <- segment(code = all_song, jiebar = jieba_tokenizer)

# Preview the first 200 tokens.
allsong_jieba[seq_len(200)]
##   [1] "你"     "總"     "感到"   "落寞"   "沮喪"   "你"     "總"    
##   [8] "感到"   "失望"   "對於"   "人生"   "未來"   "總有"   "太多"  
##  [15] "迷惘"   "你"     "總偽裝" "自己"   "不痛"   "你"     "總笑"  
##  [22] "著"     "逞強"   "對於"   "愛情"   "害怕"   "觸碰"   "放棄"  
##  [29] "掙扎"   "你"     "看"     "著"     "我"     "眼睛"   "你"    
##  [36] "記著"   "我"     "聲音"   "無畏"   "風雨"   "別忘記" "還有"  
##  [43] "我"     "站"     "在"     "這裡"   "我"     "只"     "想"    
##  [50] "做"     "你"     "的"     "太陽"   "你"     "的"     "太陽"  
##  [57] "在"     "你"     "的"     "心裡"   "呀"     "在"     "你"    
##  [64] "的"     "心底"   "呀"     "不管"   "是"     "多"     "遠"    
##  [71] "的"     "遠方"   "不要"   "害怕"   "我"     "在"     "身旁"  
##  [78] "想"     "做"     "你"     "的"     "太陽"   "你"     "的"    
##  [85] "太陽"   "在"     "你"     "的"     "心裡"   "呀"     "在"    
##  [92] "你"     "的"     "心底"   "呀"     "就算"   "不能"   "在"    
##  [99] "你"     "身旁"   "也"     "要"     "奮力"   "為"     "你"    
## [106] "而"     "發光"   "你"     "總"     "感到"   "落寞"   "沮喪"  
## [113] "你"     "總"     "感到"   "失望"   "對於"   "人生"   "未來"  
## [120] "總有"   "太多"   "迷惘"   "你"     "總偽裝" "自己"   "不痛"  
## [127] "你"     "總笑"   "著"     "逞強"   "對於"   "愛情"   "害怕"  
## [134] "觸碰"   "放棄"   "掙扎"   "你"     "看"     "著"     "我"    
## [141] "眼睛"   "你"     "記著"   "我"     "聲音"   "無畏"   "風雨"  
## [148] "別忘記" "還有"   "我"     "站"     "在"     "這裡"   "我"    
## [155] "只"     "想"     "做"     "你"     "的"     "太陽"   "你"    
## [162] "的"     "太陽"   "在"     "你"     "的"     "心裡"   "呀"    
## [169] "在"     "你"     "的"     "心底"   "呀"     "不管"   "是"    
## [176] "多"     "遠"     "的"     "遠方"   "不要"   "害怕"   "我"    
## [183] "在"     "身旁"   "想"     "做"     "你"     "的"     "太陽"  
## [190] "你"     "的"     "太陽"   "在"     "你"     "的"     "心裡"  
## [197] "呀"     "在"     "你"     "的"

抓取前50名歌曲,依每首歌曲的特性,創建其user_dict。將user_dict統整,並去除重複字詞。

###使用使用者自訂字典,以外部檔案格式加入

# Rebuild the engine so that it also loads the custom dictionary file.
jieba_tokenizer <- worker(user = "user_dict_All.txt")
allsong_userdict <- segment(code = all_song, jiebar = jieba_tokenizer)

# Preview the first 200 tokens.
allsong_userdict[seq_len(200)]
##   [1] "你"     "總"     "感到"   "落寞"   "沮喪"   "你"     "總"    
##   [8] "感到"   "失望"   "對於"   "人生"   "未來"   "總有"   "太"    
##  [15] "多"     "迷惘"   "你"     "總"     "偽裝"   "自己"   "不痛"  
##  [22] "你"     "總"     "笑著"   "逞強"   "對於"   "愛情"   "害怕"  
##  [29] "觸碰"   "放棄"   "掙扎"   "你"     "看著"   "我"     "眼睛"  
##  [36] "你"     "記著"   "我"     "聲音"   "無畏"   "風"     "雨"    
##  [43] "別忘記" "還有"   "我"     "站"     "在"     "這"     "裡"    
##  [50] "我"     "只想"   "做"     "你"     "的"     "太陽"   "你"    
##  [57] "的"     "太陽"   "在"     "你"     "的"     "心"     "裡"    
##  [64] "呀"     "在"     "你"     "的"     "心底"   "呀"     "不管"  
##  [71] "是"     "多遠"   "的"     "遠方"   "不要"   "害怕"   "我"    
##  [78] "在"     "身旁"   "想"     "做"     "你"     "的"     "太陽"  
##  [85] "你"     "的"     "太陽"   "在"     "你"     "的"     "心"    
##  [92] "裡"     "呀"     "在"     "你"     "的"     "心底"   "呀"    
##  [99] "就算"   "不能"   "在"     "你"     "身旁"   "也"     "要"    
## [106] "奮力"   "為"     "你"     "而"     "發光"   "你"     "總"    
## [113] "感到"   "落寞"   "沮喪"   "你"     "總"     "感到"   "失望"  
## [120] "對於"   "人生"   "未來"   "總有"   "太"     "多"     "迷惘"  
## [127] "你"     "總"     "偽裝"   "自己"   "不痛"   "你"     "總"    
## [134] "笑著"   "逞強"   "對於"   "愛情"   "害怕"   "觸碰"   "放棄"  
## [141] "掙扎"   "你"     "看著"   "我"     "眼睛"   "你"     "記著"  
## [148] "我"     "聲音"   "無畏"   "風"     "雨"     "別忘記" "還有"  
## [155] "我"     "站"     "在"     "這"     "裡"     "我"     "只想"  
## [162] "做"     "你"     "的"     "太陽"   "你"     "的"     "太陽"  
## [169] "在"     "你"     "的"     "心"     "裡"     "呀"     "在"    
## [176] "你"     "的"     "心底"   "呀"     "不管"   "是"     "多遠"  
## [183] "的"     "遠方"   "不要"   "害怕"   "我"     "在"     "身旁"  
## [190] "想"     "做"     "你"     "的"     "太陽"   "你"     "的"    
## [197] "太陽"   "在"     "你"     "的"

抓取前50名歌曲,依每首歌曲的特性,創建其stop_word。將stop_word統整,並去除重複字詞。

###使用停用詞參數,以外部檔案格式加入

# Rebuild the engine with both the custom dictionary and the stop-word list.
jieba_tokenizer <- worker(
  user = "user_dict_All.txt",
  stop_word = "stop_words_All.txt"
)

allsong_stopword <- segment(code = all_song, jiebar = jieba_tokenizer)

# Preview the first 200 tokens.
allsong_stopword[seq_len(200)]
##   [1] "落寞"     "沮喪"     "失望"     "人生"     "迷惘"     "偽裝"    
##   [7] "不痛"     "逞強"     "愛情"     "害怕"     "觸碰"     "放棄"    
##  [13] "掙扎"     "眼睛"     "記著"     "聲音"     "無畏"     "風"      
##  [19] "雨"       "別忘記"   "太陽"     "太陽"     "心"       "心底"    
##  [25] "遠方"     "害怕"     "身旁"     "太陽"     "太陽"     "心"      
##  [31] "心底"     "身旁"     "奮力"     "發光"     "落寞"     "沮喪"    
##  [37] "失望"     "人生"     "迷惘"     "偽裝"     "不痛"     "逞強"    
##  [43] "愛情"     "害怕"     "觸碰"     "放棄"     "掙扎"     "眼睛"    
##  [49] "記著"     "聲音"     "無畏"     "風"       "雨"       "別忘記"  
##  [55] "太陽"     "太陽"     "心"       "心底"     "遠方"     "害怕"    
##  [61] "身旁"     "太陽"     "太陽"     "心"       "心底"     "身旁"    
##  [67] "奮力"     "發光"     "記得"     "所有的"   "幸福"     "太陽"    
##  [73] "太陽"     "心"       "心底"     "遠方"     "害怕"     "身旁"    
##  [79] "太陽"     "太陽"     "心"       "心底"     "身旁"     "奮力"    
##  [85] "發光"     "藍色"     "最愛"     "顏色"     "燦爛"     "笑容"    
##  [91] "擁抱"     "面孔"     "消失"     "我錯了"   "搞錯"     "天灰"    
##  [97] "雨"       "凝望"     "回不去"   "快樂"     "輕易"     "結束"    
## [103] "抱著"     "哭"       "努力"     "祝福"     "心如刀割" "頑固"    
## [109] "絕口不提" "幸福"     "懷念"     "失去"     "曉得"     "藍色"    
## [115] "最愛"     "顏色"     "燦爛"     "笑容"     "擁抱"     "面孔"    
## [121] "消失"     "我錯了"   "搞錯"     "越痛"     "苦"       "約"      
## [127] "快樂"     "輕易"     "結束"     "抱著"     "哭"       "努力"    
## [133] "祝福"     "祝福"     "最愛"     "愛你"     "飛翔"     "可憐"    
## [139] "受傷"     "一個人"   "可憐"     "永遠"     "輕易"     "結束"    
## [145] "抱著"     "哭"       "努力"     "祝福"     "祝福"     "最愛"    
## [151] "永遠"     "飛翔"     "頑固"     "絕口不提" "幸福"     "懷念"    
## [157] "失去"     "曉得"     "落寞"     "沮喪"     "失望"     "人生"    
## [163] "迷惘"     "偽裝"     "不痛"     "逞強"     "愛情"     "害怕"    
## [169] "觸碰"     "放棄"     "掙扎"     "眼睛"     "記著"     "聲音"    
## [175] "無畏"     "風"       "雨"       "別忘記"   "太陽"     "太陽"    
## [181] "心"       "心底"     "遠方"     "害怕"     "身旁"     "太陽"    
## [187] "太陽"     "心"       "心底"     "身旁"     "奮力"     "發光"    
## [193] "落寞"     "沮喪"     "失望"     "人生"     "迷惘"     "偽裝"    
## [199] "不痛"     "逞強"

###計算每一個字詞出現次數

# Wrap the token vector in a data frame; its single column is named
# `allsong_stopword`.
allsong_stopword <- as.data.frame(allsong_stopword)

# Frequency table of tokens, most frequent first, keeping only tokens that
# occur more than 5 times.
allsong_count <- allsong_stopword %>%
  count(allsong_stopword, sort = TRUE) %>%
  filter(n > 5)

colnames(allsong_count) <- c("word", "count")

# head() shows at most 200 rows; indexing with [1:200, ] would pad the result
# with all-NA rows whenever fewer than 200 words pass the filter.
head(allsong_count, 200)
## # A tibble: 200 x 2
##    word   count
##    <fct>  <int>
##  1 喵        67
##  2 騙吃      44
##  3 心        30
##  4 腦公      28
##  5 辣台妹    27
##  6 太陽      26
##  7 搖落      23
##  8 世界      21
##  9 夢        20
## 10 難過      20
## # … with 190 more rows

###繪製文字雲

# Render the word-frequency table as a word cloud.
wordcloud2(allsong_count)

分析情歌常用詞彙

###以預設參數進行斷詞

# Read the love-song lyrics into a single string.
love_song <- read_file("song.txt")

# This step is the default-parameter run, so re-create a default engine first.
# Otherwise `jieba_tokenizer` is still the user-dictionary + stop-word engine
# built earlier and this result is identical to the stop-word run further down.
# NOTE(review): the captured output following this chunk was produced before
# this fix and should be regenerated on the next knit.
jieba_tokenizer <- worker()
lovesong_jieba <- segment(love_song, jieba_tokenizer)

# Preview the first 200 tokens.
lovesong_jieba[1:200]
##   [1] "還沒"      "櫻花季"    "還沒"      "照相機"    "還沒"     
##   [6] "光臨"      "餐廳"      "期待"      "有著"      "旅行"     
##  [11] "等待"      "日落"      "巴黎"      "鐵塔"      "之下"     
##  [16] "等待"      "願意"      "等待"      "一點一滴"  "珍惜"     
##  [21] "好好的"    "愛你"      "倒數"      "剩下"      "快樂"     
##  [26] "相擁"      "狂熱"      "永遠"      "深刻"      "心跳"     
##  [31] "倒數"      "生命"      "剩下"      "溫熱"      "至少"     
##  [36] "烏黑"      "頭髮"      "漆黑"      "過後"      "旭日"     
##  [41] "淚流"      "堅持"      "日"        "復"        "日"       
##  [46] "放棄"      "重複"      "願意"      "還沒"      "退化"     
##  [51] "眼睛"      "抓緊時間"  "不止息"    "風景"      "新"       
##  [56] "生命"      "咖啡"      "酸"        "晚餐"      "不吃"     
##  [61] "冷"        "錯過"      "後悔"      "夢"        "未"       
##  [66] "實現"      "醒"        "心"        "開過"      "灰"       
##  [71] "追逐"      "旅途"      "曲折"      "曲折"      "至少"     
##  [76] "痛"        "快樂"      "愛過"      "算"        "活著"     
##  [81] "別無所求"  "still"     "same"      "有變"      "Nothing"  
##  [86] "changed"   "討厭"      "下雨天"    "認錯"      "脾氣"     
##  [91] "硬"        "點"        "清楚"      "辦法"      "改變"     
##  [96] "後悔"      "高中"      "花錢"      "裝很吵"    "排氣管"   
## [101] "努力"      "賺錢"      "養"        "養成"      "壞習慣"   
## [106] "晃晃"      "玩"        "without"   "feels"     "like"     
## [111] "情人節"    "台"        "相機"      "快門"      "張"       
## [116] "書桌"      "檯燈"      "without"   "feels"     "likeA"    
## [121] "sentence"  "without"   "spacesIt"  "like"      "youtube"  
## [126] "without"   "playlist"  "chapter"   "without"   "pages"    
## [131] "without"   "feels"     "wrong"     "live"      "without"  
## [136] "alone"     "想念"      "角度"      "重"        "You"      
## [141] "know"      "feel"      "inside"    "live"      "without"  
## [146] "辦法"      "been"      "through"   "重蹈"      "覆徹"     
## [151] "can"       "沒得"      "負責"      "damn"      "bad"      
## [156] "need"      "right"     "面子"      "兩邊"      "榨乾"     
## [161] "搶劫"      "Goddamn"   "got"       "nothing"   "left"     
## [166] "掉進去"    "fun"       "down"      "here"      "救救"     
## [171] "call"      "my"        "friends"   "訴苦"      "好幾遍"   
## [176] "振作"      "點"        "know"      "man"       "but"      
## [181] "fuckin"    "can"       "刪了"      "instagram" "Your"     
## [186] "post"      "see"       "promise"   "一支"      "Hennessy" 
## [191] "know"      "baby"      "why"       "gonna"     "go"       
## [196] "捨不得"    "放手"      "yeah"      "失去"      "失去"

###使用使用者自訂字典,以外部檔案格式加入

# Rebuild the engine so that it also loads the custom dictionary file.
jieba_tokenizer <- worker(user = "user_dict_All.txt")
lovesong_userdict <- segment(code = love_song, jiebar = jieba_tokenizer)

# Preview the first 200 tokens.
lovesong_userdict[seq_len(200)]
##   [1] "還沒"     "到"       "的"       "櫻花季"   "還沒"     "用"      
##   [7] "的"       "照相機"   "還沒"     "光臨"     "的"       "餐廳"    
##  [13] "還"       "在"       "期待"     "有著"     "你"       "的"      
##  [19] "旅行"     "等待"     "日落"     "的"       "巴黎"     "鐵塔"    
##  [25] "之下"     "牽著"     "你"       "等待"     "說"       "著"      
##  [31] "我"       "願意"     "等待"     "未來"     "每天"     "身邊"    
##  [37] "有"       "你"       "一點一滴" "每"       "一天"     "珍惜"    
##  [43] "怕"       "突然"     "來不及"   "好好的"   "愛你"     "時針"    
##  [49] "一直"     "倒數"     "著"       "我們"     "剩下"     "的"      
##  [55] "快樂"     "此刻"     "相擁"     "的"       "狂熱"     "卻"      
##  [61] "永遠"     "都"       "深刻"     "心跳"     "一直"     "倒數"    
##  [67] "著"       "生命"     "剩下"     "的"       "溫熱"     "至少"    
##  [73] "用力"     "地"       "愛"       "著"       "還"       "烏黑"    
##  [79] "的"       "頭髮"     "有"       "你"       "就"       "不怕"    
##  [85] "白"       "了"       "漆黑"     "過後"     "是"       "旭日"    
##  [91] "淚流"     "以後"     "是"       "堅持"     "真的"     "愛"      
##  [97] "是"       "日"       "復"       "日"       "從不"     "放棄"    
## [103] "重複"     "說"       "你"       "願意"     "還沒"     "退化"    
## [109] "的"       "眼睛"     "抓緊時間" "看看"     "你"       "愛"      
## [115] "是"       "從來"     "不止息"   "一個"     "風景"     "每天"    
## [121] "新"       "的"       "生命"     "咖啡"     "再"       "不"      
## [127] "喝"       "就"       "酸"       "了"       "晚餐"     "再"      
## [133] "不吃"     "就"       "冷"       "了"       "愛"       "著"      
## [139] "為"       "什麼"     "不"       "說"       "呢"       "難道"    
## [145] "錯過"     "了"       "才"       "來"       "後悔"     "著"      
## [151] "誰"       "夢"       "未"       "實現"     "就"       "醒"      
## [157] "了"       "誰"       "心"       "沒"       "開過"     "就"      
## [163] "灰"       "了"       "追逐"     "愛"       "的"       "旅途"    
## [169] "曲折"     "就算"     "再"       "曲折"     "為"       "你"      
## [175] "都"       "值得"     "至少"     "痛"       "並"       "快樂"    
## [181] "著"       "愛過"     "才"       "算"       "活著"     "有"      
## [187] "你"       "別無所求" "了"       "I"        "m"        "still"   
## [193] "the"      "same"     "都"       "好像"     "沒"       "有變"    
## [199] "Nothing"  "changed"

###使用停用詞參數,以外部檔案格式加入

# Rebuild the engine with both the custom dictionary and the stop-word list.
jieba_tokenizer <- worker(
  user = "user_dict_All.txt",
  stop_word = "stop_words_All.txt"
)

lovesong_stopword <- segment(code = love_song, jiebar = jieba_tokenizer)

# Preview the first 200 tokens.
lovesong_stopword[seq_len(200)]
##   [1] "還沒"      "櫻花季"    "還沒"      "照相機"    "還沒"     
##   [6] "光臨"      "餐廳"      "期待"      "有著"      "旅行"     
##  [11] "等待"      "日落"      "巴黎"      "鐵塔"      "之下"     
##  [16] "等待"      "願意"      "等待"      "一點一滴"  "珍惜"     
##  [21] "好好的"    "愛你"      "倒數"      "剩下"      "快樂"     
##  [26] "相擁"      "狂熱"      "永遠"      "深刻"      "心跳"     
##  [31] "倒數"      "生命"      "剩下"      "溫熱"      "至少"     
##  [36] "烏黑"      "頭髮"      "漆黑"      "過後"      "旭日"     
##  [41] "淚流"      "堅持"      "日"        "復"        "日"       
##  [46] "放棄"      "重複"      "願意"      "還沒"      "退化"     
##  [51] "眼睛"      "抓緊時間"  "不止息"    "風景"      "新"       
##  [56] "生命"      "咖啡"      "酸"        "晚餐"      "不吃"     
##  [61] "冷"        "錯過"      "後悔"      "夢"        "未"       
##  [66] "實現"      "醒"        "心"        "開過"      "灰"       
##  [71] "追逐"      "旅途"      "曲折"      "曲折"      "至少"     
##  [76] "痛"        "快樂"      "愛過"      "算"        "活著"     
##  [81] "別無所求"  "still"     "same"      "有變"      "Nothing"  
##  [86] "changed"   "討厭"      "下雨天"    "認錯"      "脾氣"     
##  [91] "硬"        "點"        "清楚"      "辦法"      "改變"     
##  [96] "後悔"      "高中"      "花錢"      "裝很吵"    "排氣管"   
## [101] "努力"      "賺錢"      "養"        "養成"      "壞習慣"   
## [106] "晃晃"      "玩"        "without"   "feels"     "like"     
## [111] "情人節"    "台"        "相機"      "快門"      "張"       
## [116] "書桌"      "檯燈"      "without"   "feels"     "likeA"    
## [121] "sentence"  "without"   "spacesIt"  "like"      "youtube"  
## [126] "without"   "playlist"  "chapter"   "without"   "pages"    
## [131] "without"   "feels"     "wrong"     "live"      "without"  
## [136] "alone"     "想念"      "角度"      "重"        "You"      
## [141] "know"      "feel"      "inside"    "live"      "without"  
## [146] "辦法"      "been"      "through"   "重蹈"      "覆徹"     
## [151] "can"       "沒得"      "負責"      "damn"      "bad"      
## [156] "need"      "right"     "面子"      "兩邊"      "榨乾"     
## [161] "搶劫"      "Goddamn"   "got"       "nothing"   "left"     
## [166] "掉進去"    "fun"       "down"      "here"      "救救"     
## [171] "call"      "my"        "friends"   "訴苦"      "好幾遍"   
## [176] "振作"      "點"        "know"      "man"       "but"      
## [181] "fuckin"    "can"       "刪了"      "instagram" "Your"     
## [186] "post"      "see"       "promise"   "一支"      "Hennessy" 
## [191] "know"      "baby"      "why"       "gonna"     "go"       
## [196] "捨不得"    "放手"      "yeah"      "失去"      "失去"

###計算每一個字詞出現次數

# Wrap the token vector in a data frame; its single column is named
# `lovesong_stopword`.
lovesong_stopword <- as.data.frame(lovesong_stopword)

# Frequency table of tokens, most frequent first, keeping only tokens that
# occur more than 3 times.
lovesong_count <- lovesong_stopword %>%
  count(lovesong_stopword, sort = TRUE) %>%
  filter(n > 3)

colnames(lovesong_count) <- c("word", "count")

# head() shows at most 200 rows; indexing with [1:200, ] would pad the result
# with all-NA rows whenever fewer than 200 words pass the filter.
head(lovesong_count, 200)
## # A tibble: 200 x 2
##    word    count
##    <fct>   <int>
##  1 腦公       28
##  2 親愛的     15
##  3 透明       12
##  4 想念       12
##  5 回憶       10
##  6 世界       10
##  7 忘記       10
##  8 微笑       10
##  9 without    10
## 10 愛你        9
## # … with 190 more rows

###情歌文字雲

# Keep only the words that occur more than 3 times.
lovesong_count <- filter(lovesong_count, count > 3)

# Static word cloud: most frequent words placed first, no rotated words,
# colours drawn from the 8-colour "Dark2" palette.
wordcloud(
  words = lovesong_count$word,
  freq = lovesong_count$count,
  scale = c(5, .2),
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(8, "Dark2"),
  ordered.colors = FALSE,
  family = "STHeitiTC-Light"
)

情緒分析

歌曲常用正負情緒詞

###載入需要的packages

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
library(reshape2)
library(tidyr)
library(readr)
library(scales)

###準備LIWC字典 以LIWC字典判斷文集中的word屬於正面字還是負面字

# Load the positive and negative word lists. Each file stores its entries as
# one comma-separated string.
P <- read_file("positive.txt")
N <- read_file("negative.txt")

# strsplit() returns a list with one element per input string, so take the
# first element. Trim surrounding whitespace (e.g. a newline at the end of the
# file or a space after a comma) and drop empty entries; an untrimmed entry
# could never match a token in the joins below.
P <- trimws(strsplit(P, ",")[[1]])
P <- P[nzchar(P)]
N <- trimws(strsplit(N, ",")[[1]])
N <- N[nzchar(N)]

# Build the lexicon: `word` holds the dictionary entry and `sentiment` its
# polarity label ("positive" or "negative").
P <- data.frame(word = P, sentiment = "positive")
N <- data.frame(word = N, sentiment = "negative")
LIWC <- rbind(P, N)
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

###與LIWC情緒字典做join 文集中的字出現在LIWC字典中是屬於positive還是negative

# Keep only the love-song words that appear in the LIWC lexicon and attach
# their sentiment label (the join key is inferred from the shared column).
lovesong_count <- inner_join(lovesong_count, LIWC)
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
lovesong_count
## # A tibble: 28 x 3
##    word   count sentiment
##    <chr>  <int> <fct>    
##  1 親愛的    15 positive 
##  2 想念      12 positive 
##  3 微笑      10 positive 
##  4 朋友       9 positive 
##  5 眼淚       9 negative 
##  6 擁抱       9 positive 
##  7 折磨       9 negative 
##  8 瘋狂       7 negative 
##  9 後悔       7 negative 
## 10 快樂       7 positive 
## # … with 18 more rows
# Keep only the chart-song words that appear in the LIWC lexicon and attach
# their sentiment label (the join key is inferred from the shared column).
allsong_count <- inner_join(allsong_count, LIWC)
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
allsong_count
## # A tibble: 41 x 3
##    word   count sentiment
##    <chr>  <int> <fct>    
##  1 難過      20 negative 
##  2 愛情      19 positive 
##  3 快樂      18 positive 
##  4 擁抱      18 positive 
##  5 親愛的    17 positive 
##  6 抱歉      14 negative 
##  7 害怕      14 negative 
##  8 朋友      14 positive 
##  9 微笑      14 positive 
## 10 想念      14 positive 
## # … with 31 more rows

###排行榜歌曲正負情緒詞彙排名

# Bar chart of the most frequent sentiment words across all chart songs,
# one panel per sentiment.
allsong_count %>%
  # Rank within each sentiment. Grouping by `word` as well would leave
  # (practically) one row per group, turning top_n(30) into a no-op.
  group_by(sentiment) %>%
  top_n(30, wt = count) %>%
  ungroup() %>%
  filter(count > 5) %>%
  # Order the bars by frequency.
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  # NOTE(review): this font family is presumably chosen so the Chinese
  # labels render; confirm it is installed on the target machine.
  theme(text = element_text(size = 14, family = "STHeitiTC-Light")) +
  coord_flip()

###排行榜情歌正負情緒詞彙排名

# Bar chart of the most frequent sentiment words in the love songs,
# one panel per sentiment.
lovesong_count %>%
  # Rank within each sentiment. Grouping by `word` as well would leave
  # (practically) one row per group, turning top_n(30) into a no-op.
  group_by(sentiment) %>%
  top_n(30, wt = count) %>%
  ungroup() %>%
  filter(count > 2) %>%
  # Order the bars by frequency.
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  # NOTE(review): this font family is presumably chosen so the Chinese
  # labels render; confirm it is installed on the target machine.
  theme(text = element_text(size = 14, family = "STHeitiTC-Light")) +
  coord_flip()

排行榜各歌曲正負情緒詞個數

# Load the per-song lyrics table (Big-5 encoded CSV) and make sure the
# Lyric column is plain character data before segmenting it.
all_song_rank <- read.csv(
  "Lyric_All.csv",
  fileEncoding = "Big-5",
  fill = TRUE
)
all_song_rank[["Lyric"]] <- as.character(all_song_rank[["Lyric"]])

# Segment all lyrics with the `jieba_tokenizer` currently in scope and
# preview the first 200 tokens.
all_song_rank_jieba <- segment(
  code = all_song_rank[["Lyric"]],
  jiebar = jieba_tokenizer
)
all_song_rank_jieba[seq_len(200)]
##   [1] "落寞"     "沮喪"     "失望"     "人生"     "迷惘"     "偽裝"    
##   [7] "不痛"     "逞強"     "愛情"     "害怕"     "觸碰"     "放棄"    
##  [13] "掙扎"     "眼睛"     "記著"     "聲音"     "無畏"     "風"      
##  [19] "雨"       "別忘記"   "太陽"     "太陽"     "心"       "心底"    
##  [25] "遠方"     "害怕"     "身旁"     "太陽"     "太陽"     "心"      
##  [31] "心底"     "身旁"     "奮力"     "發光"     "落寞"     "沮喪"    
##  [37] "失望"     "人生"     "迷惘"     "偽裝"     "不痛"     "逞強"    
##  [43] "愛情"     "害怕"     "觸碰"     "放棄"     "掙扎"     "眼睛"    
##  [49] "記著"     "聲音"     "無畏"     "風"       "雨"       "別忘記"  
##  [55] "太陽"     "太陽"     "心"       "心底"     "遠方"     "害怕"    
##  [61] "身旁"     "太陽"     "太陽"     "心"       "心底"     "身旁"    
##  [67] "奮力"     "發光"     "記得"     "所有的"   "幸福"     "太陽"    
##  [73] "太陽"     "心"       "心底"     "遠方"     "害怕"     "身旁"    
##  [79] "太陽"     "太陽"     "心"       "心底"     "身旁"     "奮力"    
##  [85] "發光"     "藍色"     "最愛"     "顏色"     "燦爛"     "笑容"    
##  [91] "擁抱"     "面孔"     "消失"     "我錯了"   "搞錯"     "天灰"    
##  [97] "雨"       "凝望"     "回不去"   "快樂"     "輕易"     "結束"    
## [103] "抱著"     "哭"       "努力"     "祝福"     "心如刀割" "頑固"    
## [109] "絕口不提" "幸福"     "懷念"     "失去"     "曉得"     "藍色"    
## [115] "最愛"     "顏色"     "燦爛"     "笑容"     "擁抱"     "面孔"    
## [121] "消失"     "我錯了"   "搞錯"     "越痛"     "苦"       "約"      
## [127] "快樂"     "輕易"     "結束"     "抱著"     "哭"       "努力"    
## [133] "祝福"     "祝福"     "最愛"     "愛你"     "飛翔"     "可憐"    
## [139] "受傷"     "一個人"   "可憐"     "永遠"     "輕易"     "結束"    
## [145] "抱著"     "哭"       "努力"     "祝福"     "祝福"     "最愛"    
## [151] "永遠"     "飛翔"     "頑固"     "絕口不提" "幸福"     "懷念"    
## [157] "失去"     "曉得"     "落寞"     "沮喪"     "失望"     "人生"    
## [163] "迷惘"     "偽裝"     "不痛"     "逞強"     "愛情"     "害怕"    
## [169] "觸碰"     "放棄"     "掙扎"     "眼睛"     "記著"     "聲音"    
## [175] "無畏"     "風"       "雨"       "別忘記"   "太陽"     "太陽"    
## [181] "心"       "心底"     "遠方"     "害怕"     "身旁"     "太陽"    
## [187] "太陽"     "心"       "心底"     "身旁"     "奮力"     "發光"    
## [193] "落寞"     "沮喪"     "失望"     "人生"     "迷惘"     "偽裝"    
## [199] "不痛"     "逞強"
# Engine used by the tokenizer function below: custom dictionary plus
# stop-word list.
jieba_tokenizer <- worker(
  user = "user_dict_All.txt",
  stop_word = "stop_words_All.txt"
)

# Tokenizer callback: segments every element of `t` with the global
# `jieba_tokenizer` engine and returns a list holding one segmentation
# result per input element.
song_tokenizer <- function(t) {
  lapply(t, function(lyric) segment(lyric, jieba_tokenizer))
}
# Expand the Lyric column into one row per token (column `word`), using the
# custom jieba-based tokenizer.
tokens <- unnest_tokens(
  all_song_rank,
  word,
  Lyric,
  token = song_tokenizer
)

# Inspect the structure and the first 100 rows of the result.
str(tokens)
## 'data.frame':    5408 obs. of  2 variables:
##  $ Rank: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word: chr  "落寞" "沮喪" "失望" "人生" ...
head(tokens, n = 100)
##      Rank   word
## 1       1   落寞
## 1.1     1   沮喪
## 1.2     1   失望
## 1.3     1   人生
## 1.4     1   迷惘
## 1.5     1   偽裝
## 1.6     1   不痛
## 1.7     1   逞強
## 1.8     1   愛情
## 1.9     1   害怕
## 1.10    1   觸碰
## 1.11    1   放棄
## 1.12    1   掙扎
## 1.13    1   眼睛
## 1.14    1   記著
## 1.15    1   聲音
## 1.16    1   無畏
## 1.17    1     風
## 1.18    1     雨
## 1.19    1 別忘記
## 1.20    1   太陽
## 1.21    1   太陽
## 1.22    1     心
## 1.23    1   心底
## 1.24    1   遠方
## 1.25    1   害怕
## 1.26    1   身旁
## 1.27    1   太陽
## 1.28    1   太陽
## 1.29    1     心
## 1.30    1   心底
## 1.31    1   身旁
## 1.32    1   奮力
## 1.33    1   發光
## 1.34    1   落寞
## 1.35    1   沮喪
## 1.36    1   失望
## 1.37    1   人生
## 1.38    1   迷惘
## 1.39    1   偽裝
## 1.40    1   不痛
## 1.41    1   逞強
## 1.42    1   愛情
## 1.43    1   害怕
## 1.44    1   觸碰
## 1.45    1   放棄
## 1.46    1   掙扎
## 1.47    1   眼睛
## 1.48    1   記著
## 1.49    1   聲音
## 1.50    1   無畏
## 1.51    1     風
## 1.52    1     雨
## 1.53    1 別忘記
## 1.54    1   太陽
## 1.55    1   太陽
## 1.56    1     心
## 1.57    1   心底
## 1.58    1   遠方
## 1.59    1   害怕
## 1.60    1   身旁
## 1.61    1   太陽
## 1.62    1   太陽
## 1.63    1     心
## 1.64    1   心底
## 1.65    1   身旁
## 1.66    1   奮力
## 1.67    1   發光
## 1.68    1   記得
## 1.69    1 所有的
## 1.70    1   幸福
## 1.71    1   太陽
## 1.72    1   太陽
## 1.73    1     心
## 1.74    1   心底
## 1.75    1   遠方
## 1.76    1   害怕
## 1.77    1   身旁
## 1.78    1   太陽
## 1.79    1   太陽
## 1.80    1     心
## 1.81    1   心底
## 1.82    1   身旁
## 1.83    1   奮力
## 1.84    1   發光
## 2       2   藍色
## 2.1     2   最愛
## 2.2     2   顏色
## 2.3     2   燦爛
## 2.4     2   笑容
## 2.5     2   擁抱
## 2.6     2   面孔
## 2.7     2   消失
## 2.8     2 我錯了
## 2.9     2   搞錯
## 2.10    2   天灰
## 2.11    2     雨
## 2.12    2   凝望
## 2.13    2 回不去
## 2.14    2   快樂

###計算詞彙的出現次數

# Count how often each word occurs within each song (identified by Rank),
# most frequent word/song pairs first.
tokens_count <- tokens %>%
  group_by(word, Rank) %>%
  summarise(count = n()) %>%
  # summarise() only removes the last grouping level; drop the leftover
  # `word` grouping so later verbs work on the whole table.
  ungroup() %>%
  arrange(desc(count))

###與LIWC情緒字典做join

# Per song (Rank) and sentiment, total number of occurrences of words found
# in the LIWC lexicon.
song_sentiment_count <- tokens_count %>%
  # State the join key explicitly instead of relying on the inferred one.
  inner_join(LIWC, by = "word") %>%
  group_by(Rank, sentiment) %>%
  summarise(count = sum(count)) %>%
  # Drop the grouping by Rank that summarise() leaves behind.
  ungroup()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# Line chart of sentiment-word counts by chart rank, one line per sentiment.
ggplot(song_sentiment_count) +
  geom_line(
    mapping = aes(x = Rank, y = count, colour = sentiment)
  )