###系統參數設定
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
###載入需要的packages
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
require(ggplot2)
## Loading required package: ggplot2
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
require(readr)
## Loading required package: readr
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
require(knitr)
## Loading required package: knitr
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
require(tm)
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
#資料來源:mymusic.net「KTV熱唱必備►國語K歌排行超精選」 https://www.mymusic.net.tw/ux/w/themeInfo/3354
### Initialise the segmentation engine
# Build a jiebaR worker with all-default settings
jieba_tokenizer <- worker()
### Segment the lyrics with the default settings
all_song <- read_file("Lyric_All.txt")
allsong_jieba <- segment(code = all_song, jiebar = jieba_tokenizer)
# Preview the first 200 tokens
allsong_jieba[seq_len(200)]
## [1] "你" "總" "感到" "落寞" "沮喪" "你" "總"
## [8] "感到" "失望" "對於" "人生" "未來" "總有" "太多"
## [15] "迷惘" "你" "總偽裝" "自己" "不痛" "你" "總笑"
## [22] "著" "逞強" "對於" "愛情" "害怕" "觸碰" "放棄"
## [29] "掙扎" "你" "看" "著" "我" "眼睛" "你"
## [36] "記著" "我" "聲音" "無畏" "風雨" "別忘記" "還有"
## [43] "我" "站" "在" "這裡" "我" "只" "想"
## [50] "做" "你" "的" "太陽" "你" "的" "太陽"
## [57] "在" "你" "的" "心裡" "呀" "在" "你"
## [64] "的" "心底" "呀" "不管" "是" "多" "遠"
## [71] "的" "遠方" "不要" "害怕" "我" "在" "身旁"
## [78] "想" "做" "你" "的" "太陽" "你" "的"
## [85] "太陽" "在" "你" "的" "心裡" "呀" "在"
## [92] "你" "的" "心底" "呀" "就算" "不能" "在"
## [99] "你" "身旁" "也" "要" "奮力" "為" "你"
## [106] "而" "發光" "你" "總" "感到" "落寞" "沮喪"
## [113] "你" "總" "感到" "失望" "對於" "人生" "未來"
## [120] "總有" "太多" "迷惘" "你" "總偽裝" "自己" "不痛"
## [127] "你" "總笑" "著" "逞強" "對於" "愛情" "害怕"
## [134] "觸碰" "放棄" "掙扎" "你" "看" "著" "我"
## [141] "眼睛" "你" "記著" "我" "聲音" "無畏" "風雨"
## [148] "別忘記" "還有" "我" "站" "在" "這裡" "我"
## [155] "只" "想" "做" "你" "的" "太陽" "你"
## [162] "的" "太陽" "在" "你" "的" "心裡" "呀"
## [169] "在" "你" "的" "心底" "呀" "不管" "是"
## [176] "多" "遠" "的" "遠方" "不要" "害怕" "我"
## [183] "在" "身旁" "想" "做" "你" "的" "太陽"
## [190] "你" "的" "太陽" "在" "你" "的" "心裡"
## [197] "呀" "在" "你" "的"
# 抓取前50名歌曲,依每首歌曲的特性建立各自的user_dict,再將所有user_dict彙整並去除重複字詞。
### Segment again with a user-defined dictionary loaded from an external file
jieba_tokenizer <- worker(user = "user_dict_All.txt")
allsong_userdict <- segment(code = all_song, jiebar = jieba_tokenizer)
# Preview the first 200 tokens
allsong_userdict[seq_len(200)]
## [1] "你" "總" "感到" "落寞" "沮喪" "你" "總"
## [8] "感到" "失望" "對於" "人生" "未來" "總有" "太"
## [15] "多" "迷惘" "你" "總" "偽裝" "自己" "不痛"
## [22] "你" "總" "笑著" "逞強" "對於" "愛情" "害怕"
## [29] "觸碰" "放棄" "掙扎" "你" "看著" "我" "眼睛"
## [36] "你" "記著" "我" "聲音" "無畏" "風" "雨"
## [43] "別忘記" "還有" "我" "站" "在" "這" "裡"
## [50] "我" "只想" "做" "你" "的" "太陽" "你"
## [57] "的" "太陽" "在" "你" "的" "心" "裡"
## [64] "呀" "在" "你" "的" "心底" "呀" "不管"
## [71] "是" "多遠" "的" "遠方" "不要" "害怕" "我"
## [78] "在" "身旁" "想" "做" "你" "的" "太陽"
## [85] "你" "的" "太陽" "在" "你" "的" "心"
## [92] "裡" "呀" "在" "你" "的" "心底" "呀"
## [99] "就算" "不能" "在" "你" "身旁" "也" "要"
## [106] "奮力" "為" "你" "而" "發光" "你" "總"
## [113] "感到" "落寞" "沮喪" "你" "總" "感到" "失望"
## [120] "對於" "人生" "未來" "總有" "太" "多" "迷惘"
## [127] "你" "總" "偽裝" "自己" "不痛" "你" "總"
## [134] "笑著" "逞強" "對於" "愛情" "害怕" "觸碰" "放棄"
## [141] "掙扎" "你" "看著" "我" "眼睛" "你" "記著"
## [148] "我" "聲音" "無畏" "風" "雨" "別忘記" "還有"
## [155] "我" "站" "在" "這" "裡" "我" "只想"
## [162] "做" "你" "的" "太陽" "你" "的" "太陽"
## [169] "在" "你" "的" "心" "裡" "呀" "在"
## [176] "你" "的" "心底" "呀" "不管" "是" "多遠"
## [183] "的" "遠方" "不要" "害怕" "我" "在" "身旁"
## [190] "想" "做" "你" "的" "太陽" "你" "的"
## [197] "太陽" "在" "你" "的"
# 抓取前50名歌曲,依每首歌曲的特性建立各自的stop_word,再將所有stop_word彙整並去除重複字詞。
### Segment with both the user dictionary and a stop-word list, each loaded
### from an external file
jieba_tokenizer <- worker(
  user = "user_dict_All.txt",
  stop_word = "stop_words_All.txt"
)
allsong_stopword <- segment(code = all_song, jiebar = jieba_tokenizer)
# Preview the first 200 tokens
allsong_stopword[seq_len(200)]
## [1] "落寞" "沮喪" "失望" "人生" "迷惘" "偽裝"
## [7] "不痛" "逞強" "愛情" "害怕" "觸碰" "放棄"
## [13] "掙扎" "眼睛" "記著" "聲音" "無畏" "風"
## [19] "雨" "別忘記" "太陽" "太陽" "心" "心底"
## [25] "遠方" "害怕" "身旁" "太陽" "太陽" "心"
## [31] "心底" "身旁" "奮力" "發光" "落寞" "沮喪"
## [37] "失望" "人生" "迷惘" "偽裝" "不痛" "逞強"
## [43] "愛情" "害怕" "觸碰" "放棄" "掙扎" "眼睛"
## [49] "記著" "聲音" "無畏" "風" "雨" "別忘記"
## [55] "太陽" "太陽" "心" "心底" "遠方" "害怕"
## [61] "身旁" "太陽" "太陽" "心" "心底" "身旁"
## [67] "奮力" "發光" "記得" "所有的" "幸福" "太陽"
## [73] "太陽" "心" "心底" "遠方" "害怕" "身旁"
## [79] "太陽" "太陽" "心" "心底" "身旁" "奮力"
## [85] "發光" "藍色" "最愛" "顏色" "燦爛" "笑容"
## [91] "擁抱" "面孔" "消失" "我錯了" "搞錯" "天灰"
## [97] "雨" "凝望" "回不去" "快樂" "輕易" "結束"
## [103] "抱著" "哭" "努力" "祝福" "心如刀割" "頑固"
## [109] "絕口不提" "幸福" "懷念" "失去" "曉得" "藍色"
## [115] "最愛" "顏色" "燦爛" "笑容" "擁抱" "面孔"
## [121] "消失" "我錯了" "搞錯" "越痛" "苦" "約"
## [127] "快樂" "輕易" "結束" "抱著" "哭" "努力"
## [133] "祝福" "祝福" "最愛" "愛你" "飛翔" "可憐"
## [139] "受傷" "一個人" "可憐" "永遠" "輕易" "結束"
## [145] "抱著" "哭" "努力" "祝福" "祝福" "最愛"
## [151] "永遠" "飛翔" "頑固" "絕口不提" "幸福" "懷念"
## [157] "失去" "曉得" "落寞" "沮喪" "失望" "人生"
## [163] "迷惘" "偽裝" "不痛" "逞強" "愛情" "害怕"
## [169] "觸碰" "放棄" "掙扎" "眼睛" "記著" "聲音"
## [175] "無畏" "風" "雨" "別忘記" "太陽" "太陽"
## [181] "心" "心底" "遠方" "害怕" "身旁" "太陽"
## [187] "太陽" "心" "心底" "身旁" "奮力" "發光"
## [193] "落寞" "沮喪" "失望" "人生" "迷惘" "偽裝"
## [199] "不痛" "逞強"
### Count how often each token occurs
# as.data.frame() on the token vector yields a single column that is named
# after the variable, i.e. `allsong_stopword`.
allsong_stopword <- as.data.frame(allsong_stopword)
# Keep only tokens seen more than 5 times, most frequent first.
allsong_count <- allsong_stopword %>%
  count(allsong_stopword, sort = TRUE) %>%
  filter(n > 5)
colnames(allsong_count) <- c("word", "count")
# head() shows at most 200 rows; indexing with [1:200, ] would pad the result
# with NA rows whenever fewer than 200 words pass the filter.
head(allsong_count, 200)
## # A tibble: 200 x 2
## word count
## <fct> <int>
## 1 喵 67
## 2 騙吃 44
## 3 心 30
## 4 腦公 28
## 5 辣台妹 27
## 6 太陽 26
## 7 搖落 23
## 8 世界 21
## 9 夢 20
## 10 難過 20
## # … with 190 more rows
### Draw the word cloud from the word/count table
wordcloud2(allsong_count)
### Segment the love-song lyrics with the default settings
love_song <- read_file("song.txt")
# Use a fresh default worker here: `jieba_tokenizer` still carries the user
# dictionary and stop-word list configured earlier, so reusing it would make
# this step a duplicate of the stop-word step instead of a default baseline.
lovesong_jieba <- segment(love_song, worker())
# head() avoids NA padding when there are fewer than 200 tokens.
head(lovesong_jieba, 200)
## [1] "還沒" "櫻花季" "還沒" "照相機" "還沒"
## [6] "光臨" "餐廳" "期待" "有著" "旅行"
## [11] "等待" "日落" "巴黎" "鐵塔" "之下"
## [16] "等待" "願意" "等待" "一點一滴" "珍惜"
## [21] "好好的" "愛你" "倒數" "剩下" "快樂"
## [26] "相擁" "狂熱" "永遠" "深刻" "心跳"
## [31] "倒數" "生命" "剩下" "溫熱" "至少"
## [36] "烏黑" "頭髮" "漆黑" "過後" "旭日"
## [41] "淚流" "堅持" "日" "復" "日"
## [46] "放棄" "重複" "願意" "還沒" "退化"
## [51] "眼睛" "抓緊時間" "不止息" "風景" "新"
## [56] "生命" "咖啡" "酸" "晚餐" "不吃"
## [61] "冷" "錯過" "後悔" "夢" "未"
## [66] "實現" "醒" "心" "開過" "灰"
## [71] "追逐" "旅途" "曲折" "曲折" "至少"
## [76] "痛" "快樂" "愛過" "算" "活著"
## [81] "別無所求" "still" "same" "有變" "Nothing"
## [86] "changed" "討厭" "下雨天" "認錯" "脾氣"
## [91] "硬" "點" "清楚" "辦法" "改變"
## [96] "後悔" "高中" "花錢" "裝很吵" "排氣管"
## [101] "努力" "賺錢" "養" "養成" "壞習慣"
## [106] "晃晃" "玩" "without" "feels" "like"
## [111] "情人節" "台" "相機" "快門" "張"
## [116] "書桌" "檯燈" "without" "feels" "likeA"
## [121] "sentence" "without" "spacesIt" "like" "youtube"
## [126] "without" "playlist" "chapter" "without" "pages"
## [131] "without" "feels" "wrong" "live" "without"
## [136] "alone" "想念" "角度" "重" "You"
## [141] "know" "feel" "inside" "live" "without"
## [146] "辦法" "been" "through" "重蹈" "覆徹"
## [151] "can" "沒得" "負責" "damn" "bad"
## [156] "need" "right" "面子" "兩邊" "榨乾"
## [161] "搶劫" "Goddamn" "got" "nothing" "left"
## [166] "掉進去" "fun" "down" "here" "救救"
## [171] "call" "my" "friends" "訴苦" "好幾遍"
## [176] "振作" "點" "know" "man" "but"
## [181] "fuckin" "can" "刪了" "instagram" "Your"
## [186] "post" "see" "promise" "一支" "Hennessy"
## [191] "know" "baby" "why" "gonna" "go"
## [196] "捨不得" "放手" "yeah" "失去" "失去"
### Segment the love songs with the user-defined dictionary (external file)
jieba_tokenizer <- worker(user = "user_dict_All.txt")
lovesong_userdict <- segment(code = love_song, jiebar = jieba_tokenizer)
# Preview the first 200 tokens
lovesong_userdict[seq_len(200)]
## [1] "還沒" "到" "的" "櫻花季" "還沒" "用"
## [7] "的" "照相機" "還沒" "光臨" "的" "餐廳"
## [13] "還" "在" "期待" "有著" "你" "的"
## [19] "旅行" "等待" "日落" "的" "巴黎" "鐵塔"
## [25] "之下" "牽著" "你" "等待" "說" "著"
## [31] "我" "願意" "等待" "未來" "每天" "身邊"
## [37] "有" "你" "一點一滴" "每" "一天" "珍惜"
## [43] "怕" "突然" "來不及" "好好的" "愛你" "時針"
## [49] "一直" "倒數" "著" "我們" "剩下" "的"
## [55] "快樂" "此刻" "相擁" "的" "狂熱" "卻"
## [61] "永遠" "都" "深刻" "心跳" "一直" "倒數"
## [67] "著" "生命" "剩下" "的" "溫熱" "至少"
## [73] "用力" "地" "愛" "著" "還" "烏黑"
## [79] "的" "頭髮" "有" "你" "就" "不怕"
## [85] "白" "了" "漆黑" "過後" "是" "旭日"
## [91] "淚流" "以後" "是" "堅持" "真的" "愛"
## [97] "是" "日" "復" "日" "從不" "放棄"
## [103] "重複" "說" "你" "願意" "還沒" "退化"
## [109] "的" "眼睛" "抓緊時間" "看看" "你" "愛"
## [115] "是" "從來" "不止息" "一個" "風景" "每天"
## [121] "新" "的" "生命" "咖啡" "再" "不"
## [127] "喝" "就" "酸" "了" "晚餐" "再"
## [133] "不吃" "就" "冷" "了" "愛" "著"
## [139] "為" "什麼" "不" "說" "呢" "難道"
## [145] "錯過" "了" "才" "來" "後悔" "著"
## [151] "誰" "夢" "未" "實現" "就" "醒"
## [157] "了" "誰" "心" "沒" "開過" "就"
## [163] "灰" "了" "追逐" "愛" "的" "旅途"
## [169] "曲折" "就算" "再" "曲折" "為" "你"
## [175] "都" "值得" "至少" "痛" "並" "快樂"
## [181] "著" "愛過" "才" "算" "活著" "有"
## [187] "你" "別無所求" "了" "I" "m" "still"
## [193] "the" "same" "都" "好像" "沒" "有變"
## [199] "Nothing" "changed"
### Segment the love songs with the user dictionary plus the stop-word list,
### each loaded from an external file
jieba_tokenizer <- worker(
  user = "user_dict_All.txt",
  stop_word = "stop_words_All.txt"
)
lovesong_stopword <- segment(code = love_song, jiebar = jieba_tokenizer)
# Preview the first 200 tokens
lovesong_stopword[seq_len(200)]
## [1] "還沒" "櫻花季" "還沒" "照相機" "還沒"
## [6] "光臨" "餐廳" "期待" "有著" "旅行"
## [11] "等待" "日落" "巴黎" "鐵塔" "之下"
## [16] "等待" "願意" "等待" "一點一滴" "珍惜"
## [21] "好好的" "愛你" "倒數" "剩下" "快樂"
## [26] "相擁" "狂熱" "永遠" "深刻" "心跳"
## [31] "倒數" "生命" "剩下" "溫熱" "至少"
## [36] "烏黑" "頭髮" "漆黑" "過後" "旭日"
## [41] "淚流" "堅持" "日" "復" "日"
## [46] "放棄" "重複" "願意" "還沒" "退化"
## [51] "眼睛" "抓緊時間" "不止息" "風景" "新"
## [56] "生命" "咖啡" "酸" "晚餐" "不吃"
## [61] "冷" "錯過" "後悔" "夢" "未"
## [66] "實現" "醒" "心" "開過" "灰"
## [71] "追逐" "旅途" "曲折" "曲折" "至少"
## [76] "痛" "快樂" "愛過" "算" "活著"
## [81] "別無所求" "still" "same" "有變" "Nothing"
## [86] "changed" "討厭" "下雨天" "認錯" "脾氣"
## [91] "硬" "點" "清楚" "辦法" "改變"
## [96] "後悔" "高中" "花錢" "裝很吵" "排氣管"
## [101] "努力" "賺錢" "養" "養成" "壞習慣"
## [106] "晃晃" "玩" "without" "feels" "like"
## [111] "情人節" "台" "相機" "快門" "張"
## [116] "書桌" "檯燈" "without" "feels" "likeA"
## [121] "sentence" "without" "spacesIt" "like" "youtube"
## [126] "without" "playlist" "chapter" "without" "pages"
## [131] "without" "feels" "wrong" "live" "without"
## [136] "alone" "想念" "角度" "重" "You"
## [141] "know" "feel" "inside" "live" "without"
## [146] "辦法" "been" "through" "重蹈" "覆徹"
## [151] "can" "沒得" "負責" "damn" "bad"
## [156] "need" "right" "面子" "兩邊" "榨乾"
## [161] "搶劫" "Goddamn" "got" "nothing" "left"
## [166] "掉進去" "fun" "down" "here" "救救"
## [171] "call" "my" "friends" "訴苦" "好幾遍"
## [176] "振作" "點" "know" "man" "but"
## [181] "fuckin" "can" "刪了" "instagram" "Your"
## [186] "post" "see" "promise" "一支" "Hennessy"
## [191] "know" "baby" "why" "gonna" "go"
## [196] "捨不得" "放手" "yeah" "失去" "失去"
### Count how often each token occurs in the love songs
# as.data.frame() on the token vector yields a single column that is named
# after the variable, i.e. `lovesong_stopword`.
lovesong_stopword <- as.data.frame(lovesong_stopword)
# Keep only tokens seen more than 3 times, most frequent first.
lovesong_count <- lovesong_stopword %>%
  count(lovesong_stopword, sort = TRUE) %>%
  filter(n > 3)
colnames(lovesong_count) <- c("word", "count")
# head() shows at most 200 rows; indexing with [1:200, ] would pad the result
# with NA rows whenever fewer than 200 words pass the filter.
head(lovesong_count, 200)
## # A tibble: 200 x 2
## word count
## <fct> <int>
## 1 腦公 28
## 2 親愛的 15
## 3 透明 12
## 4 想念 12
## 5 回憶 10
## 6 世界 10
## 7 忘記 10
## 8 微笑 10
## 9 without 10
## 10 愛你 9
## # … with 190 more rows
### Word cloud for the love songs
# Keep only words that occur more than 3 times
lovesong_count <- filter(lovesong_count, count > 3)
wordcloud(
  words = lovesong_count$word,
  freq = lovesong_count$count,
  scale = c(5, .2),
  random.order = FALSE,
  rot.per = FALSE,
  colors = brewer.pal(8, "Dark2"),
  ordered.colors = FALSE,
  family = "STHeitiTC-Light"
)
###載入需要的packages
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(reshape2)
library(tidyr)
library(readr)
library(scales)
### Prepare the LIWC dictionary, used to decide whether a word in the corpus
### is positive or negative
# Positive dictionary text file; entries are separated by ","
P <- read_file("positive.txt")
# Negative dictionary text file, same layout
N <- read_file("negative.txt")
# strsplit() returns a list, so take its first element to get the word vector.
# fixed = TRUE splits on a literal comma. trimws() removes surrounding
# whitespace (e.g. a newline at the end of the file) that would otherwise stay
# attached to a word and prevent it from matching in later joins.
P <- trimws(strsplit(P, ",", fixed = TRUE)[[1]])
N <- trimws(strsplit(N, ",", fixed = TRUE)[[1]])
# Drop empty entries left by trailing or doubled separators.
P <- P[nzchar(P)]
N <- N[nzchar(N)]
# Build data frames with two columns: `word` (the dictionary entry) and
# `sentiment`. Columns are kept as character so joins on `word` do not have to
# coerce factors; rep() keeps the construction valid for an empty word list.
P <- data.frame(
  word = P,
  sentiment = rep("positive", length(P)),
  stringsAsFactors = FALSE
)
N <- data.frame(
  word = N,
  sentiment = rep("negative", length(N)),
  stringsAsFactors = FALSE
)
LIWC <- rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
### Join with the LIWC sentiment dictionary: keep the corpus words that appear
### in LIWC and tag each one as positive or negative
lovesong_count <- inner_join(lovesong_count, LIWC)
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
lovesong_count
## # A tibble: 28 x 3
## word count sentiment
## <chr> <int> <fct>
## 1 親愛的 15 positive
## 2 想念 12 positive
## 3 微笑 10 positive
## 4 朋友 9 positive
## 5 眼淚 9 negative
## 6 擁抱 9 positive
## 7 折磨 9 negative
## 8 瘋狂 7 negative
## 9 後悔 7 negative
## 10 快樂 7 positive
## # … with 18 more rows
# Same LIWC join for the full chart corpus
allsong_count <- inner_join(allsong_count, LIWC)
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
allsong_count
## # A tibble: 41 x 3
## word count sentiment
## <chr> <int> <fct>
## 1 難過 20 negative
## 2 愛情 19 positive
## 3 快樂 18 positive
## 4 擁抱 18 positive
## 5 親愛的 17 positive
## 6 抱歉 14 negative
## 7 害怕 14 negative
## 8 朋友 14 positive
## 9 微笑 14 positive
## 10 想念 14 positive
## # … with 31 more rows
# Draw a horizontal bar chart of word counts, one facet per sentiment.
#
# counts:    data frame with the columns `word`, `count` and `sentiment`.
# min_count: only words whose count is strictly greater than this are drawn.
# top:       number of words kept per sentiment; top_n() keeps ties, so a few
#            more may be shown.
# Returns the ggplot object.
plot_sentiment_words <- function(counts, min_count, top = 30) {
  counts %>%
    # Rank within each sentiment. Grouping by word as well would turn every
    # group into a single row, so top_n() would never drop anything.
    group_by(sentiment) %>%
    top_n(top, wt = count) %>%
    ungroup() %>%
    # Order the bars by frequency before filtering out the rare words.
    mutate(word = reorder(word, count)) %>%
    filter(count > min_count) %>%
    ggplot(aes(word, count, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = "free_y") +
    labs(y = "Contribution to sentiment", x = NULL) +
    theme(text = element_text(size = 14, family = "STHeitiTC-Light")) +
    coord_flip()
}

### Positive/negative word ranking for the chart songs
plot_sentiment_words(allsong_count, min_count = 5)
### Positive/negative word ranking for the chart love songs
plot_sentiment_words(lovesong_count, min_count = 2)
# Load the lyrics table from the Big-5 encoded CSV
all_song_rank <- read.csv(
  file = "Lyric_All.csv",
  fileEncoding = "Big-5",
  fill = TRUE
)
# Make sure the lyrics column is plain character before segmenting it
all_song_rank[["Lyric"]] <- as.character(all_song_rank[["Lyric"]])
all_song_rank_jieba <- segment(
  code = all_song_rank[["Lyric"]],
  jiebar = jieba_tokenizer
)
# Preview the first 200 tokens
all_song_rank_jieba[seq_len(200)]
## [1] "落寞" "沮喪" "失望" "人生" "迷惘" "偽裝"
## [7] "不痛" "逞強" "愛情" "害怕" "觸碰" "放棄"
## [13] "掙扎" "眼睛" "記著" "聲音" "無畏" "風"
## [19] "雨" "別忘記" "太陽" "太陽" "心" "心底"
## [25] "遠方" "害怕" "身旁" "太陽" "太陽" "心"
## [31] "心底" "身旁" "奮力" "發光" "落寞" "沮喪"
## [37] "失望" "人生" "迷惘" "偽裝" "不痛" "逞強"
## [43] "愛情" "害怕" "觸碰" "放棄" "掙扎" "眼睛"
## [49] "記著" "聲音" "無畏" "風" "雨" "別忘記"
## [55] "太陽" "太陽" "心" "心底" "遠方" "害怕"
## [61] "身旁" "太陽" "太陽" "心" "心底" "身旁"
## [67] "奮力" "發光" "記得" "所有的" "幸福" "太陽"
## [73] "太陽" "心" "心底" "遠方" "害怕" "身旁"
## [79] "太陽" "太陽" "心" "心底" "身旁" "奮力"
## [85] "發光" "藍色" "最愛" "顏色" "燦爛" "笑容"
## [91] "擁抱" "面孔" "消失" "我錯了" "搞錯" "天灰"
## [97] "雨" "凝望" "回不去" "快樂" "輕易" "結束"
## [103] "抱著" "哭" "努力" "祝福" "心如刀割" "頑固"
## [109] "絕口不提" "幸福" "懷念" "失去" "曉得" "藍色"
## [115] "最愛" "顏色" "燦爛" "笑容" "擁抱" "面孔"
## [121] "消失" "我錯了" "搞錯" "越痛" "苦" "約"
## [127] "快樂" "輕易" "結束" "抱著" "哭" "努力"
## [133] "祝福" "祝福" "最愛" "愛你" "飛翔" "可憐"
## [139] "受傷" "一個人" "可憐" "永遠" "輕易" "結束"
## [145] "抱著" "哭" "努力" "祝福" "祝福" "最愛"
## [151] "永遠" "飛翔" "頑固" "絕口不提" "幸福" "懷念"
## [157] "失去" "曉得" "落寞" "沮喪" "失望" "人生"
## [163] "迷惘" "偽裝" "不痛" "逞強" "愛情" "害怕"
## [169] "觸碰" "放棄" "掙扎" "眼睛" "記著" "聲音"
## [175] "無畏" "風" "雨" "別忘記" "太陽" "太陽"
## [181] "心" "心底" "遠方" "害怕" "身旁" "太陽"
## [187] "太陽" "心" "心底" "身旁" "奮力" "發光"
## [193] "落寞" "沮喪" "失望" "人生" "迷惘" "偽裝"
## [199] "不痛" "逞強"
# Worker configured with the user dictionary and the stop-word list
jieba_tokenizer <- worker(
  stop_word = "stop_words_All.txt",
  user = "user_dict_All.txt"
)
# Tokenizer function: segments every element of `t` with the global
# `jieba_tokenizer` worker and returns a list holding one token vector per
# element.
song_tokenizer <- function(t) {
  lapply(t, segment, jiebar = jieba_tokenizer)
}
# Split the Lyric column into one row per token in a new `word` column,
# using the custom jieba-based tokenizer
tokens <- unnest_tokens(
  all_song_rank,
  output = word,
  input = Lyric,
  token = song_tokenizer
)
str(tokens)
## 'data.frame': 5408 obs. of 2 variables:
## $ Rank: int 1 1 1 1 1 1 1 1 1 1 ...
## $ word: chr "落寞" "沮喪" "失望" "人生" ...
head(tokens, 100)
## Rank word
## 1 1 落寞
## 1.1 1 沮喪
## 1.2 1 失望
## 1.3 1 人生
## 1.4 1 迷惘
## 1.5 1 偽裝
## 1.6 1 不痛
## 1.7 1 逞強
## 1.8 1 愛情
## 1.9 1 害怕
## 1.10 1 觸碰
## 1.11 1 放棄
## 1.12 1 掙扎
## 1.13 1 眼睛
## 1.14 1 記著
## 1.15 1 聲音
## 1.16 1 無畏
## 1.17 1 風
## 1.18 1 雨
## 1.19 1 別忘記
## 1.20 1 太陽
## 1.21 1 太陽
## 1.22 1 心
## 1.23 1 心底
## 1.24 1 遠方
## 1.25 1 害怕
## 1.26 1 身旁
## 1.27 1 太陽
## 1.28 1 太陽
## 1.29 1 心
## 1.30 1 心底
## 1.31 1 身旁
## 1.32 1 奮力
## 1.33 1 發光
## 1.34 1 落寞
## 1.35 1 沮喪
## 1.36 1 失望
## 1.37 1 人生
## 1.38 1 迷惘
## 1.39 1 偽裝
## 1.40 1 不痛
## 1.41 1 逞強
## 1.42 1 愛情
## 1.43 1 害怕
## 1.44 1 觸碰
## 1.45 1 放棄
## 1.46 1 掙扎
## 1.47 1 眼睛
## 1.48 1 記著
## 1.49 1 聲音
## 1.50 1 無畏
## 1.51 1 風
## 1.52 1 雨
## 1.53 1 別忘記
## 1.54 1 太陽
## 1.55 1 太陽
## 1.56 1 心
## 1.57 1 心底
## 1.58 1 遠方
## 1.59 1 害怕
## 1.60 1 身旁
## 1.61 1 太陽
## 1.62 1 太陽
## 1.63 1 心
## 1.64 1 心底
## 1.65 1 身旁
## 1.66 1 奮力
## 1.67 1 發光
## 1.68 1 記得
## 1.69 1 所有的
## 1.70 1 幸福
## 1.71 1 太陽
## 1.72 1 太陽
## 1.73 1 心
## 1.74 1 心底
## 1.75 1 遠方
## 1.76 1 害怕
## 1.77 1 身旁
## 1.78 1 太陽
## 1.79 1 太陽
## 1.80 1 心
## 1.81 1 心底
## 1.82 1 身旁
## 1.83 1 奮力
## 1.84 1 發光
## 2 2 藍色
## 2.1 2 最愛
## 2.2 2 顏色
## 2.3 2 燦爛
## 2.4 2 笑容
## 2.5 2 擁抱
## 2.6 2 面孔
## 2.7 2 消失
## 2.8 2 我錯了
## 2.9 2 搞錯
## 2.10 2 天灰
## 2.11 2 雨
## 2.12 2 凝望
## 2.13 2 回不去
## 2.14 2 快樂
### Count how many times each word occurs within each ranked song
# summarise() keeps only the grouping columns plus `count`; rows are then
# sorted with the most frequent word/rank pairs first.
tokens_count <- tokens %>%
  group_by(word, Rank) %>%
  summarise(count = n()) %>%
  arrange(-count)
### Join with the LIWC sentiment dictionary
# Keep only the words found in LIWC, then total their counts per song rank
# and sentiment. The join key is stated explicitly rather than inferred from
# the shared column names, and the result is ungrouped so later steps do not
# inherit the grouping by Rank.
song_sentiment_count <- tokens_count %>%
  select(word, Rank, count) %>%
  inner_join(LIWC, by = "word") %>%
  group_by(Rank, sentiment) %>%
  summarise(count = sum(count)) %>%
  ungroup()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# Line chart of the summed sentiment word counts against song rank,
# one coloured line per sentiment
ggplot(
  song_sentiment_count,
  aes(x = Rank, y = count, colour = sentiment)
) +
  geom_line()