Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""

安裝需要的packages

## Loading required package: readr
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: stringr
## Loading required package: jiebaR
## Loading required package: jiebaRD
## Loading required package: tidytext
## Loading required package: NLP
## Loading required package: tidyr
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Loading required package: ggraph
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## Loading required package: widyr
## Loading required package: knitr
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

載入資料集

#載入原始資料集
sekiro<- read_csv("./sekiro_test_articleMetaData.csv") %>%
              mutate(sentence=gsub("[\n]{2,}", "。", sentence))
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_integer(),
##   push = col_integer(),
##   boo = col_integer(),
##   sentence = col_character()
## )
sekiro %>% head(10) %>% kable %>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "300px")
artTitle artDate artTime artUrl artPoster artCat commentNum push boo sentence
[閒聊]隻狼的前世天誅 2019-03-19 14:37:41 https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html tom80727 PlayStation 107 64 0 https://www.youtube.com/watch?v=JIHv92a4MWs 我覺得這個影片說明真的貼切。沒有比天諸更忍者的遊戲了。連遊戲本身也是影子XD。話說我只有玩過2004年出的天誅 紅。他算是添誅系列遊戲性前幾的吧。只是故事沒有2代好。期待週五的隻狼
[情報]隻狼最新預告(中文翻譯)帥翻了阿!!! 2019-03-20 02:35:41 https://www.ptt.cc/bbs/PlayStation/M.1553078503.A.7DD.html howerlai1990 PlayStation 64 48 0 影片連結: https://youtu.be/0n2kkcK9KMQ 這預告帥到翻天阿 小弟就手刀翻譯了一下 希望能有更多人能體會隻狼的魅力阿!!。如果喜歡影片的話再麻煩訂閱支持一下 以後會更新其他新遊戲的預告喔
[實況]隻狼日版 2019-03-21 06:49:08 https://www.ptt.cc/bbs/PlayStation/M.1553180111.A.3C6.html lovelylion2 PlayStation 1 1 0 實況遊戲: [PS4] 隻狼 Sekiro 日版。實況網址: https://www.twitch.tv/lovelylion 03/21 22:55
[實況]Mera慢玩隻狼:暗影雙死00點開玩~ 2019-03-21 07:36:54 https://www.ptt.cc/bbs/PlayStation/M.1553182978.A.1EF.html chi1206 PlayStation 0 0 0 實況主: Mera。實況網址: https://www.twitch.tv/mera_1206/ 哈囉,大家晚上好 今天要來玩期待已久的 - 隻狼:暗影雙死!!。台主自己類魂遊戲玩過 仁王、黑魂1&3、血源。之前在玩的時候 遊戲中遇上問題都能直接向聊天室發問 (觀眾也很喜歡擺老就是了lol) 而這次的隻狼可以在跟大家同個起跑點上一起探索這遊戲了
[實況]小美SEKIRO:SHADOWDIETWICE隻狼 2019-03-21 07:50:32 https://www.ptt.cc/bbs/PlayStation/M.1553183795.A.3B7.html st12231 PlayStation 0 0 0 實況主播:小美。國家語系:中文。今天要播什麼:SEKIRO: SHADOW DIE TWICE 隻狼。點我連結:。Twitch: https://www.twitch.tv/st12231 FB: https://reurl.cc/bV0Dy YT: https://youtu.be/o0Q7TAG5RDs 繼血源詛咒後最期待的類暗魂遊戲,搶先玩1小時明晚繼續。家有女兒,隨時可能會哭要去報警她,請見諒。可以的話,順手按個追蹤幫解FB百人任務。讓我可以1080p最高畫質輸出 拜偷拜偷。直播純分享與紀錄。歡迎一起聊天喔~
[問題]請問隻狼主題要去哪裡下載 2019-03-21 14:09:48 https://www.ptt.cc/bbs/PlayStation/M.1553206550.A.99F.html AkaiNamida PlayStation 8 3 0 剛剛要裝隻狼主題的時候他說資料毀損。叫我刪掉重載。可是我刪掉之後就找不到要在哪裡下載回來了。求解
[實況]【塔利安】SEKIRO隻狼 2019-03-21 20:01:24 https://www.ptt.cc/bbs/PlayStation/M.1553227647.A.D7F.html sirakawabird PlayStation 3 2 0 直播主帳號:sirakawabird。國家語系:中文。今天要播什麼:SEKIRO:SHADOW DIE TWICE 隻狼 第一輪。點我連結: https://www.youtube.com/user/sirakawabird/live https://www.twitch.tv/sirakawabird 想跟觀眾說:。目前進度要去拿XX斬。可以居合的忍者超棒的哦哦哦哦。葦名奧義書真的要早點拿到 正面交鋒需要的技能都在葦名奧義書裡
[問題]ps4pro隻狼流暢度? 2019-03-22 04:15:39 https://www.ptt.cc/bbs/PlayStation/M.1553257302.A.9C6.html willie7878 PlayStation 145 33 19 如題 小弟我已經買了steam的隻狼 進入遊戲後就發現滿多問題的:。1.是手把沒有直接支援ds4,就算ps4手把能玩還是卡卡的(因為是xbox手把的圖示…… .)但是這種動作遊戲又不想用鍵鼠玩,很沒感覺,而且用滑鼠轉視角頭很暈(其他遊戲 完全沒暈過)。2.我是R5+1070,應該算配備不錯的桌機了 但遊玩下來的幀數其實並沒有說到非常高,大概也就是個穩60的狀態。現在有點猶豫 如果幀數沒差很多的話想退steam買ps4 跪求分享意見
[問題]隻狼是不是不適合路癡玩? 2019-03-22 10:56:23 https://www.ptt.cc/bbs/PlayStation/M.1553281345.A.4C1.html david0426 PlayStation 50 33 0 如題。看評分真的有夠高。表哥剛好也有買就借來玩。但是一直死掉重打蠻花時間的(好像是遊戲特色?。但我現在卡在回憶裡面的城池走不出來。因為沒地圖也不知道要去哪…。小怪都殺完了。也沒有任務提示什麼的。我想問大家如果今天打到一個定點。明天繼續打的話會知道往哪裡走嗎?。我已經在城池裡迷路了…。現在打到一心大人出去散步。跑回去問那個女的一心在哪。可是我不知道剛開始怎麼進來的啊QQ。在想是不是DMC5比較適合我…。好歹還有惡魔直覺可以用。隻狼是不是路癡不適合玩啊……。我反而覺得戰鬥不難 知道往哪走比較難
[心得]手殘的隻狼心得 2019-03-22 13:27:17 https://www.ptt.cc/bbs/PlayStation/M.1553290400.A.F94.html nobody1149 PlayStation 58 28 0 先說忍殺的特寫鏡頭真的是 又帥又中二!!!。玩過血源黑魂3仁王(前兩個有拼到白金) 我個人用自己血黑的套路打隻狼打得很痛苦…血源就槍返苦手黑魂舉盾癌,打王全靠閃有 空擋在補幾刀磨到王死…而隻狼完全不行啊!隻狼我覺得主要特色是架招,不斷的和王攻 防才能有效的積累體幹值打出忍殺,而且還不能瞎忙的防禦,下段要跳突刺看破普攻防禦 還要判斷是不是連攻,還不能L1連打要抓攻擊節奏一個一個按…真心難啊啊啊………還一 直按錯鍵。。地圖是很漂亮,但和風的城市就那樣…個人還是比較喜歡血黑那種歐洲城市的風格,而且 能用勾鎖飛的地方不多,路上小怪大多都能靠偷襲忍殺脫戰後再回去偷襲忍殺解決。。回生已經用到npc都得病了…四個血痰了…後面應該能醫治吧!?目前卡在騎馬哥和幻術 阿罵,遊戲時間約八小時…蟒蛇重藏和他的雜兵夥伴卡很久…昨晚又被阿罵虐殺,真的是 阿罵都比我強QQ。還有,雞超強…= =

對全部的文章進行斷句,並儲存結果

  • 將前 10 列斷句後的資料
# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
sekiro_sentences <- strsplit(sekiro$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
sekiro_sentences <- data.frame(
                        artUrl = rep(sekiro$artUrl, sapply(sekiro_sentences, length)), 
                        sentence = unlist(sekiro_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))

sekiro_sentences$sentence <- as.character(sekiro_sentences$sentence)

sekiro_sentences %>% head(10) %>% kable%>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "300px")
artUrl sentence
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html https://www.youtube.com/watch
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html v=JIHv92a4MWs 我覺得這個影片說明真的貼切
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html 沒有比天諸更忍者的遊戲了
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html 連遊戲本身也是影子XD
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html 話說我只有玩過2004年出的天誅 紅
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html 他算是添誅系列遊戲性前幾的吧
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html 只是故事沒有2代好
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html 期待週五的隻狼
https://www.ptt.cc/bbs/PlayStation/M.1553078503.A.7DD.html 影片連結: https://youtu.be/0n2kkcK9KMQ 這預告帥到翻天阿 小弟就手刀翻譯了一下 希望能有更多人能體會隻狼的魅力阿
https://www.ptt.cc/bbs/PlayStation/M.1553078503.A.7DD.html 如果喜歡影片的話再麻煩訂閱支持一下 以後會更新其他新遊戲的預告喔

接着做斷詞

  • 初始化斷詞器
  • 斷詞與整理斷詞結果
  • 將前出現頻率前 10 大字列出
# 使用默認參數初始化一個斷詞引擎
# 先不使用任何的字典和停用詞
jieba_tokenizer = worker()

chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
    tokens <- segment(x, jieba_tokenizer)
    # 去掉字串長度爲1的詞彙
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
    }
  })
}

# 進行斷詞,並計算各詞彙在各文章中出現的次數
sekiro_words <- sekiro_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
sekiro_words %>% head(10) 
## # A tibble: 10 x 3
##    artUrl                                                     word       n
##    <fct>                                                      <chr>  <int>
##  1 https://www.ptt.cc/bbs/PlayStation/M.1553688397.A.FB5.html 遊戲      27
##  2 https://www.ptt.cc/bbs/PlayStation/M.1553512835.A.A24.html 可以      14
##  3 https://www.ptt.cc/bbs/PlayStation/M.1553688397.A.FB5.html 玩家      14
##  4 https://www.ptt.cc/bbs/Gossiping/M.1553264860.A.480.html   遊戲      13
##  5 https://www.ptt.cc/bbs/PlayStation/M.1553402708.A.1F4.html 攻擊      13
##  6 https://www.ptt.cc/bbs/PlayStation/M.1553491058.A.334.html 雷雷雷    13
##  7 https://www.ptt.cc/bbs/PlayStation/M.1553402708.A.1F4.html 可以      11
##  8 https://www.ptt.cc/bbs/PlayStation/M.1553473876.A.B39.html 體幹值    11
##  9 https://www.ptt.cc/bbs/Gossiping/M.1553264860.A.480.html   敵人      10
## 10 https://www.ptt.cc/bbs/PlayStation/M.1553343706.A.169.html 識破      10

計算 tf-idf

  • 以每篇文章爲單位,計算每個詞彙在的tf-idf值
sekiro_words_tf_idf <- sekiro_words %>%
  bind_tf_idf(word, artUrl, n)

sekiro_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl))
## Selecting by tf_idf
## # A tibble: 1,075 x 6
## # Groups:   artUrl [76]
##    artUrl                                  word      n     tf   idf tf_idf
##    <fct>                                   <chr> <int>  <dbl> <dbl>  <dbl>
##  1 https://www.ptt.cc/bbs/PlayStation/M.1~ 直播      2 0.0513  2.94 0.151 
##  2 https://www.ptt.cc/bbs/PlayStation/M.1~ 不長      1 0.0256  4.33 0.111 
##  3 https://www.ptt.cc/bbs/PlayStation/M.1~ 主播      1 0.0256  3.64 0.0933
##  4 https://www.ptt.cc/bbs/PlayStation/M.1~ 左右      1 0.0256  3.64 0.0933
##  5 https://www.ptt.cc/bbs/PlayStation/M.1~ 平日      1 0.0256  4.33 0.111 
##  6 https://www.ptt.cc/bbs/PlayStation/M.1~ 玩個      1 0.0256  4.33 0.111 
##  7 https://www.ptt.cc/bbs/PlayStation/M.1~ 流暢      1 0.0256  4.33 0.111 
##  8 https://www.ptt.cc/bbs/PlayStation/M.1~ 聊天      1 0.0256  3.64 0.0933
##  9 https://www.ptt.cc/bbs/PlayStation/M.1~ 當碼      1 0.0256  4.33 0.111 
## 10 https://www.ptt.cc/bbs/PlayStation/M.1~ 興趣      1 0.0256  3.64 0.0933
## # ... with 1,065 more rows
  • 選每篇文章,tf-idf最大的十個詞,
  • 並查看每個詞被選中的次數
sekiro_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort=TRUE)
## Selecting by tf_idf
## # A tibble: 1,008 x 2
##    word     nn
##    <chr> <int>
##  1 實況      4
##  2 有點      3
##  3 設定      3
##  4 最近      3
##  5 暗影      3
##  6 經驗      3
##  7 葦名      3
##  8 影片      3
##  9 雙死      3
## 10 一心      2
## # ... with 998 more rows

因爲我們是以每篇文章爲一個document單位(總共有76個document)
因此我們就不畫課本第三章中,比較各document中tf-idf較高的詞彙比較圖

bigram function

  • 載入 stop word 和 negation word
  • 刪除 stop word 後列出前10 大頻率的 bigram
# remove stopwords
jieba_tokenizer = worker()

# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram<- ngrams(tokens, 2)
    bigram <- lapply(bigram, paste, collapse = " ")
    unlist(bigram)
  })
}

# load stop words
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "./dict/stop_words.txt", what = character(), sep =
## "\n", : 輸入連結 './dict/stop_words.txt' 中的輸入不正確
# load negation words
negation_words <- scan(file = "./dict/negation_words.txt", what=character(),sep=',', 
                   encoding='utf-8',fileEncoding='utf-8')

# remove negation words from stop words list
stop_words <- stop_words[!(stop_words %in% c(negation_words))] #如果negation出現在stop word時,不能被刪除

# 執行bigram分詞
sekiro_bigram <- sekiro %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)


# 清楚包含英文或數字的bigram組合
# 計算每個組合出現的次數
# remove the stop words in bigram
sekiro_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite_("bigram", c("word1","word2"), sep=" ") %>% head(10)
## # A tibble: 10 x 2
##    bigram            n
##    <chr>         <int>
##  1 隻 狼           141
##  2 不 知道          13
##  3 血源 詛咒        13
##  4 狼 暗影          13
##  5 暗影 雙死        13
##  6 不 死            12
##  7 宮崎 英高        12
##  8 雷雷雷 雷雷雷    11
##  9 玩 隻             9
## 10 都 會             9
  • 刪除 stop word 後列出前10 大頻率的 trigram
jieba_ngram_3 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram<- ngrams(unlist(tokens), 3)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}

# 執行ngram分詞
sekiro_ngram_3 <- sekiro %>%
  unnest_tokens(ngrams, sentence, token = jieba_ngram_3)
sekiro_ngram_3 %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  count(ngrams, sort = TRUE)
## # A tibble: 11,817 x 2
##    ngrams                   n
##    <chr>                <int>
##  1 隻 狼 的                22
##  2 狼 暗影 雙死            13
##  3 隻 狼 暗影              13
##  4 的 隻 狼                10
##  5 玩 隻 狼                 9
##  6 雷雷雷 雷雷雷 雷雷雷     9
##  7 了 隻 狼                 6
##  8 不 死 斬                 5
##  9 是 隻 狼                 5
## 10 中文 今天 要播           4
## # ... with 11,807 more rows
# remove the stop words in ngram(n=3)
sekiro_ngram_3 %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite_("ngrams", c("word1", "word2", "word3"), sep=" ")%>% head(10)
## # A tibble: 10 x 2
##    ngrams                   n
##    <chr>                <int>
##  1 狼 暗影 雙死            13
##  2 隻 狼 暗影              13
##  3 玩 隻 狼                 9
##  4 雷雷雷 雷雷雷 雷雷雷     9
##  5 不 死 斬                 5
##  6 中文 今天 要播           4
##  7 這次 隻 狼               4
##  8 遊戲 隻 狼               4
##  9 一直 吃 鱉               3
## 10 我方 暈眩 直             3

從上面的bigram和ngram(n=3)的結果中,我們可以整理出一個斷詞字典。
我們將詞彙整理好存在dict文件夾中的 sekiro_lexicon.txt 中

載入各種字典

  • 讀入整理好的字典和情緒字典
# load sekiro_lexicon
sekiro_lexicon <- fread(file = "./dict/sekiro_lexicon.txt", header= F,sep='\n', encoding = "UTF-8")
colnames(sekiro_lexicon)[1] <- "word"
sekiro_lexicon <- as.character(sekiro_lexicon$word)
# load game company positive words
game_co_pos_words <- fread(file = "./dict/game_company_positive.txt",sep=',', header= F, stringsAsFactors = F, encoding='UTF-8')
game_co_pos_words <- data.frame(word=t(game_co_pos_words))
rownames(game_co_pos_words)<-c(1:103)
game_co_pos_words <- as.character(game_co_pos_words$word)
# load  game company negation words
game_co_neg_words <- fread(file = "./dict/game_company_negative.txt", header= F, sep=',', stringsAsFactors = F, encoding='UTF-8')
game_co_neg_words <- data.frame(word=t(game_co_neg_words))
rownames(game_co_neg_words)<-c(1:73)
game_co_neg_words <- as.character(game_co_neg_words$word)
  • 列出隻狼整理好的字典
sekiro_lexicon
##  [1] "日本"     "武士"     "忍者"     "戰國時代" "戰國"     "第三人稱"
##  [7] "短網址"   "解謎遊戲" "動作遊戲" "日本遊戲" "義手"     "忍術"    
## [13] "忍具"     "潛行"     "暗殺"     "隻狼"     "卿子"     "九郎"    
## [19] "一心"     "英麻"     "葦名國"   "葦名"     "弦一郎"   "龍胤之子"
## [25] "龍胤"     "不死斬"   "不死"     "阿蝶"     "赤鬼"     "幻之阿蝶"
## [31] "幻蝶"     "貓頭鷹"   "宮崎英高" "菁英怪"   "蟒蛇"     "白蛇"    
## [37] "源之宮"   "源宮"     "仙峰寺"   "仙峰上人" "劍聖"     "架刀"    
## [43] "盾反"     "格擋"     "軀幹值"   "體幹值"   "義肢"
  • 使用新的字典和情緒字典斷詞
  • 選出 word2 為情緒字的資料
# 這裏不加入stop word字典
# 因為清掉的話會影響bigram出來的結果
jieba_tokenizer = worker()

# 使用隻狼字典重新斷詞
# 把否定詞也加入斷詞

new_user_word(jieba_tokenizer, c(sekiro_lexicon,negation_words,game_co_pos_words,game_co_neg_words));
## [1] TRUE
# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram<- ngrams(unlist(tokens), 2)
    bigram <- lapply(bigram, paste, collapse = " ")
    unlist(bigram)
  })
}


# 執行bigram分詞
sekiro_bigram <- sekiro %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)


# 將bigram拆成word1和word2
# 將包含英文字母或和數字的詞彙清除
bigrams_separated <- sekiro_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# 並選出word2爲情緒詞的bigram
sekiro_sentiment_bigrams <- bigrams_separated %>%
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  filter(word2 %in% c(game_co_pos_words, game_co_neg_words))

sekiro_sentiment_bigrams
## # A tibble: 210 x 11
##    artTitle artDate    artTime artUrl artPoster artCat commentNum  push
##    <chr>    <date>     <time>  <chr>  <chr>     <chr>       <int> <int>
##  1 [閒聊]隻狼的~ 2019-03-19 14:37   https~ tom80727  PlayS~        107    64
##  2 [情報]隻狼最~ 2019-03-20 02:35   https~ howerlai~ PlayS~         64    48
##  3 [情報]隻狼最~ 2019-03-20 02:35   https~ howerlai~ PlayS~         64    48
##  4 [實況]Mer~ 2019-03-21 07:36   https~ chi1206   PlayS~          0     0
##  5 [實況]Mer~ 2019-03-21 07:36   https~ chi1206   PlayS~          0     0
##  6 [實況]小美S~ 2019-03-21 07:50   https~ st12231   PlayS~          0     0
##  7 [實況]小美S~ 2019-03-21 07:50   https~ st12231   PlayS~          0     0
##  8 [實況]【塔利~ 2019-03-21 20:01   https~ sirakawa~ PlayS~          3     2
##  9 [問題]ps4~ 2019-03-22 04:15   https~ willie78~ PlayS~        145    33
## 10 [問題]ps4~ 2019-03-22 04:15   https~ willie78~ PlayS~        145    33
## # ... with 200 more rows, and 3 more variables: boo <int>, word1 <chr>,
## #   word2 <chr>

畫出原本情緒走勢圖

  • 將每一天的情緒走勢畫出
# 選出word2中,有出現在情緒詞典中的詞彙
# 如果是正面詞彙則賦予: 情緒標籤爲"positive"、情緒值爲  1
# 如果是負面詞彙則賦予: 情緒標籤爲"negative"、情緒值爲 -1
sekiro_sentiment_bigrams <- sekiro_sentiment_bigrams %>%
  select(artDate, word1, word2) %>%
  mutate(sentiment=ifelse(word2 %in% game_co_pos_words,1,-1), sentiment_tag=ifelse(word2 %in% game_co_pos_words, "positive", "negative"))


# 生成一個時間段中的 日期和情緒標籤的所有可能組合
all_dates <- 
  expand.grid(seq(as.Date(min(sekiro_sentiment_bigrams$artDate)), as.Date(max(sekiro_sentiment_bigrams$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

# 計算我們資料集中 每日的情緒值
sentiment_plot_data <- sekiro_sentiment_bigrams %>%
  group_by(artDate,sentiment_tag) %>%
  summarise(count=n())  

# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值,會出現NA
# 我們用0取代NA
sentiment_plot_data <- all_dates %>% 
  merge(sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))


# 畫圖
sentiment_plot_data %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) 

畫出使用 bigram 修正後的情緒走勢圖

  • 查看 前面出現否定詞 且 後面爲情緒詞彙 的組合
sekiro_sentiment_bigrams %>%
  filter(word1 %in% negation_words) %>%
  count(word1, word2, sort = TRUE) 
## # A tibble: 2 x 3
##   word1 word2     n
##   <chr> <chr> <int>
## 1 不    喜歡      2
## 2 沒    攻擊      2
  • bigram 修正後的情緒走勢圖
# 如果在情緒詞前出現的是否定詞的話,則將他的情緒對調
sekiro_sentiment_bigrams_negated <- sekiro_sentiment_bigrams %>%
  mutate(sentiment=ifelse(word1 %in% negation_words, -1*sentiment, sentiment)) %>%
  mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))

# 計算我們資料集中 每日的情緒值
negated_sentiment_plot_data <- sekiro_sentiment_bigrams_negated %>%
  group_by(artDate,sentiment_tag) %>%
  summarise(count=n())  

# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值,會出現NA
# 我們用0取代NA
negated_sentiment_plot_data <- all_dates %>% 
  merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

# 最後把圖畫出來
negated_sentiment_plot_data %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))

比較negation 後的情緒走勢

# 合併兩種情緒值的資料
all_sentiments <- bind_rows(
  sentiment_plot_data %>% mutate(sentiment=paste(sentiment, "_original", sep = "")),
  negated_sentiment_plot_data %>% mutate(sentiment=paste(sentiment, "_negated", sep = ""))) 


# 先比較正面情緒
all_sentiments %>% 
  filter(sentiment %in% c("positive_original", "positive_negated")) %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))  -> graph1

# 再比較負面情緒
all_sentiments %>% 
  filter(sentiment %in% c("negative_original", "negative_negated")) %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) -> graph2

gridExtra::grid.arrange(graph1, graph2, ncol = 1)