Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼

## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求

## [1] ""

安裝需要的packages

## Loading required package: readr

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Loading required package: stringr

## Loading required package: jiebaR

## Loading required package: jiebaRD

## Loading required package: tidytext

## Loading required package: NLP

## Loading required package: tidyr

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

## Loading required package: ggraph

## Loading required package: igraph

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

## Loading required package: scales

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

## Loading required package: reshape2

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

## Loading required package: widyr

## Loading required package: knitr

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

載入資料集

列出前資料前 10 列

#載入原始資料集
sekiro<- read_csv("./sekiro_test_articleMetaData.csv") %>%
              mutate(sentence=gsub("[\n]{2,}", "。", sentence))

## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_integer(),
##   push = col_integer(),
##   boo = col_integer(),
##   sentence = col_character()
## )

sekiro %>% head(10) %>% kable %>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "300px")

artTitle	artDate	artTime	artUrl	artPoster	artCat	commentNum	push	boo	sentence
[閒聊]隻狼的前世天誅	2019-03-19	14:37:41	https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	tom80727	PlayStation	107	64	0	https://www.youtube.com/watch?v=JIHv92a4MWs 我覺得這個影片說明真的貼切。沒有比天諸更忍者的遊戲了。連遊戲本身也是影子XD。話說我只有玩過2004年出的天誅紅。他算是添誅系列遊戲性前幾的吧。只是故事沒有2代好。期待週五的隻狼
[情報]隻狼最新預告(中文翻譯)帥翻了阿!!!	2019-03-20	02:35:41	https://www.ptt.cc/bbs/PlayStation/M.1553078503.A.7DD.html	howerlai1990	PlayStation	64	48	0	影片連結： https://youtu.be/0n2kkcK9KMQ 這預告帥到翻天阿小弟就手刀翻譯了一下希望能有更多人能體會隻狼的魅力阿!!。如果喜歡影片的話再麻煩訂閱支持一下以後會更新其他新遊戲的預告喔
[實況]隻狼日版	2019-03-21	06:49:08	https://www.ptt.cc/bbs/PlayStation/M.1553180111.A.3C6.html	lovelylion2	PlayStation	1	1	0	實況遊戲: [PS4] 隻狼 Sekiro 日版。實況網址: https://www.twitch.tv/lovelylion 03/21 22:55
[實況]Mera慢玩隻狼:暗影雙死00點開玩~	2019-03-21	07:36:54	https://www.ptt.cc/bbs/PlayStation/M.1553182978.A.1EF.html	chi1206	PlayStation	0	0	0	實況主: Mera。實況網址: https://www.twitch.tv/mera_1206/ 哈囉，大家晚上好今天要來玩期待已久的 - 隻狼:暗影雙死!!。台主自己類魂遊戲玩過仁王、黑魂1&3、血源。之前在玩的時候遊戲中遇上問題都能直接向聊天室發問 (觀眾也很喜歡擺老就是了lol) 而這次的隻狼可以在跟大家同個起跑點上一起探索這遊戲了
[實況]小美SEKIRO:SHADOWDIETWICE隻狼	2019-03-21	07:50:32	https://www.ptt.cc/bbs/PlayStation/M.1553183795.A.3B7.html	st12231	PlayStation	0	0	0	實況主播：小美。國家語系：中文。今天要播什麼：SEKIRO: SHADOW DIE TWICE 隻狼。點我連結：。Twitch: https://www.twitch.tv/st12231 FB: https://reurl.cc/bV0Dy YT: https://youtu.be/o0Q7TAG5RDs 繼血源詛咒後最期待的類暗魂遊戲，搶先玩1小時明晚繼續。家有女兒，隨時可能會哭要去報警她，請見諒。可以的話，順手按個追蹤幫解FB百人任務。讓我可以1080p最高畫質輸出拜偷拜偷。直播純分享與紀錄。歡迎一起聊天喔_{_{_~}}
[問題]請問隻狼主題要去哪裡下載	2019-03-21	14:09:48	https://www.ptt.cc/bbs/PlayStation/M.1553206550.A.99F.html	AkaiNamida	PlayStation	8	3	0	剛剛要裝隻狼主題的時候他說資料毀損。叫我刪掉重載。可是我刪掉之後就找不到要在哪裡下載回來了。求解
[實況]【塔利安】SEKIRO隻狼	2019-03-21	20:01:24	https://www.ptt.cc/bbs/PlayStation/M.1553227647.A.D7F.html	sirakawabird	PlayStation	3	2	0	直播主帳號：sirakawabird。國家語系：中文。今天要播什麼：SEKIRO：SHADOW DIE TWICE 隻狼第一輪。點我連結： https://www.youtube.com/user/sirakawabird/live https://www.twitch.tv/sirakawabird 想跟觀眾說：。目前進度要去拿XX斬。可以居合的忍者超棒的哦哦哦哦。葦名奧義書真的要早點拿到正面交鋒需要的技能都在葦名奧義書裡
[問題]ps4pro隻狼流暢度？	2019-03-22	04:15:39	https://www.ptt.cc/bbs/PlayStation/M.1553257302.A.9C6.html	willie7878	PlayStation	145	33	19	如題小弟我已經買了steam的隻狼進入遊戲後就發現滿多問題的：。1.是手把沒有直接支援ds4，就算ps4手把能玩還是卡卡的（因為是xbox手把的圖示…… .）但是這種動作遊戲又不想用鍵鼠玩，很沒感覺，而且用滑鼠轉視角頭很暈（其他遊戲完全沒暈過）。2.我是R5+1070，應該算配備不錯的桌機了但遊玩下來的幀數其實並沒有說到非常高，大概也就是個穩60的狀態。現在有點猶豫如果幀數沒差很多的話想退steam買ps4 跪求分享意見
[問題]隻狼是不是不適合路癡玩？	2019-03-22	10:56:23	https://www.ptt.cc/bbs/PlayStation/M.1553281345.A.4C1.html	david0426	PlayStation	50	33	0	如題。看評分真的有夠高。表哥剛好也有買就借來玩。但是一直死掉重打蠻花時間的(好像是遊戲特色？。但我現在卡在回憶裡面的城池走不出來。因為沒地圖也不知道要去哪…。小怪都殺完了。也沒有任務提示什麼的。我想問大家如果今天打到一個定點。明天繼續打的話會知道往哪裡走嗎？。我已經在城池裡迷路了…。現在打到一心大人出去散步。跑回去問那個女的一心在哪。可是我不知道剛開始怎麼進來的啊QQ。在想是不是DMC5比較適合我…。好歹還有惡魔直覺可以用。隻狼是不是路癡不適合玩啊……。我反而覺得戰鬥不難知道往哪走比較難
[心得]手殘的隻狼心得	2019-03-22	13:27:17	https://www.ptt.cc/bbs/PlayStation/M.1553290400.A.F94.html	nobody1149	PlayStation	58	28	0	先說忍殺的特寫鏡頭真的是又帥又中二！！！。玩過血源黑魂3仁王（前兩個有拼到白金）我個人用自己血黑的套路打隻狼打得很痛苦…血源就槍返苦手黑魂舉盾癌，打王全靠閃有空擋在補幾刀磨到王死…而隻狼完全不行啊！隻狼我覺得主要特色是架招，不斷的和王攻防才能有效的積累體幹值打出忍殺，而且還不能瞎忙的防禦，下段要跳突刺看破普攻防禦還要判斷是不是連攻，還不能L1連打要抓攻擊節奏一個一個按…真心難啊啊啊………還一直按錯鍵。。地圖是很漂亮，但和風的城市就那樣…個人還是比較喜歡血黑那種歐洲城市的風格，而且能用勾鎖飛的地方不多，路上小怪大多都能靠偷襲忍殺脫戰後再回去偷襲忍殺解決。。回生已經用到npc都得病了…四個血痰了…後面應該能醫治吧！？目前卡在騎馬哥和幻術阿罵，遊戲時間約八小時…蟒蛇重藏和他的雜兵夥伴卡很久…昨晚又被阿罵虐殺，真的是阿罵都比我強QQ。還有，雞超強…= =

對全部的文章進行斷句，並儲存結果

將前 10 列斷句後的資料

# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
sekiro_sentences <- strsplit(sekiro$sentence,"[。！；？!?;]")
# 將每句句子，與他所屬的文章連結配對起來，整理成一個dataframe
sekiro_sentences <- data.frame(
                        artUrl = rep(sekiro$artUrl, sapply(sekiro_sentences, length)), 
                        sentence = unlist(sekiro_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))

sekiro_sentences$sentence <- as.character(sekiro_sentences$sentence)

sekiro_sentences %>% head(10) %>% kable%>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "300px")

artUrl	sentence
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	https://www.youtube.com/watch
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	v=JIHv92a4MWs 我覺得這個影片說明真的貼切
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	沒有比天諸更忍者的遊戲了
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	連遊戲本身也是影子XD
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	話說我只有玩過2004年出的天誅紅
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	他算是添誅系列遊戲性前幾的吧
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	只是故事沒有2代好
https://www.ptt.cc/bbs/PlayStation/M.1553035424.A.0B9.html	期待週五的隻狼
https://www.ptt.cc/bbs/PlayStation/M.1553078503.A.7DD.html	影片連結： https://youtu.be/0n2kkcK9KMQ 這預告帥到翻天阿小弟就手刀翻譯了一下希望能有更多人能體會隻狼的魅力阿
https://www.ptt.cc/bbs/PlayStation/M.1553078503.A.7DD.html	如果喜歡影片的話再麻煩訂閱支持一下以後會更新其他新遊戲的預告喔

接着做斷詞

初始化斷詞器
斷詞與整理斷詞結果
將前出現頻率前 10 大字列出

# 使用默認參數初始化一個斷詞引擎
# 先不使用任何的字典和停用詞
jieba_tokenizer = worker()

chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
    tokens <- segment(x, jieba_tokenizer)
    # 去掉字串長度爲1的詞彙
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
    }
  })
}

# 進行斷詞，並計算各詞彙在各文章中出現的次數
sekiro_words <- sekiro_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
sekiro_words %>% head(10)

## # A tibble: 10 x 3
##    artUrl                                                     word       n
##    <fct>                                                      <chr>  <int>
##  1 https://www.ptt.cc/bbs/PlayStation/M.1553688397.A.FB5.html 遊戲      27
##  2 https://www.ptt.cc/bbs/PlayStation/M.1553512835.A.A24.html 可以      14
##  3 https://www.ptt.cc/bbs/PlayStation/M.1553688397.A.FB5.html 玩家      14
##  4 https://www.ptt.cc/bbs/Gossiping/M.1553264860.A.480.html   遊戲      13
##  5 https://www.ptt.cc/bbs/PlayStation/M.1553402708.A.1F4.html 攻擊      13
##  6 https://www.ptt.cc/bbs/PlayStation/M.1553491058.A.334.html 雷雷雷    13
##  7 https://www.ptt.cc/bbs/PlayStation/M.1553402708.A.1F4.html 可以      11
##  8 https://www.ptt.cc/bbs/PlayStation/M.1553473876.A.B39.html 體幹值    11
##  9 https://www.ptt.cc/bbs/Gossiping/M.1553264860.A.480.html   敵人      10
## 10 https://www.ptt.cc/bbs/PlayStation/M.1553343706.A.169.html 識破      10

計算 tf-idf

以每篇文章爲單位，計算每個詞彙在的tf-idf值

sekiro_words_tf_idf <- sekiro_words %>%
  bind_tf_idf(word, artUrl, n)

sekiro_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl))

## Selecting by tf_idf

## # A tibble: 1,075 x 6
## # Groups:   artUrl [76]
##    artUrl                                  word      n     tf   idf tf_idf
##    <fct>                                   <chr> <int>  <dbl> <dbl>  <dbl>
##  1 https://www.ptt.cc/bbs/PlayStation/M.1~ 直播      2 0.0513  2.94 0.151 
##  2 https://www.ptt.cc/bbs/PlayStation/M.1~ 不長      1 0.0256  4.33 0.111 
##  3 https://www.ptt.cc/bbs/PlayStation/M.1~ 主播      1 0.0256  3.64 0.0933
##  4 https://www.ptt.cc/bbs/PlayStation/M.1~ 左右      1 0.0256  3.64 0.0933
##  5 https://www.ptt.cc/bbs/PlayStation/M.1~ 平日      1 0.0256  4.33 0.111 
##  6 https://www.ptt.cc/bbs/PlayStation/M.1~ 玩個      1 0.0256  4.33 0.111 
##  7 https://www.ptt.cc/bbs/PlayStation/M.1~ 流暢      1 0.0256  4.33 0.111 
##  8 https://www.ptt.cc/bbs/PlayStation/M.1~ 聊天      1 0.0256  3.64 0.0933
##  9 https://www.ptt.cc/bbs/PlayStation/M.1~ 當碼      1 0.0256  4.33 0.111 
## 10 https://www.ptt.cc/bbs/PlayStation/M.1~ 興趣      1 0.0256  3.64 0.0933
## # ... with 1,065 more rows

選每篇文章，tf-idf最大的十個詞，
並查看每個詞被選中的次數

sekiro_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort=TRUE)

## Selecting by tf_idf

## # A tibble: 1,008 x 2
##    word     nn
##    <chr> <int>
##  1 實況      4
##  2 有點      3
##  3 設定      3
##  4 最近      3
##  5 暗影      3
##  6 經驗      3
##  7 葦名      3
##  8 影片      3
##  9 雙死      3
## 10 一心      2
## # ... with 998 more rows

因爲我們是以每篇文章爲一個document單位（總共有76個document）
因此我們就不畫課本第三章中，比較各document中tf-idf較高的詞彙比較圖

bigram function

載入 stop word 和 negation word
刪除 stop word 後列出前10 大頻率的 bigram

# remove stopwords
jieba_tokenizer = worker()

# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram<- ngrams(tokens, 2)
    bigram <- lapply(bigram, paste, collapse = " ")
    unlist(bigram)
  })
}

# load stop words
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')

## Warning in scan(file = "./dict/stop_words.txt", what = character(), sep =
## "\n", : 輸入連結 './dict/stop_words.txt' 中的輸入不正確

# load negation words
negation_words <- scan(file = "./dict/negation_words.txt", what=character(),sep=',', 
                   encoding='utf-8',fileEncoding='utf-8')

# remove negation words from stop words list
stop_words <- stop_words[!(stop_words %in% c(negation_words))] #如果negation出現在stop word時，不能被刪除

# 執行bigram分詞
sekiro_bigram <- sekiro %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)


# 清楚包含英文或數字的bigram組合
# 計算每個組合出現的次數
# remove the stop words in bigram
sekiro_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite_("bigram", c("word1","word2"), sep=" ") %>% head(10)

## # A tibble: 10 x 2
##    bigram            n
##    <chr>         <int>
##  1 隻 狼           141
##  2 不 知道          13
##  3 血源 詛咒        13
##  4 狼 暗影          13
##  5 暗影 雙死        13
##  6 不 死            12
##  7 宮崎 英高        12
##  8 雷雷雷 雷雷雷    11
##  9 玩 隻             9
## 10 都 會             9

刪除 stop word 後列出前10 大頻率的 trigram

jieba_ngram_3 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram<- ngrams(unlist(tokens), 3)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}

# 執行ngram分詞
sekiro_ngram_3 <- sekiro %>%
  unnest_tokens(ngrams, sentence, token = jieba_ngram_3)
sekiro_ngram_3 %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  count(ngrams, sort = TRUE)

## # A tibble: 11,817 x 2
##    ngrams                   n
##    <chr>                <int>
##  1 隻 狼 的                22
##  2 狼 暗影 雙死            13
##  3 隻 狼 暗影              13
##  4 的 隻 狼                10
##  5 玩 隻 狼                 9
##  6 雷雷雷 雷雷雷 雷雷雷     9
##  7 了 隻 狼                 6
##  8 不 死 斬                 5
##  9 是 隻 狼                 5
## 10 中文 今天 要播           4
## # ... with 11,807 more rows

# remove the stop words in ngram(n=3)
sekiro_ngram_3 %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite_("ngrams", c("word1", "word2", "word3"), sep=" ")%>% head(10)

## # A tibble: 10 x 2
##    ngrams                   n
##    <chr>                <int>
##  1 狼 暗影 雙死            13
##  2 隻 狼 暗影              13
##  3 玩 隻 狼                 9
##  4 雷雷雷 雷雷雷 雷雷雷     9
##  5 不 死 斬                 5
##  6 中文 今天 要播           4
##  7 這次 隻 狼               4
##  8 遊戲 隻 狼               4
##  9 一直 吃 鱉               3
## 10 我方 暈眩 直             3

從上面的bigram和ngram(n=3)的結果中，我們可以整理出一個斷詞字典。
我們將詞彙整理好存在dict文件夾中的 sekiro_lexicon.txt 中

載入各種字典

讀入整理好的字典和情緒字典

# load sekiro_lexicon
sekiro_lexicon <- fread(file = "./dict/sekiro_lexicon.txt", header= F,sep='\n', encoding = "UTF-8")
colnames(sekiro_lexicon)[1] <- "word"
sekiro_lexicon <- as.character(sekiro_lexicon$word)
# load game company positive words
game_co_pos_words <- fread(file = "./dict/game_company_positive.txt",sep=',', header= F, stringsAsFactors = F, encoding='UTF-8')
game_co_pos_words <- data.frame(word=t(game_co_pos_words))
rownames(game_co_pos_words)<-c(1:103)
game_co_pos_words <- as.character(game_co_pos_words$word)
# load  game company negation words
game_co_neg_words <- fread(file = "./dict/game_company_negative.txt", header= F, sep=',', stringsAsFactors = F, encoding='UTF-8')
game_co_neg_words <- data.frame(word=t(game_co_neg_words))
rownames(game_co_neg_words)<-c(1:73)
game_co_neg_words <- as.character(game_co_neg_words$word)

列出隻狼整理好的字典

sekiro_lexicon

##  [1] "日本"     "武士"     "忍者"     "戰國時代" "戰國"     "第三人稱"
##  [7] "短網址"   "解謎遊戲" "動作遊戲" "日本遊戲" "義手"     "忍術"    
## [13] "忍具"     "潛行"     "暗殺"     "隻狼"     "卿子"     "九郎"    
## [19] "一心"     "英麻"     "葦名國"   "葦名"     "弦一郎"   "龍胤之子"
## [25] "龍胤"     "不死斬"   "不死"     "阿蝶"     "赤鬼"     "幻之阿蝶"
## [31] "幻蝶"     "貓頭鷹"   "宮崎英高" "菁英怪"   "蟒蛇"     "白蛇"    
## [37] "源之宮"   "源宮"     "仙峰寺"   "仙峰上人" "劍聖"     "架刀"    
## [43] "盾反"     "格擋"     "軀幹值"   "體幹值"   "義肢"

使用新的字典和情緒字典斷詞
選出 word2 為情緒字的資料

# 這裏不加入stop word字典
# 因為清掉的話會影響bigram出來的結果
jieba_tokenizer = worker()

# 使用隻狼字典重新斷詞
# 把否定詞也加入斷詞

new_user_word(jieba_tokenizer, c(sekiro_lexicon,negation_words,game_co_pos_words,game_co_neg_words));

## [1] TRUE

# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram<- ngrams(unlist(tokens), 2)
    bigram <- lapply(bigram, paste, collapse = " ")
    unlist(bigram)
  })
}


# 執行bigram分詞
sekiro_bigram <- sekiro %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)


# 將bigram拆成word1和word2
# 將包含英文字母或和數字的詞彙清除
bigrams_separated <- sekiro_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# 並選出word2爲情緒詞的bigram
sekiro_sentiment_bigrams <- bigrams_separated %>%
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  filter(word2 %in% c(game_co_pos_words, game_co_neg_words))

sekiro_sentiment_bigrams

## # A tibble: 210 x 11
##    artTitle artDate    artTime artUrl artPoster artCat commentNum  push
##    <chr>    <date>     <time>  <chr>  <chr>     <chr>       <int> <int>
##  1 [閒聊]隻狼的~ 2019-03-19 14:37   https~ tom80727  PlayS~        107    64
##  2 [情報]隻狼最~ 2019-03-20 02:35   https~ howerlai~ PlayS~         64    48
##  3 [情報]隻狼最~ 2019-03-20 02:35   https~ howerlai~ PlayS~         64    48
##  4 [實況]Mer~ 2019-03-21 07:36   https~ chi1206   PlayS~          0     0
##  5 [實況]Mer~ 2019-03-21 07:36   https~ chi1206   PlayS~          0     0
##  6 [實況]小美S~ 2019-03-21 07:50   https~ st12231   PlayS~          0     0
##  7 [實況]小美S~ 2019-03-21 07:50   https~ st12231   PlayS~          0     0
##  8 [實況]【塔利~ 2019-03-21 20:01   https~ sirakawa~ PlayS~          3     2
##  9 [問題]ps4~ 2019-03-22 04:15   https~ willie78~ PlayS~        145    33
## 10 [問題]ps4~ 2019-03-22 04:15   https~ willie78~ PlayS~        145    33
## # ... with 200 more rows, and 3 more variables: boo <int>, word1 <chr>,
## #   word2 <chr>

畫出原本情緒走勢圖

將每一天的情緒走勢畫出

# 選出word2中，有出現在情緒詞典中的詞彙
# 如果是正面詞彙則賦予： 情緒標籤爲"positive"、情緒值爲  1
# 如果是負面詞彙則賦予： 情緒標籤爲"negative"、情緒值爲 -1
sekiro_sentiment_bigrams <- sekiro_sentiment_bigrams %>%
  select(artDate, word1, word2) %>%
  mutate(sentiment=ifelse(word2 %in% game_co_pos_words,1,-1), sentiment_tag=ifelse(word2 %in% game_co_pos_words, "positive", "negative"))


# 生成一個時間段中的 日期和情緒標籤的所有可能組合
all_dates <- 
  expand.grid(seq(as.Date(min(sekiro_sentiment_bigrams$artDate)), as.Date(max(sekiro_sentiment_bigrams$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

# 計算我們資料集中 每日的情緒值
sentiment_plot_data <- sekiro_sentiment_bigrams %>%
  group_by(artDate,sentiment_tag) %>%
  summarise(count=n())  

# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值，會出現NA
# 我們用0取代NA
sentiment_plot_data <- all_dates %>% 
  merge(sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))


# 畫圖
sentiment_plot_data %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))

畫出使用 bigram 修正後的情緒走勢圖

查看前面出現否定詞且後面爲情緒詞彙的組合

sekiro_sentiment_bigrams %>%
  filter(word1 %in% negation_words) %>%
  count(word1, word2, sort = TRUE)

## # A tibble: 2 x 3
##   word1 word2     n
##   <chr> <chr> <int>
## 1 不    喜歡      2
## 2 沒    攻擊      2

bigram 修正後的情緒走勢圖

# 如果在情緒詞前出現的是否定詞的話，則將他的情緒對調
sekiro_sentiment_bigrams_negated <- sekiro_sentiment_bigrams %>%
  mutate(sentiment=ifelse(word1 %in% negation_words, -1*sentiment, sentiment)) %>%
  mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))

# 計算我們資料集中 每日的情緒值
negated_sentiment_plot_data <- sekiro_sentiment_bigrams_negated %>%
  group_by(artDate,sentiment_tag) %>%
  summarise(count=n())  

# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值，會出現NA
# 我們用0取代NA
negated_sentiment_plot_data <- all_dates %>% 
  merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

# 最後把圖畫出來
negated_sentiment_plot_data %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))

比較negation 後的情緒走勢

# 合併兩種情緒值的資料
all_sentiments <- bind_rows(
  sentiment_plot_data %>% mutate(sentiment=paste(sentiment, "_original", sep = "")),
  negated_sentiment_plot_data %>% mutate(sentiment=paste(sentiment, "_negated", sep = ""))) 


# 先比較正面情緒
all_sentiments %>% 
  filter(sentiment %in% c("positive_original", "positive_negated")) %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))  -> graph1

# 再比較負面情緒
all_sentiments %>% 
  filter(sentiment %in% c("negative_original", "negative_negated")) %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) -> graph2

gridExtra::grid.arrange(graph1, graph2, ncol = 1)

第九組：TF-IDF 與 Bigram 情緒分析（隻狼：暗影雙死）

第九組

2019/04/09

安裝需要的packages

載入資料集

對全部的文章進行斷句，並儲存結果

接着做斷詞

計算 tf-idf

bigram function

載入各種字典

畫出原本情緒走勢圖

畫出使用 bigram 修正後的情緒走勢圖

比較negation 後的情緒走勢