System setting

避免中文亂碼

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"

Load packages

library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(data.table)
library(reshape2)
library(scales)

Read Data

這次用的資料集是冰與火之歌在PTT上的討論資料,爬標題的關鍵字有:
GoT、Game of Thrones、冰與火之歌、權力遊戲 來分析看看討論板上對這部影集的觀感以及社群網絡關係。

  1. posts: 用文本分析平台抓下來的PTT 文章資料
  2. reviews: 用文本分析平台抓下來的PTT 推噓文資料
posts <- read_csv("data/got_posts.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )
reviews <- read_csv("data/got_reviews.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentPoster = col_character(),
##   commentStatus = col_character(),
##   commentDate = col_datetime(format = ""),
##   commentContent = col_character()
## )

reviews只挑選:

  • 文章連結
  • 回應者
  • 回應狀態: 推、噓
  • 回應內容
reviews <- select(reviews, artUrl, commentPoster, commentStatus, commentContent)

Exploratory Data Analysis

length(unique(posts$artPoster))
## [1] 122

資料中有185篇貼文,其中有122個發文者

length(unique(reviews$commentPoster))
## [1] 2224

資料中有9998則回覆,其中有2224位使用者

allUsers <- c(posts$artPoster, reviews$commentPoster) 
allUsers %>%
  unique() %>% 
  length()
## [1] 2266

總共參與者為2266位

userList <- data.frame(user = unique(allUsers)) %>%
  mutate(type = ifelse(user %in% posts$artPoster, "poster", "replyer"))
head(userList)
##          user   type
## 1  drgraffiti poster
## 2   H23324216 poster
## 3      solomn poster
## 4 tontontonni poster
## 5    look1225 poster
## 6  ducklingwu poster

先簡單將每個user分成發文者與回覆者,有發文者歸類為poster,沒有發過文則是replyer

主題分析

讀取資料

got_meta_post <- posts %>%
  mutate(sentence = gsub("[\n]{2, }", "。", sentence))
head(got_meta_post)
## # A tibble: 6 x 10
##   artTitle artDate    artTime artUrl artPoster artCat commentNum  push
##   <chr>    <date>     <time>  <chr>  <chr>     <chr>       <dbl> <dbl>
## 1 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
## 2 Re:[心得]… 2019-04-15 09:50   https… H23324216 EAser…         68    32
## 3 [請益]GOT… 2019-04-17 03:12   https… solomn    EAser…         23    14
## 4 [討論]為何G… 2019-04-18 03:54   https… tontonto… EAser…        107    43
## 5 Re:[討論]… 2019-04-18 07:24   https… look1225  EAser…         52    15
## 6 [閒聊]Got… 2019-04-19 04:59   https… duckling… EAser…         28    16
## # … with 2 more variables: boo <dbl>, sentence <chr>

將讀取的GoT文章資料,依\n切成句子。

斷句處理

got_sentences <- strsplit(got_meta_post$sentence, "[。!;?!?;]")
got_sentences <- data.frame(
  artUrl = rep(got_meta_post$artUrl, sapply(got_sentences, length)),
  sentence = unlist(got_sentences)
) %>% 
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
got_sentences$sentence <- as.character(got_sentences$sentence)

載入字典

got_lexicon <- scan(file = "dict/got_lexicon.txt", what = character(), sep='\n', 
                    encoding = 'utf-8', fileEncoding = 'utf-8')

字典自行新增的冰與火之歌一些專有名詞,例如常看到的人名、暱稱等:
阿雅無垢者龍母等等的約24個

jieba斷詞

# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()
# 使用冰與火之歌字典重新斷詞
new_user_word(jieba_tokenizer, c(got_lexicon))
## [1] TRUE
# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x) > 1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens) > 1 | tokens == "囧" | tokens == "冏"]
      return(tokens)
    }
  })
}

tokens <- got_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
head(tokens)
## # A tibble: 6 x 3
##   artUrl                                                  word  count
##   <fct>                                                   <chr> <int>
## 1 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 幫忙      2
## 2 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 北境      2
## 3 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 被叫      1
## 4 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 並且      1
## 5 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 不給      1
## 6 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 不過      1

清理斷詞結果

reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 3) %>% 
  unlist()

got_removed <- tokens %>% 
  filter( word %in% reserved_word)

got_dtm <- got_removed %>% cast_dtm(artUrl, word, count)
got_dtm
## <<DocumentTermMatrix (documents: 185, terms: 1192)>>
## Non-/sparse entries: 12256/208264
## Sparsity           : 94%
## Maximal term length: 5
## Weighting          : term frequency (tf)

LDA

整理出中文的stop words

stop_word <- c("可以","他們","沒有","真的","就是","覺得","我們","知道","因為","非常","還是","然後","所以","一個","非常","這樣","應該","如果","只是","自己","這個","可能","認為","不過","什麼","不是","完全","結果","怎麼","不會","已經","還有","現在","大家","其實","但是","想要","雖然")
got_lda <- LDA(got_dtm, k = 2, control = list(seed = 1234))

看各群的常用詞彙

tidy(got_lda, matrix = "beta") %>%
  filter(!term %in% c("冰與火之歌", "權力", "遊戲"), !term %in% stop_word) %>% 
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = "黑體-繁 中黑"))


透過分成兩類主題,我們可以發現一類的討論內容比較偏向討論劇情本身,另一類則是關於影集拍攝、觀眾等戲外的討論。

使用LDA預測每篇文章的主題

got_topics <- tidy(got_lda, matrix = "gamma") %>% 
  # 在tidy function中使用參數"gamma"來取得 theta矩陣。
                group_by(document) %>%
                top_n(1, wt = gamma)
head(got_topics)
## # A tibble: 6 x 3
## # Groups:   document [6]
##   document                                                topic gamma
##   <chr>                                                   <int> <dbl>
## 1 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html     1 0.799
## 2 https://www.ptt.cc/bbs/EAseries/M.1555350999.A.61F.html     1 0.950
## 3 https://www.ptt.cc/bbs/EAseries/M.1555588854.A.755.html     1 0.545
## 4 https://www.ptt.cc/bbs/EAseries/M.1555601456.A.E16.html     1 0.515
## 5 https://www.ptt.cc/bbs/EAseries/M.1555679137.A.1AB.html     1 0.730
## 6 https://www.ptt.cc/bbs/EAseries/M.1555814295.A.4CB.html     1 0.597

可以看出我們用LDA幫文章分出了兩類,以及各文章的gamma,接下來我們可以依照分出的主題來畫出社群網絡圖。

社群網路圖

posts_reviews <- merge(posts, reviews, by = "artUrl")
link <- posts_reviews %>%
  select(commentPoster, artPoster, artUrl)
reviewNetwork <- graph_from_data_frame(d = link, directed = T)
plot(reviewNetwork)

先簡單將回文者與發文者的關係直接畫出圖,看不出什麼東西。

plot(reviewNetwork, vertex.size = 2, edge.arrow.size = .2, vertex.label = NA)

因為資料量太多,我們篩選其中一天的資料。選擇冰與火之歌播出之日 5/13。

link <- posts_reviews %>% 
  filter(artDate == "2019-05-13") %>% 
  select(commentPoster, artPoster, artUrl)

篩選link中有出現的使用者

filtered_user <- userList %>%
          filter(user%in%link$commentPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(filtered_user)
##           user    type
## 1    sundazlas replyer
## 2 Ipadhotwater replyer
## 3      neverli replyer
## 4      yujimin replyer
## 5   AppleAlice replyer
## 6        real4 replyer
reviewNetwork <- graph_from_data_frame(d = link, v = filtered_user, directed=F)
plot(reviewNetwork, vertex.size = 2, edge.arrow.size = .2, vertex.label = NA)

labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)

V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type == "poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2, 
     vertex.label = ifelse(degree(reviewNetwork) > 10, V(reviewNetwork)$label, NA), 
     vertex.label.ces = .5)

V(reviewNetwork)[degree(reviewNetwork) > 10]
## + 28/362 vertices, named, from 7a9c585:
##  [1] orzisme      pattda       jcshie       sasiru0959   MadAngel    
##  [6] celeris      mainline     tontontonni  hahaha0204   ivorysoap   
## [11] risingtide   bloodrance   attilalin    smallsun10   HuangJ      
## [16] s58565254    nomorefoggy  dejamisvu    steven655267 kwinner     
## [21] mitdoh       alepp123     minipig1127  sammon       tinywill    
## [26] a382773      XristianBale czchen
posts_reviews_topic <- merge(x = posts_reviews, y = got_topics, 
                             by.x = "artUrl", by.y = "document")
link <- posts_reviews_topic %>%
      filter(artDate=='2019-05-13') %>%
      select(commentPoster, artPoster, artUrl, commentStatus, topic)
filtered_user <- userList %>%
          filter(user%in%link$commentPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(filtered_user)
##           user    type
## 1    sundazlas replyer
## 2 Ipadhotwater replyer
## 3      neverli replyer
## 4      yujimin replyer
## 5   AppleAlice replyer
## 6        real4 replyer

使用主題1、2做連線的顏色

# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")

# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred")

## 畫出社群網路圖
set.seed(5431)
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,
     vertex.label=ifelse(degree(reviewNetwork) > 10, V(reviewNetwork)$label, NA),  vertex.label.ces=.5)

## 加入標示
legend("bottomright", c("poster","replyer"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=.8)
legend("topleft", c("topic 1","topic 2"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=.8)

使用推噓文做連線的顏色

# ptt的回覆有三種,推文、噓文、箭頭
# 我們只要看推噓就好,因此把箭頭清掉
link <- posts_reviews_topic %>%
      filter(artDate == '2019-05-13', commentStatus != "→") %>%
      select(commentPoster, artPoster, artUrl, commentStatus, topic)

# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述

## 篩選link中有出現的使用者
filtered_user <- userList %>%
          filter(user%in%link$commentPoster | user%in%link$artPoster) %>%
          arrange(desc(type))

## 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

## 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")


## 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$commentStatus == "推", "lightgreen", "palevioletred")

## 畫出社群網路圖
set.seed(488)
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,
     vertex.label=ifelse(degree(reviewNetwork) > 10, V(reviewNetwork)$label, NA),  vertex.label.ces=.5, family = "黑體-繁 中黑")

## 加入標示
legend("bottomright", c("poster","replyer"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=.8)
legend("topleft", c("like","unlike"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=.8)

#載入情緒辭典
positive <- read_file("dict/positive.txt") %>% 
  strsplit("[,]") %>%
  unlist() %>% 
  data.frame(word = ., sentiment = "positive")

negative <- read_file("dict/negative.txt") %>% 
  strsplit("[,]") %>% 
  unlist() %>% 
  data.frame(word = ., sentiment = "negative")

LIWC_ch <- rbind(positive, negative)

先讀GoT留言資料,並依\n切成句子。

got_meta_review <- read_csv("data/got_reviews.csv") %>%
  mutate(commentContent = gsub("[\n]{2, }", "。", commentContent))
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentPoster = col_character(),
##   commentStatus = col_character(),
##   commentDate = col_datetime(format = ""),
##   commentContent = col_character()
## )
# 斷句處理
got_sentences2 <- strsplit(got_meta_review$commentContent, "[。!;?!?;]")
#got_sentences2
got_sentences2 <- data.frame(
  artUrl = rep(got_meta_review$artUrl, sapply(got_sentences2, length)),
  commentContent = unlist(got_sentences2),
  commentPoster = rep(got_meta_review$commentPoster, sapply(got_sentences2, length))
) %>% 
  filter(!str_detect(commentContent, regex("^(\t|\n| )*$")))
got_sentences2$commentContent <- as.character(got_sentences2$commentContent)
#got_sentences2
tokens2 <- got_sentences2 %>%
  unnest_tokens(word, commentContent, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl,commentPoster, word) %>%
  rename(count=n)
tokens2
## # A tibble: 47,454 x 4
##    artUrl                                         commentPoster word  count
##    <fct>                                          <fct>         <chr> <int>
##  1 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          變三殺…     1
##  2 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          波頓      1
##  3 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          成長      1
##  4 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          個人      1
##  5 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          後期      1
##  6 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          腳色      1
##  7 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          敬佩      1
##  8 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          可以      1
##  9 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          裡要      1
## 10 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          龍母      1
## # … with 47,444 more rows
## 清理斷詞結果
reserved_word2 <- tokens2 %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 3) %>% 
  unlist()

got_removed2 <- tokens2 %>% 
  filter( word %in% reserved_word)
got_removed2
## # A tibble: 26,110 x 4
##    artUrl                                         commentPoster word  count
##    <fct>                                          <fct>         <chr> <int>
##  1 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          成長      1
##  2 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          個人      1
##  3 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          後期      1
##  4 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          腳色      1
##  5 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          可以      1
##  6 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          龍母      1
##  7 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          認為      1
##  8 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          瑟曦      1
##  9 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          珊莎      1
## 10 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie          什麼      1
## # … with 26,100 more rows
#將情緒與詞合併
got_word_sentiment <- got_removed2 %>%
  inner_join(LIWC_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
head(got_word_sentiment, 10)
## # A tibble: 10 x 5
##    artUrl                               commentPoster word  count sentiment
##    <fct>                                <fct>         <chr> <int> <fct>    
##  1 https://www.ptt.cc/bbs/EAseries/M.1… adie          可以      1 positive 
##  2 https://www.ptt.cc/bbs/EAseries/M.1… adie          私生子…     1 negative 
##  3 https://www.ptt.cc/bbs/EAseries/M.1… adm123        抱怨      1 negative 
##  4 https://www.ptt.cc/bbs/EAseries/M.1… adm123        敵人      1 negative 
##  5 https://www.ptt.cc/bbs/EAseries/M.1… adm123        奇怪      1 negative 
##  6 https://www.ptt.cc/bbs/EAseries/M.1… adm123        願意      1 positive 
##  7 https://www.ptt.cc/bbs/EAseries/M.1… adm123        重要      1 positive 
##  8 https://www.ptt.cc/bbs/EAseries/M.1… aquacomfort   不好      1 negative 
##  9 https://www.ptt.cc/bbs/EAseries/M.1… aquacomfort   不爽      1 negative 
## 10 https://www.ptt.cc/bbs/EAseries/M.1… aquacomfort   私生子…     1 negative
got_sentiment <- got_word_sentiment %>%
  left_join(posts, by="artUrl")
## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector
head(got_sentiment, 10)
## # A tibble: 10 x 14
##    artUrl commentPoster word  count sentiment artTitle artDate    artTime
##    <chr>  <fct>         <chr> <int> <fct>     <chr>    <date>     <time> 
##  1 https… adie          可以      1 positive  [心得]GoT… 2019-04-15 06:56  
##  2 https… adie          私生子…     1 negative  [心得]GoT… 2019-04-15 06:56  
##  3 https… adm123        抱怨      1 negative  [心得]GoT… 2019-04-15 06:56  
##  4 https… adm123        敵人      1 negative  [心得]GoT… 2019-04-15 06:56  
##  5 https… adm123        奇怪      1 negative  [心得]GoT… 2019-04-15 06:56  
##  6 https… adm123        願意      1 positive  [心得]GoT… 2019-04-15 06:56  
##  7 https… adm123        重要      1 positive  [心得]GoT… 2019-04-15 06:56  
##  8 https… aquacomfort   不好      1 negative  [心得]GoT… 2019-04-15 06:56  
##  9 https… aquacomfort   不爽      1 negative  [心得]GoT… 2019-04-15 06:56  
## 10 https… aquacomfort   私生子…     1 negative  [心得]GoT… 2019-04-15 06:56  
## # … with 6 more variables: artPoster <chr>, artCat <chr>,
## #   commentNum <dbl>, push <dbl>, boo <dbl>, sentence <chr>

依回文正負面情緒畫出社群網路圖

# ptt的回覆者情緒是正面或負面
link2 <- got_sentiment %>%
      filter(artDate == '2019-05-13') %>%
      select(commentPoster, artPoster, artUrl, sentiment)

# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述

## 篩選link中有出現的使用者
filtered_user2 <- userList %>%
          filter(user%in%link2$commentPoster | user%in%link2$artPoster) %>%
          arrange(desc(type))

## 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link2, v=filtered_user2, directed=F)

## 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type == "poster", "gold", "lightblue")

## 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$sentiment == "positive", "lightgreen", "palevioletred")
set.seed(788)
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,
     vertex.label=ifelse(degree(reviewNetwork) > 10, V(reviewNetwork)$label, NA),  vertex.label.ces=.5, family = "黑體-繁 中黑")

## 加入標示
legend("bottomright", c("poster","replyer"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=.8)
legend("topleft", c("positive","negative"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=.8)

情緒分析

統計每天的文章正面字的次數與負面字的次數

all_dates <- 
  expand.grid(seq(as.Date(min(posts$artDate)), as.Date(max(posts$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

got_data <- posts %>% 
  left_join(got_word_sentiment) 
## Joining, by = "artUrl"
## Warning: Column `artUrl` joining character vector and factor, coercing into
## character vector
got_data$artDate= got_data$artDate %>% as.Date("%Y/%m/%d")
got_data
## # A tibble: 2,081 x 14
##    artTitle artDate    artTime artUrl artPoster artCat commentNum  push
##    <chr>    <date>     <time>  <chr>  <chr>     <chr>       <dbl> <dbl>
##  1 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
##  2 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
##  3 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
##  4 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
##  5 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
##  6 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
##  7 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
##  8 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
##  9 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
## 10 [心得]GoT… 2019-04-15 06:56   https… drgraffi… EAser…        279   100
## # … with 2,071 more rows, and 6 more variables: boo <dbl>,
## #   sentence <chr>, commentPoster <fct>, word <chr>, count <int>,
## #   sentiment <fct>
plot_table <- got_data %>%
  select(artDate, word, count) %>%
  inner_join(LIWC_ch) %>% 
  group_by(artDate, sentiment) %>%
  summarise(count = sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
#沒有資料的日期將count設為0
plot_table <- all_dates %>% 
  merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))
plot_table
##       artDate sentiment count
## 1  2019-04-15  positive    56
## 2  2019-04-15  negative    41
## 3  2019-04-16  positive     0
## 4  2019-04-16  negative     0
## 5  2019-04-17  positive     0
## 6  2019-04-17  negative     0
## 7  2019-04-18  positive    20
## 8  2019-04-18  negative     9
## 9  2019-04-19  positive    12
## 10 2019-04-19  negative    14
## 11 2019-04-20  positive     9
## 12 2019-04-20  negative     4
## 13 2019-04-21  positive    23
## 14 2019-04-21  negative    26
## 15 2019-04-22  positive    43
## 16 2019-04-22  negative    18
## 17 2019-04-23  positive    56
## 18 2019-04-23  negative    46
## 19 2019-04-24  positive    20
## 20 2019-04-24  negative     3
## 21 2019-04-25  positive    18
## 22 2019-04-25  negative     5
## 23 2019-04-26  positive     3
## 24 2019-04-26  negative     4
## 25 2019-04-27  positive     2
## 26 2019-04-27  negative     2
## 27 2019-04-28  positive    43
## 28 2019-04-28  negative    23
## 29 2019-04-29  positive   151
## 30 2019-04-29  negative   135
## 31 2019-04-30  positive    90
## 32 2019-04-30  negative    43
## 33 2019-05-01  positive     1
## 34 2019-05-01  negative     0
## 35 2019-05-02  positive    34
## 36 2019-05-02  negative    27
## 37 2019-05-03  positive    20
## 38 2019-05-03  negative     1
## 39 2019-05-04  positive     3
## 40 2019-05-04  negative     2
## 41 2019-05-05  positive    10
## 42 2019-05-05  negative     5
## 43 2019-05-06  positive    58
## 44 2019-05-06  negative    47
## 45 2019-05-07  positive    39
## 46 2019-05-07  negative    27
## 47 2019-05-08  positive    80
## 48 2019-05-08  negative    58
## 49 2019-05-09  positive    72
## 50 2019-05-09  negative    21
## 51 2019-05-10  positive     9
## 52 2019-05-10  negative     3
## 53 2019-05-11  positive    19
## 54 2019-05-11  negative     2
## 55 2019-05-12  positive    40
## 56 2019-05-12  negative    30
## 57 2019-05-13  positive    99
## 58 2019-05-13  negative    66
## 59 2019-05-14  positive   125
## 60 2019-05-14  negative    94
## 61 2019-05-15  positive    88
## 62 2019-05-15  negative    43
## 63 2019-05-16  positive    78
## 64 2019-05-16  negative    44
## 65 2019-05-17  positive    61
## 66 2019-05-17  negative    26
## 67 2019-05-18  positive    14
## 68 2019-05-18  negative     6
ggplot(plot_table, aes(x = artDate, y = count, colour = sentiment)) +
  geom_line() +
  facet_wrap(~ sentiment)


由上圖可知,因為冰與火之歌在每週一推出一集,4/22、4/29、5/6、5/13都有較高的情緒表現,正負面的情緒都很高。

正負面情緒出現較高的前十個tokens

got_data %>%
  select(artDate, word, count) %>%
  inner_join(LIWC_ch) %>%
  # Count by word and sentiment
  count(word, sentiment) %>%
  # Group by sentiment
  group_by(sentiment) %>%
  # Take the top 10 words for each sentiment
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  # Set up the plot with aes()
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = "黑體-繁 中黑"))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Selecting by n