Many classic animated films come from the two big studios, Disney (迪士尼) and Pixar (皮克斯), whose signature works differ markedly in style. Below we compare the two studios through the volume and content of social-media discussion of their films.
The raw CSV files contain PTT articles and comments collected via the NSYSU College of Management text-analysis platform.
Through that platform we retrieved PTT posts matching Pixar-related keywords such as 『皮克斯』, 『Pixar』, and 『玩具總動員』 (and other Pixar film titles), and Disney-related keywords such as 『迪士尼』 and 『冰雪奇緣』 (and other Disney film titles). This yielded 711 Pixar-related articles and 822 Disney-related articles.
# Install any missing packages (RColorBrewer added here since it is loaded below)
packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr", "wordcloud2", "scales", "tm", "reshape2", "widyr", "ggraph", "tidyverse", "textstem", "e1071", "rpart.plot", "rpart", "caTools", "caret", "RColorBrewer")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

# Load the packages
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(scales)
library(tm)
library(reshape2)
library(widyr)
library(ggraph)
library(tidyverse)
library(RColorBrewer)
library(textstem)
library(e1071)
library(rpart)
library(rpart.plot)
library(caTools)
library(caret)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

# Read the data
# Disney posts
disney_posts <- read_csv("期末_迪士尼_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# Disney comments
disney_reviews <- read_csv("期末_迪士尼_articleReviews.csv")
# Pixar posts
pixar_posts <- read_csv("期末_皮克斯_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# Pixar comments
pixar_reviews <- read_csv("期末_皮克斯_articleReviews.csv")
# All posts
all_posts <- read_csv("all_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# All comments
all_reviews <- read_csv("all_articleReviews.csv")

# Convert the date columns to Date format
disney_posts$artDate <- disney_posts$artDate %>% as.Date("%Y/%m/%d")
disney_reviews$artDate <- disney_reviews$artDate %>% as.Date("%Y/%m/%d")
pixar_posts$artDate <- pixar_posts$artDate %>% as.Date("%Y/%m/%d")
pixar_reviews$artDate <- pixar_reviews$artDate %>% as.Date("%Y/%m/%d")
all_posts$artDate <- all_posts$artDate %>% as.Date("%Y/%m/%d")
all_reviews$artDate <- all_reviews$artDate %>% as.Date("%Y/%m/%d")

# Split Disney posts into sentences
disney_posts_sentences <- strsplit(disney_posts$sentence,"[.。!;?!?;]")
disney_posts_sentences <- data.frame(
artUrl = rep(disney_posts$artUrl, sapply(disney_posts_sentences, length)),
sentence = unlist(disney_posts_sentences),
artDate = rep(disney_posts$artDate, sapply(disney_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
disney_posts_sentences$sentence <- as.character(disney_posts_sentences$sentence)
# Split Pixar posts into sentences
pixar_posts_sentences <- strsplit(pixar_posts$sentence,"[.。!;?!?;]")
pixar_posts_sentences <- data.frame(
artUrl = rep(pixar_posts$artUrl, sapply(pixar_posts_sentences, length)),
sentence = unlist(pixar_posts_sentences),
artDate = rep(pixar_posts$artDate, sapply(pixar_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
pixar_posts_sentences$sentence <- as.character(pixar_posts_sentences$sentence)
# Split all posts into sentences
all_posts_sentences <- strsplit(all_posts$sentence, "[.。!;?!?;]")
all_posts_sentences <- data.frame(
artUrl = rep(all_posts$artUrl, sapply(all_posts_sentences, length)),
sentence = unlist(all_posts_sentences),
artDate = rep(all_posts$artDate, sapply(all_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
all_posts_sentences$sentence <- as.character(all_posts_sentences$sentence)

# Initialize a jieba tokenizer.
# Note: despite what the original comment claimed ("default parameters, no
# dictionary or stop words"), this worker does use a custom user dictionary
# and a stop-word list.
jieba_tokenizer = worker(user="all_dict/user_dict.txt", stop_word="all_dict/stop_words.txt")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# Drop single-character tokens
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}

# Tokenize each corpus and drop tokens containing digits or Latin letters
disney_posts_tokens <- disney_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
pixar_posts_tokens <- pixar_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
all_posts_tokens <- all_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))

# Disney synonym normalization
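As a quick sanity check, the tokenizer can be called directly on a sample sentence (a sketch; the exact segmentation depends on all_dict/user_dict.txt):

chi_tokenizer("艾莎是冰雪奇緣的女主角")
## e.g. [[1]] "艾莎" "冰雪奇緣" "女主角" (actual output depends on the user dictionary)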
disney_synonym <- function(i)
{
gsub("disney", "迪士尼", i) %>%
gsub("冰雪", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣2", "冰雪奇緣2", .) %>%
gsub("冰雪2", "冰雪奇緣2", .) %>%
gsub("frozen", "冰雪奇緣", .) %>%
gsub("frozen2", "冰雪奇緣2", .) %>%
gsub("Tangled", "魔髮奇緣", .) %>% # Tangled's Taiwan title is 魔髮奇緣 (魔法公主 is Princess Mononoke)
gsub("WreckItRalph", "無敵破壞王", .) %>%
gsub("BigHero", "大英雄天團", .) %>%
gsub("Zootopia", "動物方程式", .) %>%
gsub("Moana", "海洋奇緣", .) %>%
gsub("RalphBreakstheInternet", "無敵破壞王2", .) %>%
gsub("Rapunzel", "樂佩", .) %>% # female lead of Tangled
gsub("elsa", "艾莎", .) %>% # female lead of Frozen
gsub("anna", "安娜", .) %>% # female lead of Frozen
gsub("Ralph", "雷夫", .) %>% # lead of Wreck-It Ralph
gsub("Vanellope", "雲妮露", .) %>% # lead of Wreck-It Ralph
gsub("Felix", "阿修", .) %>% # lead of Wreck-It Ralph
gsub("Hiro", "阿廣", .) %>% # lead of Big Hero 6
gsub("Baymax", "杯麵", .) %>% # lead of Big Hero 6
gsub("Tadashi", "阿正", .) %>% # lead of Big Hero 6
gsub("Judy", "哈茱蒂", .) %>% # lead of Zootopia
gsub("艾爾莎", "艾莎", .)
}
## Apply the function to the post tokens
disney_posts_tokens$word <- disney_synonym(disney_posts_tokens$word)
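A quick check of the replacement rules and their ordering (a sketch; the expected values assume the rules above). Note that gsub is case-sensitive, and that tokens containing Latin characters were filtered out before this step, so the English-language rules only take effect if the function is applied to text that still contains them:

disney_synonym(c("elsa", "frozen2", "冰雪", "冰雪奇緣", "艾爾莎"))
## expected: "艾莎" "冰雪奇緣2" "冰雪奇緣" "冰雪奇緣" "艾莎"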
# Pixar synonym normalization
pixar_synonym <- function(i)
{
gsub("woody", "胡迪", i) %>%
gsub("dory", "多莉", .) %>%
gsub("toy story", "玩具總動員", .)
}
## Apply the function to the post tokens
pixar_posts_tokens$word <- pixar_synonym(pixar_posts_tokens$word)

# Combined synonym normalization for the full corpus
all_synonym <- function(i)
{
gsub("disney", "迪士尼", i) %>%
gsub("冰雪", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣2", "冰雪奇緣2", .) %>%
gsub("冰雪2", "冰雪奇緣2", .) %>%
gsub("frozen", "冰雪奇緣", .) %>%
gsub("frozen2", "冰雪奇緣2", .) %>%
gsub("Tangled", "魔髮奇緣", .) %>% # Tangled's Taiwan title is 魔髮奇緣
gsub("WreckItRalph", "無敵破壞王", .) %>%
gsub("BigHero", "大英雄天團", .) %>%
gsub("Zootopia", "動物方程式", .) %>%
gsub("Moana", "海洋奇緣", .) %>%
gsub("RalphBreakstheInternet", "無敵破壞王2", .) %>%
gsub("Rapunzel", "樂佩", .) %>% # female lead of Tangled
gsub("elsa", "艾莎", .) %>% # female lead of Frozen
gsub("anna", "安娜", .) %>% # female lead of Frozen
gsub("Ralph", "雷夫", .) %>% # lead of Wreck-It Ralph
gsub("Vanellope", "雲妮露", .) %>% # lead of Wreck-It Ralph
gsub("Felix", "阿修", .) %>% # lead of Wreck-It Ralph
gsub("Hiro", "阿廣", .) %>% # lead of Big Hero 6
gsub("Baymax", "杯麵", .) %>% # lead of Big Hero 6
gsub("Tadashi", "阿正", .) %>% # lead of Big Hero 6
gsub("Judy", "哈茱蒂", .) %>% # lead of Zootopia
gsub("艾爾莎", "艾莎", .) %>%
gsub("woody", "胡迪", .) %>%
gsub("dory", "多莉", .) %>%
gsub("toy story", "玩具總動員", .)
}
# Apply the function to the post tokens
all_posts_tokens$word <- all_synonym(all_posts_tokens$word)

# Plot the daily article counts over time
all_posts %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot(aes(artDate,count)) +
geom_line(color="skyblue", size=1.2)+
scale_x_date(labels = date_format("%Y-%m-%d"))+
theme_classic() -> p
## `summarise()` ungrouping output (override with `.groups` argument)
p + geom_vline(xintercept = as.Date('2014-12-31'),color = "red", linetype = "dashed",size=0.5)+ # Big Hero 6
geom_vline(xintercept = as.Date('2019-11-21'),color = "red", linetype = "dashed",size=0.5)+ # Frozen 2
geom_vline(xintercept = as.Date('2015-08-07'),color = "darkgreen", linetype = "dashed",size=0.5)+ # Inside Out
geom_vline(xintercept = as.Date('2019-06-20'),color = "darkgreen", linetype = "dashed",size=0.5) # Toy Story 4

The plot shows sharp peaks and troughs in discussion volume: a film is discussed intensely while it is in theaters and hardly at all once it leaves. The analysis below focuses on four films: Disney's Big Hero 6 (大英雄天團) and Frozen 2 (冰雪奇緣2), and Pixar's Inside Out (腦筋急轉彎) and Toy Story 4 (玩具總動員4).
Red dashed lines mark the Disney releases; green dashed lines mark the Pixar releases.
# Total term frequency across all posts
disney_posts_tokens_count <- disney_posts_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 30) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
#head(disney_posts_tokens_count)
pixar_posts_tokens_count <- pixar_posts_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 30) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)

# Word cloud (commented out in the original)
#disney_clean_words <- c("迪士尼","電影","動畫")
#disney_posts_tokens_count %>%
#filter(!(disney_posts_tokens_count$word %in% disney_clean_words)) %>%
#filter(sum > 100) #%>%
#wordcloud2()

Among Disney-related posts, Frozen (冰雪奇緣) draws the most discussion, and its two princesses, Elsa (艾莎) and Anna (安娜), are the most-discussed characters; most other frequent terms are film titles and major character names. The word 歌曲 (songs) also appears often, reflecting the popularity of Disney's soundtracks.

#pixar_clean_words <- c("皮克斯","電影","動畫")
#pixar_posts_tokens_count %>%
#filter(!(pixar_posts_tokens_count$word %in% pixar_clean_words)) %>%
#filter(sum > 100) #%>%
#wordcloud2()

Among Pixar-related posts, Toy Story (玩具總動員) draws the most discussion, with Woody (胡迪) the most-discussed character; the other frequent terms are again film titles and major character names.
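For reference, a minimal runnable version of the commented-out Disney word cloud above (a sketch; wordcloud2 renders an HTML widget, so it is best run interactively):

disney_clean_words <- c("迪士尼","電影","動畫") # generic high-frequency terms to exclude
disney_posts_tokens_count %>%
filter(!(word %in% disney_clean_words)) %>%
filter(sum > 100) %>%
wordcloud2()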
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
elsa_posts_ngram <- disney_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
elsa_posts_ngram_separated <- elsa_posts_ngram %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
#head(elsa_posts_ngram_separated)
elsa_posts_check_words <- elsa_posts_ngram_separated %>%
filter((word6 == "艾莎"))
#elsa_posts_check_words
elsa_posts_check_words_count <- elsa_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
elsa_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「艾莎」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Princess Anna (安娜), Elsa's (艾莎) younger sister, has the highest count; other terms such as 魔法 (magic) and 公主 (princess) are also closely associated with her.
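The same 11-gram neighborhood analysis is repeated below for 杯麵, 胡迪, and 樂樂. A reusable helper consolidating the pattern might look like the following sketch (it filters with stop_words$word, which the inline chunks appear to intend with their stop_words filter):

# Count the words appearing within five tokens of `target` (a sketch)
neighbor_word_counts <- function(posts, target) {
posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]"))) %>%
separate(ngram, paste0("word", 1:11), sep = " ") %>%
filter(word6 == target) %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word = value) %>%
filter(variable != "word6") %>%
filter(!(word %in% stop_words$word), nchar(word) > 1) %>%
count(word, sort = TRUE)
}
# e.g. neighbor_word_counts(disney_posts, "艾莎") reproduces the counts above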
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
baymax_posts_ngram <- disney_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
baymax_posts_ngram_separated <- baymax_posts_ngram %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
#head(baymax_posts_ngram_separated)
baymax_posts_check_words <- baymax_posts_ngram_separated %>%
filter((word6 == "杯麵"))
#baymax_posts_check_words
baymax_posts_check_words_count <- baymax_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
baymax_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「杯麵」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Baymax (杯麵) is the robot (機器人) invented by Tadashi (阿正), older brother of the protagonist Hiro (阿廣); the nearby terms reflect his trademark cuteness (可愛) and caregiving (照顧).
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
woody_posts_ngram <- pixar_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
woody_posts_ngram_separated <- woody_posts_ngram %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
#head(woody_posts_ngram_separated)
woody_posts_check_words <- woody_posts_ngram_separated %>%
filter((word6 == "胡迪"))
#woody_posts_check_words
woody_posts_check_words_count <- woody_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
woody_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「胡迪」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Woody (胡迪) is one of Andy's (安弟) toys (玩具); other nearby terms include Woody's companions (夥伴) such as Buzz (巴斯) and Bo Peep (牧羊女).
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
happy_posts_ngram <- pixar_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
happy_posts_ngram_separated <- happy_posts_ngram %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
#head(happy_posts_ngram_separated)
happy_posts_check_words <- happy_posts_ngram_separated %>%
filter((word6 == "樂樂"))
#happy_posts_check_words
happy_posts_check_words_count <- happy_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
happy_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「樂樂」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Joy (樂樂) is one of the emotion (情緒) characters (角色) in Riley's (萊莉) brain (大腦); Sadness (憂憂) is her opposite number.
# Disney
# The earlier tokenization did not use the updated dictionary,
# so we re-tokenize and count each term's occurrences per article
disney_posts_words <- disney_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
# Count how often two terms co-occur in the same article
disney_word_pairs <- disney_posts_words %>%
pairwise_count(word, artUrl, sort = TRUE)
## Warning: `distinct_()` is deprecated as of dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Compute pairwise correlations between terms
disney_word_cors <- disney_posts_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
# Show pairs with correlation above 0.5
set.seed(2020)
disney_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE,family='STHeitiTC-Light') + # set a Chinese font so CJK labels render correctly
theme_void()

Posts sharing viewing impressions (觀影心得) also discuss whether a post is a spoiler (雷文) and whether a spoiler-warning divider (防雷分隔線) is needed, while Big Hero 6 (大英雄天團) threads revolve around the robot Baymax (杯麵) and invention-related terms.
# Pixar
# The earlier tokenization did not use the updated dictionary,
# so we re-tokenize and count each term's occurrences per article
pixar_posts_words <- pixar_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
# Count how often two terms co-occur in the same article
pixar_word_pairs <- pixar_posts_words %>%
pairwise_count(word, artUrl, sort = TRUE)
# Compute pairwise correlations between terms
pixar_word_cors <- pixar_posts_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
# Show pairs with correlation above 0.5
set.seed(2020)
pixar_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE,family='STHeitiTC-Light') + # set a Chinese font so CJK labels render correctly
theme_void()

Threads on Inside Out (腦筋急轉彎) discuss not only the emotion characters but also memory (記憶) and the subconscious (潛意識); Toy Story (玩具總動員) threads apparently involve a lot of spoiler talk, which is why divider-line (分隔線) terms surface.
# Positive-word dictionary (txt)
# entries are comma-separated
P <- read_file("liwc/positive.txt")
# Negative-word dictionary (txt)
N <- read_file("liwc/negative.txt")
# Split the strings on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build a data frame with columns word and sentiment
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
# Join tokens against the LIWC dictionary to label
# each word in the corpus as positive or negative
### Count term frequencies
disney_posts_tokens <- disney_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date <- disney_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
disney_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Joining, by = "word"
## # A tibble: 199 x 2
## word sentiment
## <chr> <fct>
## 1 破壞 negative
## 2 破壞 negative
## 3 破壞 negative
## 4 破壞 negative
## 5 遊戲 positive
## 6 情緒 negative
## 7 勇氣 positive
## 8 遊戲 positive
## 9 遊戲 positive
## 10 破壞 negative
## # … with 189 more rows
pixar_posts_tokens <- pixar_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date <- pixar_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
pixar_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Joining, by = "word"
## # A tibble: 237 x 2
## word sentiment
## <chr> <fct>
## 1 寶貝 positive
## 2 解決 positive
## 3 情緒 negative
## 4 情緒 negative
## 5 驚嚇 negative
## 6 情緒 negative
## 7 驚嚇 negative
## 8 情緒 negative
## 9 情緒 negative
## 10 情緒 negative
## # … with 227 more rows
all_posts_tokens <- all_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
all_tokens_by_date <- all_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
all_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Joining, by = "word"
## # A tibble: 490 x 2
## word sentiment
## <chr> <fct>
## 1 寶貝 positive
## 2 解決 positive
## 3 破壞 negative
## 4 情緒 negative
## 5 情緒 negative
## 6 驚嚇 negative
## 7 遊戲 positive
## 8 情緒 negative
## 9 破壞 negative
## 10 驚嚇 negative
## # … with 480 more rows
# Count daily totals of positive and negative words
disney_sentiment_count = disney_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
pixar_sentiment_count = pixar_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
all_sentiment_count = all_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
all_sentiment_count %>%
ggplot() +
geom_line(aes(x=artDate,y=count,colour=sentiment)) +
scale_x_date(labels = date_format("%Y/%m/%d")) -> p
p + geom_vline(xintercept = as.Date('2014-12-31'),color = "red", linetype = "dashed",size=0.2)+ # Big Hero 6
geom_vline(xintercept = as.Date('2019-11-21'),color = "red", linetype = "dashed",size=0.2)+ # Frozen 2
geom_vline(xintercept = as.Date('2015-08-07'),color = "darkgreen", linetype = "dashed",size=0.2)+ # Inside Out
geom_vline(xintercept = as.Date('2019-06-20'),color = "darkgreen", linetype = "dashed",size=0.2) # Toy Story 4

Red marks the Disney releases, green the Pixar releases. Toy Story 4 (玩具總動員4) stands out for its volume of positive words, suggesting it was received as a distinctly positive film; the other three films show no clear difference.
disney_posts_select_frozen <- disney_posts %>%
filter(str_detect(disney_posts$artTitle, "冰雪奇緣2") | str_detect(disney_posts$artTitle, "frozen2"))
# Count term frequencies
disney_posts_tokens_frozen <- disney_posts_select_frozen %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date_frozen <- disney_posts_tokens_frozen %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count daily totals of positive and negative words
sentiment_count_frozen = disney_tokens_by_date_frozen %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
disney_posts_tokens_frozen %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『冰雪奇緣2』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

For Frozen 2 (冰雪奇緣2), frequent positive words include 喜歡 (like), 相信 (trust), 冒險 (adventure), and 愛情 (love); frequent negative words include 失去 (loss), 恐懼 (fear), and 害怕 (afraid).
disney_posts_select_hero <- disney_posts %>%
filter(str_detect(disney_posts$artTitle, "大英雄天團") | str_detect(disney_posts$artTitle, "big hero 6"))
# Count term frequencies
disney_posts_tokens_hero <- disney_posts_select_hero %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date_hero <- disney_posts_tokens_hero %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count daily totals of positive and negative words
sentiment_count_hero = disney_tokens_by_date_hero %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
disney_posts_tokens_hero %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『大英雄天團』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

For Big Hero 6 (大英雄天團), frequent positive words include 英雄 (hero), 溫暖 (warmth), 溫馨 (heartwarming), and 分享 (sharing); frequent negative words include 反派 (villain), 復仇 (revenge), 戰鬥 (fighting), and 失去 (loss).
pixar_posts_select_toy <- pixar_posts %>%
filter(str_detect(pixar_posts$artTitle, "玩具總動員4") | str_detect(pixar_posts$artTitle, "toy4"))
# Count term frequencies
pixar_posts_tokens_toy <- pixar_posts_select_toy %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date_toy <-pixar_posts_tokens_toy %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count daily totals of positive and negative words
sentiment_count_toy = pixar_tokens_by_date_toy %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
pixar_posts_tokens_toy %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『玩具總動員4』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

For Toy Story 4 (玩具總動員4), frequent positive words include 自由 (freedom), 朋友 (friends), 重要 (important), and 冒險 (adventure); frequent negative words include 失去 (loss), 放下 (letting go), and 反派 (villain).
pixar_posts_select_brain <- pixar_posts %>%
filter(str_detect(pixar_posts$artTitle, "腦筋急轉彎") | str_detect(pixar_posts$artTitle, "inside out"))
# Count term frequencies
pixar_posts_tokens_brain <- pixar_posts_select_brain %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date_brain <-pixar_posts_tokens_brain %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count daily totals of positive and negative words
sentiment_count_brain = pixar_tokens_by_date_brain %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
pixar_posts_tokens_brain %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『腦筋急轉彎』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

For Inside Out (腦筋急轉彎), frequent positive words include 希望 (hope), 冒險 (adventure), and 相信 (trust); frequent negative words include 憂傷 (sorrow), 難過 (sadness), and 控制 (control).
# Convert the tokens into a Document-Term Matrix (DTM)
#all_posts_tokens <- all_posts %>%
#unnest_tokens(word, sentence, token=chi_tokenizer) %>%
#filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
#count(artUrl, word) %>%
#rename(count=n)
#all_posts_dtm <- all_posts_tokens %>% cast_dtm(artUrl, word, count)
#inspect(all_posts_dtm[1:10,1:10])

# Fit LDA models with several topic counts and cache the results
#ldas_allPosts = c()
#topics = c(3, 5, 10, 25, 36)
#for(topic in topics){
#start_time <- Sys.time()
#lda_allPosts <- LDA(all_posts_dtm, k = topic, control = list(seed = 2020))
#ldas_allPosts = c(ldas_allPosts, lda_allPosts)
#print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
#save(ldas_allPosts, file = "ldas_result_allPosts")
#}

# Compare perplexity across topic counts
#topics = c(3, 5, 10, 25, 36)
#data_frame(k = topics,
# perplex = map_dbl(ldas_allPosts, topicmodels::perplexity)) %>%
#ggplot(aes(k, perplex)) +
#geom_point() +
#geom_line() +
#labs(title = "Evaluating LDA topic models",
#subtitle = "Optimal number of topics (smaller is better)",
# x = "Number of topics",
#y = "Perplexity")

Using perplexity to compare topic counts, we settle on 10 topics for the rest of the analysis.
new_lda_allPosts = ldas_allPosts[[3]] ## the k = 10 model (third entry of topics = c(3, 5, 10, 25, 36))
topics_allPosts <- tidy(new_lda_allPosts, matrix = "beta") # use "beta" in tidy() to extract the phi matrix
remove_words = c("冰雪奇緣", "迪士尼", "艾莎", "安娜", "無敵破壞王", "電影", "劇情", "動畫", "這部", "角色", "公主", "我們", "網路", "世界")
top_terms_allPosts <- topics_allPosts %>%
filter(!term %in% remove_words)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_allPosts %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()+
theme(text = element_text(family='STHeitiTC-Light'))

Topic 1 - Pixar (皮克斯)
Topic 2 - Toy Story (玩具總動員)
Topic 3 - Disney (迪士尼)
Topic 4 - Frozen (冰雪奇緣)
Topic 5 - Inside Out (腦筋急轉彎)
Topic 6 - Box office (票房)
Topic 7 - Coco (可可夜總會)
Topic 8 - Big Hero 6 (大英雄天團)
Topic 9 - Zootopia (動物方程式)
Topic 10 - Wreck-It Ralph (無敵破壞王)

We select Topic 1 (Pixar), Topic 3 (Disney), and Topic 6 (box office) as the main topics for further discussion.
# Topic labels
topic_name = c('皮克斯', '玩具總動員', '迪士尼', '冰雪奇緣', '腦筋急轉彎', '票房', '可可夜總會', '大英雄天團', '動物方程式', '無敵破壞王')

tmResult <- posterior(new_lda_allPosts)
doc_pro <- tmResult$topics
#dim(doc_pro)
allPosts_data <- all_posts %>%
select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics <- doc_pro[allPosts_data$artUrl,]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
news_topic = cbind(allPosts_data, document_topics_df)
#news_topic %>% head(10)

# Topic proportions by year
news_topic %>%
group_by(artDate = format(artDate,'%Y')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))

Pixar's share of discussion runs slightly higher than Disney's, and the box-office (票房) topic gains discussion share over time.
# The definition of all_poster is not shown in the original; presumably,
# mirroring the movie-board chunk further below:
all_poster <- c(all_posts$artPoster, all_reviews$cmtPoster)
# Total number of participants (presumably length(unique(all_poster))):
## [1] 19977
# Collect every user who appears;
# label users who have posted as "poster"
# and everyone else as "replyer"
userList <- data.frame(user=unique(all_poster)) %>%
mutate(type=ifelse(user%in%all_posts$artPoster, "poster", "replyer"))

## Build the social network graph
postsReviews <- merge(x = all_posts, y = all_reviews, by = "artUrl")
#postsReviews
# Keep three columns: cmtPoster (commenter), artPoster (author), artUrl (article URL)
link <- postsReviews %>%
select(cmtPoster, artPoster, artUrl)
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
# Draw the network graph
#plot(reviewNetwork)
# Shrink the nodes and edges and hide user labels
#plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)

# Visualize LDA topics on the network
new_lda_allPosts5 = ldas_allPosts[[3]]
topics_allPosts5 <- tidy(new_lda_allPosts5, matrix = "beta")
# Assign each article its dominant LDA topic
animate_topics <- tidy(new_lda_allPosts5, matrix="gamma") %>% # use "gamma" in tidy() to extract the theta matrix
group_by(document) %>%
top_n(1, wt=gamma)
# Join article metadata with topic assignments
postsReviews <- merge(x = postsReviews, y = animate_topics, by.x = "artUrl", by.y="document")
# Keep articles after 2014-01-01,
# commenters with more than 15 comments on a single article,
# and articles classified as topic 1 (Pixar) or topic 3 (Disney);
# keep only cmtPoster (commenter), artPoster (author), artUrl, and topic
link <- postsReviews %>%
filter(artDate > as.Date('2014-01-01')) %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>15) %>%
ungroup() %>%
filter(topic == 1 | topic == 3 ) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))

# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color edges by the topic of the article being commented on
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred")
# Draw the network
set.seed(5432)
plot(reviewNetwork, vertex.size=1, edge.width=1, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 2, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("皮克斯","迪士尼"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)

"filmwalker" most often joins Pixar (皮克斯) threads, while "AACKball", "xslayer", and "YanbinCao" most often join Disney (迪士尼) threads.
Next we examine these opinion leaders' posts and the comments they received.
disney_leader <- c("AACKball", "xslayer", "YanbinCao")
pixar_leader <- c("filmwalker", "mysmalllamb")
leader_posts <- all_posts %>%
filter(all_posts$artPoster %in% disney_leader)
leader_posts
## # A tibble: 4 x 10
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [普好雷]冰雪… 2019-11-22 22:07:32 https… xslayer movie 208 40 2
## 2 [討論]冰雪奇… 2019-11-30 23:15:47 https… AACKball movie 174 99 0
## 3 [好普負雷]冰… 2019-12-03 00:43:36 https… YanbinCao movie 500 100 2
## 4 Re:[好雷]… 2019-12-04 20:01:40 https… YanbinCao movie 371 63 15
## # … with 1 more variable: sentence <chr>
## # A tibble: 742 x 4
## artUrl cmtPoster cmtStatus cmtContent
## <chr> <chr> <chr> <chr>
## 1 https://www.ptt.cc/bbs/movi… AACKball 推 :歌是中文翻唱,中配的缺點是我覺得艾莎有點顯老,…
## 2 https://www.ptt.cc/bbs/movi… AACKball → :裡面的歌翻中文唱得還不錯,不過歌手唱腔畢竟不同,…
## 3 https://www.ptt.cc/bbs/movi… AACKball → :要看個人喜好了。
## 4 https://www.ptt.cc/bbs/movi… AACKball 推 :我沒看過冰雪一的中文也無從比較。不過我覺得二是…
## 5 https://www.ptt.cc/bbs/movi… AACKball → :不錯啦,當然也有可能是因為我前幾天才看過重甲機…
## 6 https://www.ptt.cc/bbs/movi… AACKball → :神......
## 7 https://www.ptt.cc/bbs/movi… AACKball 推 :艾莎都被困了,還有心情慢慢洩洪嗎XD…
## 8 https://www.ptt.cc/bbs/movi… AACKball 推 :艾莎進入洞窟深處發現水壩的真相,但是陷入迷惘,因…
## 9 https://www.ptt.cc/bbs/movi… AACKball → :為兩邊都是重要的人,她不知道要如何解決而被困在回…
## 10 https://www.ptt.cc/bbs/movi… AACKball → :憶裡,周圍也呼應了雪寶一直在提的「水有記憶」。…
## # … with 732 more rows
# Tokens from the Disney leaders' posts. The definition of
# disney_leader_posts_tokens is not shown in the original; presumably,
# mirroring the Pixar chunk below:
disney_leader_posts_tokens <- all_posts_tokens %>%
filter(all_posts_tokens$artPoster %in% disney_leader)
# Term frequencies
disney_leader_posts_tokens_count <- disney_leader_posts_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
# Word cloud (commented out)
#disney_leader_posts_tokens_count %>%
#filter(!(disney_leader_posts_tokens_count$word %in% disney_clean_words)) %>%
#filter(sum > 10) %>%
#wordcloud2()

The Disney opinion leaders' posts center on Frozen's characters and plot.
pixar_leader_posts_tokens <- all_posts_tokens %>%
filter(all_posts_tokens$artPoster %in% pixar_leader)
# Term frequencies
pixar_leader_posts_tokens_count <- pixar_leader_posts_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
# Word cloud (commented out)
#pixar_leader_posts_tokens_count %>%
#filter(!(pixar_leader_posts_tokens_count$word %in% pixar_clean_words)) %>%
#filter(sum > 10) %>%
#wordcloud2()

The more heavily discussed Pixar posts touch on life, education, adventure, and other themes close to everyday experience.
disney_leader_sentiment_count = disney_leader_posts_tokens_count %>%
select(word, sum) %>%
inner_join(LIWC) %>%
group_by(sentiment) %>%
summarise(count=sum(sum))
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## sentiment count
## <fct> <int>
## 1 positive 12
## 2 negative 26

The Disney opinion leaders' posts contain noticeably more negative terms than positive ones.
disney_leader_posts_tokens %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "迪士尼文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

Positive words include 希望 (hope), 拯救 (rescue), 願意 (willingness), and 順利 (going smoothly); negative words include 自我 (ego), 恐懼 (fear), 壓抑 (repression), and 害怕 (afraid).
pixar_leader_sentiment_count = pixar_leader_posts_tokens_count %>%
select(word, sum) %>%
inner_join(LIWC) %>%
group_by(sentiment) %>%
summarise(count=sum(sum))
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## sentiment count
## <fct> <int>
## 1 positive 49
## 2 negative 13

The Pixar opinion leaders' posts contain noticeably more positive terms than negative ones.
pixar_leader_posts_tokens %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "皮克斯意見領袖文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

Positive words include 冒險 (adventure), 鼓勵 (encouragement), 分享 (sharing), and 渴望 (longing); negative words include 問題 (problems), 恐懼 (fear), 焦慮 (anxiety), and 討厭 (dislike).
disney_leader_article_url <- disney_leader_posts_tokens %>%
group_by(disney_leader_posts_tokens$artUrl) %>%
select(artUrl)
## Adding missing grouping variables: `disney_leader_posts_tokens$artUrl`
disney_leader_article_url <- unique(disney_leader_article_url)
pixar_leader_article_url <- pixar_leader_posts_tokens %>%
group_by(pixar_leader_posts_tokens$artUrl) %>%
select(artUrl)
## Adding missing grouping variables: `pixar_leader_posts_tokens$artUrl`
## Joining, by = "artUrl"
## # A tibble: 10 x 5
## artUrl cmtPoster cmtStatus cmtContent `disney_leader_posts_…
## <chr> <chr> <chr> <chr> <chr>
## 1 https://www.pt… sunlockfi… 推 :你真的知道政治正確是什麼意思嗎… https://www.ptt.cc/bb…
## 2 https://www.pt… xslayer → :就是國王要配王后啊?不然咧?… https://www.ptt.cc/bb…
## 3 https://www.pt… yehsongyo… → :換個角度想,就是Anna成熟到… https://www.ptt.cc/bb…
## 4 https://www.pt… yehsongyo… → :Elsa也能放心當第五靈成為人… https://www.ptt.cc/bb…
## 5 https://www.pt… leoheart1… 噓 :我只覺得Elsa把自己不想背的… https://www.ptt.cc/bb…
## 6 https://www.pt… leoheart1… → :。這個結局讓我暴怒orz… https://www.ptt.cc/bb…
## 7 https://www.pt… valentian 推 :其實就只是想撈第3集的鋪陳Or… https://www.ptt.cc/bb…
## 8 https://www.pt… leoheart1… 推 :剛按到噓,補血orzzz… https://www.ptt.cc/bb…
## 9 https://www.pt… valentian → :女王沒有孩子也OK。伊莉莎白也… https://www.ptt.cc/bb…
## 10 https://www.pt… valentian → :也可繼位,不覺得是國王王后的窠… https://www.ptt.cc/bb…
The comments on the Disney opinion leaders' posts mostly debate the ending of Frozen (冰雪奇緣).
## Joining, by = "artUrl"
## # A tibble: 10 x 5
## artUrl cmtPoster cmtStatus cmtContent `pixar_leader_posts_t…
## <chr> <chr> <chr> <chr> <chr>
## 1 https://www.pt… minoru04 → :這樣有需要發新聞片稿費喔… https://www.ptt.cc/bb…
## 2 https://www.pt… s87269x 推 :藏成這樣要發現也太難了… https://www.ptt.cc/bb…
## 3 https://www.pt… toto3527 推 :這部老實說還好耶劇情太普通太好… https://www.ptt.cc/bb…
## 4 https://www.pt… sign0 推 :這是'大家來找碴'吧瞬間晃過去… https://www.ptt.cc/bb…
## 5 https://www.pt… kikielle 推 :為什麼大家都喜歡劇情難猜的?… https://www.ptt.cc/bb…
## 6 https://www.pt… faang 推 :劇情太複雜,小朋友在戲院又要跟… https://www.ptt.cc/bb…
## 7 https://www.pt… torukumato 推 :要動腦又要BOSS猜一猜請看柯… https://www.ptt.cc/bb…
## 8 https://www.pt… edwar 推 :娛樂片當推理片在看,比如冰雪的… https://www.ptt.cc/bb…
## 9 https://www.pt… stevexbuc… 推 :是有多好猜 https://www.ptt.cc/bb…
## 10 https://www.pt… aku192 推 :劇情太好猜的話電影就會變的很無… https://www.ptt.cc/bb…
The comments on the Pixar opinion leaders' posts mostly debate whether the films' themes are suitable for children.
# disney_leader_reviews is not defined in the shown code; presumably it is
# all_reviews restricted to disney_leader_article_url (hence the join message above)
disney_leader_reviews_tokens <- disney_leader_reviews %>%
unnest_tokens(word, cmtContent, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
disney_leader_reviews_tokens$word <- all_synonym(disney_leader_reviews_tokens$word)
#disney_leader_reviews_tokens

# Term frequencies
disney_leader_reviews_tokens_count <- disney_leader_reviews_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
# Word cloud (commented out)
#disney_leader_reviews_tokens_count %>%
#filter(!(disney_leader_reviews_tokens_count$word %in% disney_clean_words)) %>%
#filter(sum > 10) %>%
#wordcloud2()

Comments on the Disney leaders' posts again mostly discuss Frozen (冰雪奇緣).
disney_leader_reviews_tokens %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "迪士尼意見領袖發文留言內容的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

Positive words include 喜歡 (like), 同意 (agree), 無憂無慮 (carefree), and 信任 (trust); negative words include 問題 (problems), 不好 (bad), 指責 (blame), and 批評 (criticism).
# pixar_leader_reviews is likewise not defined in the shown code; presumably
# all_reviews restricted to pixar_leader_article_url
pixar_leader_reviews_tokens <- pixar_leader_reviews %>%
unnest_tokens(word, cmtContent, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
pixar_leader_reviews_tokens$word <- all_synonym(pixar_leader_reviews_tokens$word)
pixar_leader_reviews_tokens
## # A tibble: 642 x 5
## artUrl cmtPoster cmtStatus `pixar_leader_posts_tokens$… word
## <chr> <chr> <chr> <chr> <chr>
## 1 https://www.ptt.cc/bb… minoru04 → https://www.ptt.cc/bbs/movi… 需要
## 2 https://www.ptt.cc/bb… minoru04 → https://www.ptt.cc/bbs/movi… 新聞片…
## 3 https://www.ptt.cc/bb… minoru04 → https://www.ptt.cc/bbs/movi… 稿費
## 4 https://www.ptt.cc/bb… s87269x 推 https://www.ptt.cc/bbs/movi… 藏成
## 5 https://www.ptt.cc/bb… s87269x 推 https://www.ptt.cc/bbs/movi… 發現
## 6 https://www.ptt.cc/bb… s87269x 推 https://www.ptt.cc/bbs/movi… 太難
## 7 https://www.ptt.cc/bb… toto3527 推 https://www.ptt.cc/bbs/movi… 實說
## 8 https://www.ptt.cc/bb… toto3527 推 https://www.ptt.cc/bbs/movi… 還好
## 9 https://www.ptt.cc/bb… toto3527 推 https://www.ptt.cc/bbs/movi… 劇情
## 10 https://www.ptt.cc/bb… toto3527 推 https://www.ptt.cc/bbs/movi… 太普通…
## # … with 632 more rows
# Term frequencies
pixar_leader_reviews_tokens_count <- pixar_leader_reviews_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 1) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
# Word cloud (commented out)
#pixar_leader_reviews_tokens_count %>%
#filter(!(pixar_leader_reviews_tokens_count$word %in% pixar_clean_words)) %>%
#filter(sum > 1) %>%
#wordcloud2()

The word cloud features a surprising number of vegetables, such as 花椰菜 (broccoli) and 苦瓜 (bitter melon). In Inside Out, Pixar localized certain shots for different markets: the broccoli Riley hates becomes green peppers in the Japanese release. The thread about this drew attention, with users chiming in about their own most-hated foods, which is an amusing phenomenon.
pixar_leader_reviews_tokens %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "皮克斯意見領袖發文留言內容的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

The comment sentiment reveals an interesting pattern: the posts lean positive while the comments lean negative.
# PTT comments come in three kinds: push (推), boo (噓), and neutral arrows (→);
# we only care about push/boo, so drop the arrows
link <- postsReviews %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>5) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color edges by comment status (push vs. boo)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# Draw the network
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 7, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)

Discussion of both Disney (迪士尼) and Pixar (皮克斯) is dominated by push (推) comments, indicating broadly positive reception of both studios' work.
postsReviews %>%
filter( cmtPoster %in% c("S890127","Punisher","chirex","Howard61313","lovemelissa","KingKingCold")) -> test
test %>% select(-artUrl,-artDate,-artTime,-artCat,-sentence) -> test
test %>% head()
## artTitle artPoster commentNum push boo
## 1 [選片]柯南、怪獸大學還是白宮末日 anif 31 20 0
## 2 [分享]《天外奇蹟》電影真正令人感動的是故事 originbook 49 33 2
## 3 Fw:[新聞]迪士尼連續4年海外票房破20億 ooic 8 5 0
## 4 Fw:[新聞]迪士尼連續4年海外票房破20億 ooic 8 5 0
## 5 Fw:[新聞]迪士尼連續4年海外票房破20億 ooic 8 5 0
## 6 [討論]迪士尼將推出「庫伊拉」真人電影 beckseaton 17 14 0
## cmtPoster cmtStatus cmtContent topic
## 1 S890127 推 :今年的柯南在粉絲間評價明明比前幾年都好... 7
## 2 S890127 噓 :是NEMO不是NIMO,天外奇蹟片頭不是倒敘法 5
## 3 S890127 推 :漫威+皮克斯,躺著賺啊 6
## 4 S890127 推 :飛機根本沒宣傳,我連預告片都沒在戲院看過 6
## 5 S890127 → :當初根本沒要上院線只打算出DVD的 6
## 6 S890127 推 :梅莉史翠普不錯 6
## gamma
## 1 0.8373285
## 2 0.9993013
## 3 0.9071989
## 4 0.9071989
## 5 0.9071989
## 6 0.9276916
# Within the linked subset
test %>% filter(cmtStatus == "推") %>% group_by(cmtPoster) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 2
## # Groups: cmtPoster [6]
## cmtPoster n
## <chr> <int>
## 1 Howard61313 315
## 2 KingKingCold 107
## 3 S890127 99
## 4 lovemelissa 89
## 5 Punisher 8
## 6 chirex 4
# Across all posts
postsReviews %>% filter(cmtStatus == "推")%>% group_by(cmtPoster) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 2
## # Groups: cmtPoster [6]
## cmtPoster n
## <chr> <int>
## 1 Howard61313 315
## 2 vvvvvvvvvvv 258
## 3 mysmalllamb 236
## 4 DemonElf 219
## 5 RapunzElsa 204
## 6 pattda 191
## artTitle artPoster commentNum push boo cmtPoster
## 1 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 2 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 3 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 4 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 5 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 6 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## cmtStatus cmtContent topic gamma
## 1 推 :算了明明有篇更好的普雷文值得我們回應或討論 7 0.8729672
## 2 推 :奇怪,又不是每篇給冰雪普雷的都言之無物...我就看到 7 0.8729672
## 3 推 :最好是啦....一篇文章自己的質量還會受閱讀者的影響喔 7 0.8729672
## 4 推 :一篇文章有沒有道理,才是閱讀者方的主觀意識 7 0.8729672
## 5 推 :OK,我是主觀的不認同別人的意見,但是用到了'錯'這個 7 0.8729672
## 6 推 :我現在發現我沒雅量的地方了,他還沒 7 0.8729672
# Within the linked subset
test %>% filter(cmtStatus == "噓") %>% group_by(cmtPoster) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 4 x 2
## # Groups: cmtPoster [4]
## cmtPoster n
## <chr> <int>
## 1 KingKingCold 6
## 2 chirex 4
## 3 Howard61313 2
## 4 S890127 2
# Across all posts
postsReviews %>% filter(cmtStatus == "噓") %>% group_by(cmtPoster) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 2
## # Groups: cmtPoster [6]
## cmtPoster n
## <chr> <int>
## 1 darkMood 56
## 2 toyamaK52 35
## 3 Alertme 34
## 4 PttGod 28
## 5 callcallABC 25
## 6 dreamtime09 25
## artTitle artPoster commentNum push
## 1 [討論]動物方城市字幕改編團隊的回應 S890127 671 233
## 2 Re:[討論]迪士尼要跟康卡斯特拼了,提713億買福斯 avdcd 124 44
## 3 [新聞]小S首度偕3女兒為【海底總動員2】配音 c84925025 111 21
## 4 [新聞]小S首度偕3女兒為【海底總動員2】配音 c84925025 111 21
## 5 Re:[討論]動物方城市字幕改編團隊的回應 pingtung5566 98 22
## 6 [討論]《冰雪奇緣2》艾莎為何被…?/樂高海報 CYKONGG 87 20
## boo cmtPoster cmtStatus cmtContent
## 1 33 chirex 噓 :冰的啦我也覺得翻得有夠爛,原來是辛普森家庭的翻譯
## 2 1 KingKingCold 噓 :樓下快靠北萬惡米老鼠壟斷電影產業
## 3 47 KingKingCold 噓 :天龍人總動員
## 4 47 KingKingCold 噓 :天龍人總動員
## 5 12 chirex 噓 :我沒有笑,謝謝。在地化跟竄改是兩回事
## 6 7 chirex 噓 :被X是什麼意思?
## topic gamma
## 1 9 0.9994406
## 2 6 0.9988877
## 3 10 0.9992542
## 4 10 0.9992542
## 5 9 0.9983714
## 6 7 0.6837956
Pulling out the heavily linked users "S890127", "Punisher", "chirex", "Howard61313", "lovemelissa", and "KingKingCold": by push count the ordering is Howard61313 > KingKingCold > S890127 > lovemelissa, and their pushes mostly express agreement with, or discussion of, box office, songs, and plot.
Boos are far rarer (single digits), ordered KingKingCold > chirex > Howard61313 > S890127, and mostly express disagreement over behind-the-scenes work, translation, and dubbing. Boos do occur overall, but presumably because they are so outnumbered by pushes, or too scattered, they form only weak links in the network.
# All movie-board posts (all films, past year)
all_movie_posts <- read_csv("期末_movie版_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# All movie-board comments (all films, past year)
all_movie_reviews <- read_csv("期末_movie版_articleReviews.csv")
all_movie_posts$artDate <- all_movie_posts$artDate %>% as.Date("%Y/%m/%d")
all_movie_reviews$artDate <- all_movie_reviews$artDate %>% as.Date("%Y/%m/%d")

# Select the needed columns
all_movie_reviews <- all_movie_reviews %>% select(artUrl, cmtPoster, cmtStatus, cmtContent)
# All participants
all_movie_poster <- c(all_movie_posts$artPoster, all_movie_reviews$cmtPoster)
# Collect every user who appears;
# label users who have posted as "poster"
# and everyone else as "replyer"
userList_movie <- data.frame(user=unique(all_movie_poster)) %>%
mutate(type=ifelse(user%in%all_movie_posts$artPoster, "poster", "replyer"))

## Build the social network graph
# The full data set is too large; since the aim is to find users who are both
# strongly connected and popular around Disney/Pixar over the past year, we
# filter on (commentNum >= 400 & push >= 150 & boo >= 5)
all_movie_posts %>% filter(commentNum >= 400 & push >= 150 & boo >= 5) -> all_movie_posts_sel
postsReviews_movie <- merge(x = all_movie_posts_sel, y = all_movie_reviews, by = "artUrl")
#postsReviews
# Keep three columns: cmtPoster (commenter), artPoster (author), artUrl (article URL)
link_movie <- postsReviews_movie %>%
select(cmtPoster, artPoster, artUrl)
# Build the network
reviewNetwork_movie <- graph_from_data_frame(d=link_movie, directed=T)
# Draw the network graph
#plot(reviewNetwork_movie)
# Shrink the nodes and edges and hide user labels
#plot(reviewNetwork_movie, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)

# PTT comments come in three kinds: push (推), boo (噓), and neutral arrows (→);
# we only care about push/boo, so drop the arrows
link_movie <- postsReviews_movie %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter(n() > 5) %>%
ungroup() %>%
filter(commentNum >= 400 & push >= 150 & boo >= 5) %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>% unique()
# Keep only users who appear in link
filtered_user_movie <- userList_movie %>%
filter(user%in%link_movie$cmtPoster | user%in%link_movie$artPoster) %>%
arrange(desc(type))
# Build the network
reviewNetwork_movie <- graph_from_data_frame(d=link_movie, v=filtered_user_movie, directed=F)
# Color nodes by user role
labels <- degree(reviewNetwork_movie)
V(reviewNetwork_movie)$label <- names(labels)
V(reviewNetwork_movie)$color <- ifelse(V(reviewNetwork_movie)$type=="poster", "gold", "lightblue")
# Color edges by comment status (push vs. boo)
E(reviewNetwork_movie)$color <- ifelse(E(reviewNetwork_movie)$cmtStatus == "推", "lightgreen", "palevioletred")
# Draw the network
set.seed(5432)
plot(reviewNetwork_movie, vertex.size=2, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork_movie) > 7, V(reviewNetwork_movie)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)

Because the full data set is too large, and the aim is to find users who are strongly connected and popular around Disney and Pixar relative to the past year, we applied the filter (commentNum >= 400 & push >= 150 & boo >= 5) before plotting.
Over the past year, the users active in Disney and Pixar threads clearly posted and commented on individual films rather infrequently.
postsReviews_movie <- postsReviews_movie %>%
filter(cmtStatus!="→") %>%
filter(artPoster %in% c("S890127","Punisher","chirex","Howard61313","lovemelissa","KingKingCold")) %>% select(artPoster,artTitle,commentNum,push,boo,cmtPoster,cmtStatus,cmtContent) %>% arrange(desc(push))

A closer look shows that among these users, KingKingCold is the only one whose own posts draw substantial discussion.
postsReviews_movie <- postsReviews_movie %>%
filter(cmtStatus!="→") %>%
filter( cmtPoster %in% c("S890127","Punisher","chirex","Howard61313","lovemelissa","KingKingCold")) %>% select(cmtPoster,cmtStatus,cmtContent,artTitle,commentNum,push,boo,artPoster) %>% arrange(desc(push))

As commenters, Punisher, KingKingCold, and chirex appear only rarely, but they do still comment on posts about other films on the movie board.
# Add a column marking each post as disney or pixar
is_disney_posts <- disney_posts %>%
mutate(title = "disney")
is_pixar_posts <- pixar_posts %>%
mutate(title = "pixar")
is_posts <- rbind(is_disney_posts,is_pixar_posts)

tidy_is_disney <- is_posts %>%
unnest_tokens(word, sentence) %>%
group_by(word) %>%
anti_join(stop_words) %>%
filter(n() > 10) %>% # keep only words appearing more than 10 times
ungroup()
## Joining, by = "word"
# Cast into a document-term matrix
# (the lemma column is not created in the shown code; presumably the tokens
# were lemmatized earlier, e.g. with textstem)
dtm = tidy_is_disney %>%
count(artUrl,lemma) %>%
cast_dtm(artUrl,lemma,n)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 14/86
## Sparsity : 86%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 安 般 半 伴 包容 包裝 保
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 1 1 1 1 1 1 2
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0 0 0 0 1 0 0
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0 0 0 0 1 0 0
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0 1 0 0 0 0 0
## Terms
## Docs 保護 保守 悲傷
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 1 1 1
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0 1 0
## Warning in as.integer(rownames(dtm)) %in% disney: NAs introduced by coercion

## Compute per-term TF-IDF, treating each artUrl as one document
document_words <- tidy_is_disney %>% # term frequency per document
count(artUrl, lemma, sort = TRUE)
total_words <- document_words %>%
group_by(artUrl) %>%
summarize(total = sum(n)) %>%
right_join(document_words) %>%
mutate(is_disney = artUrl %in% disney) # `disney` (a vector of Disney article URLs) comes from code not shown
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "artUrl"
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 75/25
## Sparsity : 25%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 但 的
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.0064325021 0.003217620
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.0066720485 0.002387637
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.0037513920 0.002850147
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.0000000000 0.004230182
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.0026795657 0.004602722
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.0027970905 0.004527408
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.0018812290 0.003790692
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.0062831196 0.003528347
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.0019252427 0.004229160
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.0005395403 0.005168552
## Terms
## Docs 電影 昆
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.0050997145 0.04946648
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.0013224070 0.00000000
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.0044611788 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.0050763079 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.0000000000 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.0055438626 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.0014914462 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.0149438503 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.0007631702 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.0059884973 0.00000000
## Terms
## Docs 了 裡
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.003635134 0.0185995042
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.001099731 0.0067522525
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.001854988 0.0037964870
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.001266457 0.0025919791
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.004858301 0.0000000000
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.004149314 0.0000000000
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.004651149 0.0000000000
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.003106876 0.0063586482
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.002617982 0.0009741929
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.006669795 0.0043682084
## Terms
## Docs 他 與
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.015984792 0.015912509
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.007033967 0.003703065
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.023729296 0.004996960
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.000000000 0.006823158
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.005649832 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.002211612 0.014903214
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.000000000 0.005011700
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.000000000 0.004184646
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.009133540 0.006411194
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.004266054 0.005030780
## Terms
## Docs 在 car
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.006561669 0.043199324
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.002268675 0.003267255
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.003826721 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.003265776 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.001822248 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.003566571 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.002878507 0.005527347
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.003204643 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.003436829 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.006054119 0.000000000
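The data frames dtm and dtm.tfidf used for model training below are not constructed in the code shown. A sketch of what they presumably look like, built from document_words (the wide layout with an is_disney column is an assumption, as is the `disney` URL vector):

# A sketch (assumed, not from the original): wide document-term data frames
# with a 0/1 is_disney label, as consumed by sample.split() below
dtm <- document_words %>%
mutate(is_disney = as.numeric(artUrl %in% disney)) %>% # `disney`: Disney article URLs (assumed)
pivot_wider(names_from = lemma, values_from = n, values_fill = 0)

dtm.tfidf <- document_words %>%
bind_tf_idf(lemma, artUrl, n) %>% # tidytext TF-IDF weighting
mutate(is_disney = as.numeric(artUrl %in% disney)) %>%
select(artUrl, lemma, tf_idf, is_disney) %>%
pivot_wider(names_from = lemma, values_from = tf_idf, values_fill = 0)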
Split the data into training and test sets.
set.seed(123)
spl = sample.split(dtm$is_disney, 0.7) # 70/30 split, preserving the is_disney class ratio
TR = subset(dtm, spl == TRUE)
TS = subset(dtm, spl == FALSE)

set.seed(123)
spl = sample.split(dtm.tfidf$is_disney, 0.7)
TR.tfidf = subset(dtm.tfidf, spl == TRUE)
TS.tfidf = subset(dtm.tfidf, spl == FALSE)

# Fit an SVM with dtm.tfidf as input
svm.fit.tfidf = svm(is_disney~.,TR.tfidf,kernel = "linear",cost = 10,scale = F)
p.svm.tfidf = pred = predict(svm.fit.tfidf,TS.tfidf)
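The call that produced the confusion matrix below is not shown in the original; presumably caret's confusionMatrix on the held-out labels, along these lines (an assumption):

confusionMatrix(p.svm.tfidf, factor(TS.tfidf$is_disney), positive = "1") # presumed call, not in the original
## Confusion Matrix and Statistics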
##
## Reference
## Prediction 0 1
## 0 206 5
## 1 6 242
##
## Accuracy : 0.976
## 95% CI : (0.9575, 0.988)
## No Information Rate : 0.5381
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9518
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9798
## Specificity : 0.9717
## Pos Pred Value : 0.9758
## Neg Pred Value : 0.9763
## Prevalence : 0.5381
## Detection Rate : 0.5272
## Detection Prevalence : 0.5403
## Balanced Accuracy : 0.9757
##
## 'Positive' Class : 1
##
Accuracy: 0.976, Sensitivity: 0.9798, Specificity: 0.9717
Decision-tree visualization
With high-dimensional data such as these document-term features, no single feature cleanly separates the two classes, so the decision tree performs noticeably worse than the SVM; its advantage is that it is easy to interpret.
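The tree-fitting code itself is not included above; a minimal sketch of what it presumably looked like, using the loaded rpart, rpart.plot, and caret packages on the same tf-idf split (the exact call is an assumption):

# A sketch (assumed): fit and visualize a CART tree on the TF-IDF features
tree.fit.tfidf <- rpart(is_disney ~ ., TR.tfidf, method = "class")
rpart.plot(tree.fit.tfidf) # draw the fitted tree
p.tree.tfidf <- predict(tree.fit.tfidf, TS.tfidf, type = "class")
confusionMatrix(p.tree.tfidf, factor(TS.tfidf$is_disney), positive = "1")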
Accuracy: 0.9521, Sensitivity: 0.9838, Specificity: 0.9151
Beyond their distinct visual styles, the films Disney and Pixar release convey different messages, and even films from the same studio differ in theme. Discussions therefore range beyond the protagonists to the varied messages each film carries. It is also an interesting finding that discussions of Pixar tend to go deeper than those of Disney.