Many classic animated films come from two studios, Disney (迪士尼) and Pixar (皮克斯), whose signature works differ markedly in style. Here we compare the two through how much discussion their films generate on social media.
The raw CSV files of PTT articles and comments were obtained from the NSYSU College of Management text-analysis platform.
Through the platform we retrieved PTT data using Pixar-related keywords such as 『皮克斯』, 『Pixar』, and 『玩具總動員』 (Toy Story), and Disney-related keywords such as 『迪士尼』 and 『冰雪奇緣』 (Frozen). This yielded 711 Pixar-related articles and 822 Disney-related articles.
# Install the required packages (RColorBrewer added, since it is loaded below)
packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr","wordcloud2","scales","tm","reshape2","widyr","ggraph","tidyverse","RColorBrewer")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

# Load packages
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(scales)
library(tm)
library(reshape2)
library(widyr)
library(ggraph)
library(tidyverse)
library(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

# Read the data
# Disney posts
disney_posts <- read_csv("期末_迪士尼_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# Disney comments
disney_reviews <- read_csv("期末_迪士尼_articleReviews.csv")
# Pixar posts
pixar_posts <- read_csv("期末_皮克斯_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# Pixar comments
pixar_reviews <- read_csv("期末_皮克斯_articleReviews.csv")
# All posts
all_posts <- read_csv("all_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# All comments
all_reviews <- read_csv("all_articleReviews.csv")

# Convert the date column to Date
disney_posts$artDate <- disney_posts$artDate %>% as.Date("%Y/%m/%d")
disney_reviews$artDate <- disney_reviews$artDate %>% as.Date("%Y/%m/%d")
pixar_posts$artDate <- pixar_posts$artDate %>% as.Date("%Y/%m/%d")
pixar_reviews$artDate <- pixar_reviews$artDate %>% as.Date("%Y/%m/%d")
all_posts$artDate <- all_posts$artDate %>% as.Date("%Y/%m/%d")
all_reviews$artDate <- all_reviews$artDate %>% as.Date("%Y/%m/%d")

# Split Disney posts into sentences
disney_posts_sentences <- strsplit(disney_posts$sentence,"[.。!;?!?;]")
disney_posts_sentences <- data.frame(
artUrl = rep(disney_posts$artUrl, sapply(disney_posts_sentences, length)),
sentence = unlist(disney_posts_sentences),
artDate = rep(disney_posts$artDate, sapply(disney_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
disney_posts_sentences$sentence <- as.character(disney_posts_sentences$sentence)
# Split Pixar posts into sentences
pixar_posts_sentences <- strsplit(pixar_posts$sentence,"[.。!;?!?;]")
pixar_posts_sentences <- data.frame(
artUrl = rep(pixar_posts$artUrl, sapply(pixar_posts_sentences, length)),
sentence = unlist(pixar_posts_sentences),
artDate = rep(pixar_posts$artDate, sapply(pixar_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
pixar_posts_sentences$sentence <- as.character(pixar_posts_sentences$sentence)
# Split all posts into sentences
all_posts_sentences <- strsplit(all_posts$sentence, "[.。!;?!?;]")
all_posts_sentences <- data.frame(
artUrl = rep(all_posts$artUrl, sapply(all_posts_sentences, length)),
sentence = unlist(all_posts_sentences),
artDate = rep(all_posts$artDate, sapply(all_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
all_posts_sentences$sentence <- as.character(all_posts_sentences$sentence)

# Initialize a jieba segmentation engine,
# using our user dictionary and stop-word list
jieba_tokenizer = worker(user="all_dict/user_dict.txt", stop_word="all_dict/stop_words.txt")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# drop single-character tokens
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
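A quick way to confirm the tokenizer behaves as intended is to run it on a sample sentence (a minimal sketch; the sentence is made up and the exact tokens depend on the user dictionary):

chi_tokenizer("艾莎與安娜是冰雪奇緣的主角")
## e.g. [[1]] "艾莎" "安娜" "冰雪奇緣" "主角"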
# Tokenize each corpus and drop tokens containing digits or Latin letters
disney_posts_tokens <- disney_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
pixar_posts_tokens <- pixar_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
all_posts_tokens <- all_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))

# Disney: merge aliases into canonical tokens
# (note: tokens containing Latin letters were already filtered out above,
# so the English patterns below only matter if that filter is skipped)
disney_synonym <- function(i)
{
gsub("disney", "迪士尼", i) %>%
gsub("冰雪", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣2", "冰雪奇緣2", .) %>%
gsub("冰雪2", "冰雪奇緣2", .) %>%
gsub("frozen", "冰雪奇緣", .) %>%
gsub("frozen2", "冰雪奇緣2", .) %>%
gsub("Tangled", "魔法公主", .) %>%
gsub("WreckItRalph", "無敵破壞王", .) %>%
gsub("BigHero", "大英雄天團", .) %>%
gsub("Zootopia", "動物方程式", .) %>%
gsub("Moana", "海洋奇緣", .) %>%
gsub("RalphBreakstheInternet", "無敵破壞王2", .) %>%
gsub("Rapunzel", "樂佩", .) %>% #魔法公主女主角
gsub("elsa", "艾莎", .) %>% #冰雪奇緣女主角
gsub("anna", "安娜", .) %>% #冰雪奇緣女主角
gsub("Ralph", "雷夫", .) %>% #無敵破壞王主角
gsub("Vanellope", "雲妮露", .) %>% #無敵破壞王主角
gsub("Felix", "阿修", .) %>% #無敵破壞王主角
gsub("Hiro", "阿廣", .) %>% #大英雄天團主角
gsub("Baymax", "杯麵", .) %>% #大英雄天團主角
gsub("Tadashi", "阿正", .) %>% #大英雄天團主角
gsub("Judy", "哈茱蒂", .) %>% #動物方程式主角
gsub("艾爾莎", "艾莎", .)
}
## Apply the mapping to the tokenized posts
disney_posts_tokens$word <- disney_synonym(disney_posts_tokens$word)
# Pixar: merge aliases into canonical tokens
pixar_synonym <- function(i)
{
gsub("woody", "胡迪", i) %>%
gsub("dory", "多莉", .) %>%
gsub("toy story", "玩具總動員", .)
}
## Apply the mapping to the tokenized posts
pixar_posts_tokens$word <- pixar_synonym(pixar_posts_tokens$word)
# All posts: merge aliases (union of the two maps above)
all_synonym <- function(i)
{
gsub("disney", "迪士尼", i) %>%
gsub("冰雪", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣2", "冰雪奇緣2", .) %>%
gsub("冰雪2", "冰雪奇緣2", .) %>%
gsub("frozen", "冰雪奇緣", .) %>%
gsub("frozen2", "冰雪奇緣2", .) %>%
gsub("Tangled", "魔法公主", .) %>%
gsub("WreckItRalph", "無敵破壞王", .) %>%
gsub("BigHero", "大英雄天團", .) %>%
gsub("Zootopia", "動物方程式", .) %>%
gsub("Moana", "海洋奇緣", .) %>%
gsub("RalphBreakstheInternet", "無敵破壞王2", .) %>%
gsub("Rapunzel", "樂佩", .) %>% #魔法公主女主角
gsub("elsa", "艾莎", .) %>% #冰雪奇緣女主角
gsub("anna", "安娜", .) %>% #冰雪奇緣女主角
gsub("Ralph", "雷夫", .) %>% #無敵破壞王主角
gsub("Vanellope", "雲妮露", .) %>% #無敵破壞王主角
gsub("Felix", "阿修", .) %>% #無敵破壞王主角
gsub("Hiro", "阿廣", .) %>% #大英雄天團主角
gsub("Baymax", "杯麵", .) %>% #大英雄天團主角
gsub("Tadashi", "阿正", .) %>% #大英雄天團主角
gsub("Judy", "哈茱蒂", .) %>% #動物方程式主角
gsub("艾爾莎", "艾莎", .) %>%
gsub("woody", "胡迪", .) %>%
gsub("dory", "多莉", .) %>%
gsub("toy story", "玩具總動員", .)
}
# Apply the mapping to the tokenized posts
all_posts_tokens$word <- all_synonym(all_posts_tokens$word)

# Plot the number of posts per day
all_posts %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot(aes(artDate,count)) +
geom_line(color="skyblue", size=1.2)+
scale_x_date(labels = date_format("%Y-%m-%d"))+
theme_classic() -> p
p + geom_vline(xintercept = as.Date('2014-12-31'),color = "red", linetype = "dashed",size=0.5)+ # Big Hero 6
geom_vline(xintercept = as.Date('2019-11-21'),color = "red", linetype = "dashed",size=0.5)+ # Frozen 2
geom_vline(xintercept = as.Date('2015-08-07'),color = "darkgreen", linetype = "dashed",size=0.5)+ # Inside Out
geom_vline(xintercept = as.Date('2019-06-20'),color = "darkgreen", linetype = "dashed",size=0.5) # Toy Story 4

The plot shows sharp peaks and troughs: discussion surges while a film is in theaters and largely dies off once it leaves. We therefore focus on four films: Disney's Big Hero 6 (大英雄天團) and Frozen 2 (冰雪奇緣2), and Pixar's Inside Out (腦筋急轉彎) and Toy Story 4 (玩具總動員4).
Red dashed lines mark the Disney releases; green dashed lines mark the Pixar releases.
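To read off the exact dates behind the spikes, the daily post counts can be tallied directly; a minimal sketch, whose top rows should cluster around the four release dates marked above:

# Busiest discussion days
all_posts %>%
count(artDate, sort = TRUE) %>%
head(10)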
# Total term frequency across all articles
disney_posts_tokens_count <- disney_posts_tokens %>%
filter(nchar(.$word)>1) %>% # exclude single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 30) %>% # drop words that appear too rarely
arrange(desc(sum))
#head(disney_posts_tokens_count)
pixar_posts_tokens_count <- pixar_posts_tokens %>%
filter(nchar(.$word)>1) %>% # exclude single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 30) %>% # drop words that appear too rarely
arrange(desc(sum))
#disney_clean_words <- c("迪士尼","電影","動畫")
#disney_posts_tokens_count %>%
#filter(!(disney_posts_tokens_count$word %in% disney_clean_words)) %>%
#filter(sum > 100) #%>%
#wordcloud2()

Among Disney-related posts, Frozen (冰雪奇緣) draws the most discussion, and its two princesses, Elsa (艾莎) and Anna (安娜), are the most-discussed characters. Most other frequent terms are film titles and major character names. The word 歌曲 ("song") also appears often, reflecting the popularity of Disney's soundtracks.
#pixar_clean_words <- c("皮克斯","電影","動畫")
#pixar_posts_tokens_count %>%
#filter(!(pixar_posts_tokens_count$word %in% pixar_clean_words)) %>%
#filter(sum > 100) #%>%
#wordcloud2()

Among Pixar-related posts, Toy Story (玩具總動員) draws the most discussion, with Woody (胡迪) the most-discussed character. Most other frequent terms are again film titles and major character names.
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
elsa_posts_ngram <- disney_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
elsa_posts_ngram_separated <- elsa_posts_ngram %>%
separate(ngram, paste0("word", 1:11), sep = " ")
#head(elsa_posts_ngram_separated)
elsa_posts_check_words <- elsa_posts_ngram_separated %>%
filter((word6 == "艾莎"))
#elsa_posts_check_words
elsa_posts_check_words_count <- elsa_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word=value) %>%
filter(variable!="word6") %>% # drop the centre word itself
filter(!(word %in% stop_words$word), nchar(word)>1) %>% # tidytext's stop_words list
count(word, sort = TRUE)
elsa_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「艾莎」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Princess Anna (安娜), Elsa's younger sister, tops the list; other words such as 魔法 ("magic") and 公主 ("princess") are also strongly associated with Elsa.
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
baymax_posts_ngram <- disney_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
baymax_posts_ngram_separated <- baymax_posts_ngram %>%
separate(ngram, paste0("word", 1:11), sep = " ")
#head(baymax_posts_ngram_separated)
baymax_posts_check_words <- baymax_posts_ngram_separated %>%
filter((word6 == "杯麵"))
#baymax_posts_check_words
baymax_posts_check_words_count <- baymax_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word=value) %>%
filter(variable!="word6") %>% # drop the centre word itself
filter(!(word %in% stop_words$word), nchar(word)>1) %>% # tidytext's stop_words list
count(word, sort = TRUE)
baymax_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「杯麵」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Baymax (杯麵) is the 機器人 ("robot") invented by 阿正 (Tadashi), older brother of the protagonist 阿廣 (Hiro); nearby words such as 可愛 ("cute") and 照顧 ("care") capture his character.
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
woody_posts_ngram <- pixar_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
woody_posts_ngram_separated <- woody_posts_ngram %>%
separate(ngram, paste0("word", 1:11), sep = " ")
#head(woody_posts_ngram_separated)
woody_posts_check_words <- woody_posts_ngram_separated %>%
filter((word6 == "胡迪"))
#woody_posts_check_words
woody_posts_check_words_count <- woody_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word=value) %>%
filter(variable!="word6") %>% # drop the centre word itself
filter(!(word %in% stop_words$word), nchar(word)>1) %>% # tidytext's stop_words list
count(word, sort = TRUE)
woody_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「胡迪」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Woody (胡迪) is one of 安弟's (Andy's) 玩具 ("toys"); other nearby words name his companions (夥伴), such as 巴斯 (Buzz) and 牧羊女 (Bo Peep).
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
happy_posts_ngram <- pixar_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
happy_posts_ngram_separated <- happy_posts_ngram %>%
separate(ngram, paste0("word", 1:11), sep = " ")
#head(happy_posts_ngram_separated)
happy_posts_check_words <- happy_posts_ngram_separated %>%
filter((word6 == "樂樂"))
#happy_posts_check_words
happy_posts_check_words_count <- happy_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word=value) %>%
filter(variable!="word6") %>% # drop the centre word itself
filter(!(word %in% stop_words$word), nchar(word)>1) %>% # tidytext's stop_words list
count(word, sort = TRUE)
happy_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「樂樂」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Joy (樂樂) is one of the 情緒 ("emotion") characters inside 萊莉's (Riley's) mind, and Sadness (憂憂) is her opposite.
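The four blocks above repeat one pipeline with a different corpus and centre word. A reusable helper keeps them consistent; a sketch under the same assumptions (ngram_11, jieba_tokenizer, and tidytext's stop_words in scope):

# Count the words appearing within 5 tokens on either side of `target`
neighbor_word_counts <- function(posts, target) {
  posts %>%
    select(artUrl, sentence) %>%
    unnest_tokens(ngram, sentence, token = ngram_11) %>%
    filter(!str_detect(ngram, regex("[0-9a-zA-Z]"))) %>%
    separate(ngram, paste0("word", 1:11), sep = " ") %>%
    filter(word6 == target) %>% # word6 is the centre of each 11-gram
    melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
    rename(word = value) %>%
    filter(variable != "word6") %>%
    filter(!(word %in% stop_words$word), nchar(word) > 1) %>%
    count(word, sort = TRUE)
}
# e.g. neighbor_word_counts(pixar_posts, "樂樂") reproduces the counts above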
# Disney
# The earlier tokenization did not use the added dictionary,
# so we re-tokenize and count each word's occurrences in each article
disney_posts_words <- disney_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
# Count how often two words co-occur in the same article
disney_word_pairs <- disney_posts_words %>%
pairwise_count(word, artUrl, sort = TRUE)
# Compute pairwise correlations between words
disney_word_cors <- disney_posts_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
# Plot combinations with correlation above 0.5
set.seed(2020)
disney_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE,family='STHeitiTC-Light') + # set a Chinese font so labels render correctly
theme_void()

Posts sharing viewing impressions (觀影心得) also talk about whether a post is a spoiler (雷文) and whether it needs a spoiler divider, while the Big Hero 6 (大英雄天團) cluster centres on words such as robot, Baymax (杯麵), and invention.
# Pixar
# The earlier tokenization did not use the added dictionary,
# so we re-tokenize and count each word's occurrences in each article
pixar_posts_words <- pixar_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
# Count how often two words co-occur in the same article
pixar_word_pairs <- pixar_posts_words %>%
pairwise_count(word, artUrl, sort = TRUE)
# Compute pairwise correlations between words
pixar_word_cors <- pixar_posts_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
# Plot combinations with correlation above 0.5
set.seed(2020)
pixar_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE,family='STHeitiTC-Light') + # set a Chinese font so labels render correctly
theme_void()

Inside Out (腦筋急轉彎) discussion links the emotion characters with words about the brain, such as memory and the subconscious, while Toy Story (玩具總動員) discussion is often about spoilers, hence words about spoiler dividers.
# Positive-word dictionary (comma-separated txt file)
P <- read_file("liwc/positive.txt")
# Negative-word dictionary (comma-separated txt file)
N <- read_file("liwc/negative.txt")
# Split each string on commas;
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build a data frame with two columns, word and sentiment,
# where word holds the dictionary entries
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
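A quick look at the assembled dictionary (a sketch; the counts depend on the txt files):

table(LIWC$sentiment) # number of positive vs. negative entries
head(LIWC)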
# Join the corpus tokens against the LIWC dictionary to label each
# matched word as positive or negative
### Count word frequencies
disney_posts_tokens <- disney_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date <- disney_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
disney_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 199 x 2
## word sentiment
## <chr> <fct>
## 1 破壞 negative
## 2 破壞 negative
## 3 破壞 negative
## 4 破壞 negative
## 5 遊戲 positive
## 6 情緒 negative
## 7 勇氣 positive
## 8 遊戲 positive
## 9 遊戲 positive
## 10 破壞 negative
## # … with 189 more rows
pixar_posts_tokens <- pixar_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date <- pixar_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
pixar_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 237 x 2
## word sentiment
## <chr> <fct>
## 1 寶貝 positive
## 2 解決 positive
## 3 情緒 negative
## 4 情緒 negative
## 5 驚嚇 negative
## 6 情緒 negative
## 7 驚嚇 negative
## 8 情緒 negative
## 9 情緒 negative
## 10 情緒 negative
## # … with 227 more rows
all_posts_tokens <- all_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
all_tokens_by_date <- all_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
all_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 490 x 2
## word sentiment
## <chr> <fct>
## 1 寶貝 positive
## 2 解決 positive
## 3 破壞 negative
## 4 情緒 negative
## 5 情緒 negative
## 6 驚嚇 negative
## 7 遊戲 positive
## 8 情緒 negative
## 9 破壞 negative
## 10 驚嚇 negative
## # … with 480 more rows
# Count each day's positive and negative words
disney_sentiment_count = disney_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
pixar_sentiment_count = pixar_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
all_sentiment_count = all_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
all_sentiment_count %>%
ggplot() +
geom_line(aes(x=artDate,y=count,colour=sentiment)) +
scale_x_date(labels = date_format("%Y/%m/%d")) -> p
p + geom_vline(xintercept = as.Date('2014-12-31'),color = "red", linetype = "dashed",size=0.2)+ # Big Hero 6
geom_vline(xintercept = as.Date('2019-11-21'),color = "red", linetype = "dashed",size=0.2)+ # Frozen 2
geom_vline(xintercept = as.Date('2015-08-07'),color = "darkgreen", linetype = "dashed",size=0.2)+ # Inside Out
geom_vline(xintercept = as.Date('2019-06-20'),color = "darkgreen", linetype = "dashed",size=0.2) # Toy Story 4

Red dashed lines mark the Disney releases and green the Pixar releases. Toy Story 4 (玩具總動員4) shows a strikingly large spike in positive words, suggesting it was received as an especially positive film; the other three films show no clear difference between positive and negative counts.
disney_posts_select_frozen <- disney_posts %>%
filter(str_detect(disney_posts$artTitle, "冰雪奇緣2") | str_detect(disney_posts$artTitle, "frozen2"))
# Count word frequencies
disney_posts_tokens_frozen <- disney_posts_select_frozen %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date_frozen <- disney_posts_tokens_frozen %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count each day's positive and negative words
sentiment_count_frozen = disney_tokens_by_date_frozen %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
disney_posts_tokens_frozen %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『冰雪奇緣2』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

For Frozen 2 (冰雪奇緣2), frequent positive words include 喜歡 ("like"), 相信 ("trust"), 冒險 ("adventure"), and 愛情 ("love"); frequent negative words include 失去 ("loss"), 恐懼 ("fear"), and 害怕 ("afraid").
disney_posts_select_hero <- disney_posts %>%
filter(str_detect(disney_posts$artTitle, "大英雄天團") | str_detect(disney_posts$artTitle, "big hero 6"))
# Count word frequencies
disney_posts_tokens_hero <- disney_posts_select_hero %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date_hero <- disney_posts_tokens_hero %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count each day's positive and negative words
sentiment_count_hero = disney_tokens_by_date_hero %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
disney_posts_tokens_hero %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『大英雄天團』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

For Big Hero 6 (大英雄天團), frequent positive words include 英雄 ("hero"), 溫暖 ("warmth"), 溫馨 ("heart-warming"), and 分享 ("sharing"); frequent negative words include 反派 ("villain"), 復仇 ("revenge"), 戰鬥 ("fighting"), and 失去 ("loss").
pixar_posts_select_toy <- pixar_posts %>%
filter(str_detect(pixar_posts$artTitle, "玩具總動員4") | str_detect(pixar_posts$artTitle, "toy4"))
# Count word frequencies
pixar_posts_tokens_toy <- pixar_posts_select_toy %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date_toy <- pixar_posts_tokens_toy %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count each day's positive and negative words
sentiment_count_toy = pixar_tokens_by_date_toy %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
pixar_posts_tokens_toy %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『玩具總動員4』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

For Toy Story 4 (玩具總動員4), frequent positive words include 自由 ("freedom"), 朋友 ("friends"), 重要 ("important"), and 冒險 ("adventure"); frequent negative words include 失去 ("loss"), 放下 ("letting go"), and 反派 ("villain").
pixar_posts_select_brain <- pixar_posts %>%
filter(str_detect(pixar_posts$artTitle, "腦筋急轉彎") | str_detect(pixar_posts$artTitle, "inside out"))
# Count word frequencies
pixar_posts_tokens_brain <- pixar_posts_select_brain %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date_brain <- pixar_posts_tokens_brain %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count each day's positive and negative words
sentiment_count_brain = pixar_tokens_by_date_brain %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
pixar_posts_tokens_brain %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『腦筋急轉彎』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

For Inside Out (腦筋急轉彎), frequent positive words include 希望 ("hope"), 冒險 ("adventure"), and 相信 ("trust"); frequent negative words include 憂傷 ("sorrow"), 難過 ("sadness"), and 控制 ("control").
# Convert the tokens to a Document-Term Matrix (DTM)
#all_posts_tokens <- all_posts %>%
#unnest_tokens(word, sentence, token=chi_tokenizer) %>%
#filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
#count(artUrl, word) %>%
#rename(count=n)
#all_posts_dtm <- all_posts_tokens %>% cast_dtm(artUrl, word, count)
#inspect(all_posts_dtm[1:10,1:10])

# Fit LDA models for several candidate topic counts
# (slow, so the results are saved to disk and this block is left commented out)
#ldas_allPosts = c()
#topics = c(3, 5, 10, 25, 36)
#for(topic in topics){
#start_time <- Sys.time()
#lda_allPosts <- LDA(all_posts_dtm, k = topic, control = list(seed = 2020))
#ldas_allPosts = c(ldas_allPosts, lda_allPosts)
#print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
#save(ldas_allPosts, file = "ldas_result_allPosts")
#}

# Compare the models by perplexity
#topics = c(3, 5, 10, 25, 36)
#data_frame(k = topics,
#           perplex = map_dbl(ldas_allPosts, topicmodels::perplexity)) %>%
#ggplot(aes(k, perplex)) +
#geom_point() +
#geom_line() +
#labs(title = "Evaluating LDA topic models",
#subtitle = "Optimal number of topics (smaller is better)",
#x = "Number of topics",
#y = "Perplexity")

Using perplexity to pick the number of topics, we settle on 10 topics for the remaining analysis.
new_lda_allPosts = ldas_allPosts[[3]] ## the k = 10 model (third entry of topics)
topics_allPosts <- tidy(new_lda_allPosts, matrix = "beta") # "beta" extracts the Phi (per-topic word distribution) matrix
remove_words = c("冰雪奇緣", "迪士尼", "艾莎", "安娜", "無敵破壞王", "電影", "劇情", "動畫", "這部", "角色", "公主", "我們", "網路", "世界")
top_terms_allPosts <- topics_allPosts %>%
filter(!term %in% remove_words)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_allPosts %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()+
theme(text = element_text(family='STHeitiTC-Light'))

Topic 1 - Pixar
Topic 2 - Toy Story
Topic 3 - Disney
Topic 4 - Frozen
Topic 5 - Inside Out
Topic 6 - Box office
Topic 7 - Coco
Topic 8 - Big Hero 6
Topic 9 - Zootopia
Topic 10 - Wreck-It Ralph

We take Topic 1 (Pixar), Topic 3 (Disney), and Topic 6 (box office) as the main topics for further discussion.
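One way to sanity-check these labels is to pull the highest-gamma article for each topic (a minimal sketch; gamma is a document's topic proportion):

tidy(new_lda_allPosts, matrix = "gamma") %>%
group_by(topic) %>%
top_n(1, gamma) %>% # most representative article per topic
arrange(topic)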
# Name the topics
topic_name = c('皮克斯', '玩具總動員', '迪士尼', '冰雪奇緣', '腦筋急轉彎', '票房', '可可夜總會', '大英雄天團', '動物方程式', '無敵破壞王')

tmResult <- posterior(new_lda_allPosts)
doc_pro <- tmResult$topics
#dim(doc_pro)
allPosts_data <- all_posts %>%
select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics <- doc_pro[allPosts_data$artUrl,]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
news_topic = cbind(allPosts_data, document_topics_df)
#news_topic %>% head(10)

news_topic %>%
group_by(artDate = format(artDate,'%Y')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))

Pixar's share of the discussion runs slightly ahead of Disney's, and the box-office topic (票房) clearly gains share over time.
# Collect every user who has appeared, whether as an article poster or a commenter.
# (all_poster was not defined in this excerpt; we assume it is the union of
# posters and commenters, which matches the printed count below.)
all_poster <- unique(c(all_posts$artPoster, all_reviews$cmtPoster))
length(all_poster)
## [1] 19977
# Label a user "poster" if they have authored at least one article, otherwise "replyer"
userList <- data.frame(user=unique(all_poster)) %>%
mutate(type=ifelse(user%in%all_posts$artPoster, "poster", "replyer"))

## Build the social-network graph
postsReviews <- merge(x = all_posts, y = all_reviews, by = "artUrl")
#postsReviews
# Keep three columns: cmtPoster (commenter), artPoster (author), artUrl (article link)
link <- postsReviews %>%
select(cmtPoster, artPoster, artUrl)
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
# Draw the network
#plot(reviewNetwork)
# Shrink node size and edge width, and hide user names
#plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)

# Visualize the LDA topics on the network
new_lda_allPosts5 = ldas_allPosts[[3]] # again the k = 10 model
topics_allPosts5 <- tidy(new_lda_allPosts5, matrix = "beta")
# Assign each article its dominant topic
animate_topics <- tidy(new_lda_allPosts5, matrix="gamma") %>% # "gamma" extracts the theta (per-document topic) matrix
group_by(document) %>%
top_n(1, wt=gamma)
# Join the topic assignments onto the article/comment data
postsReviews <- merge(x = postsReviews, y = animate_topics, by.x = "artUrl", by.y="document")
# Keep articles dated after 2014/01/01,
# commenters with more than 15 comments on a single article,
# articles whose topic is 1 (Pixar) or 3 (Disney),
# and only the columns cmtPoster (commenter), artPoster (author), artUrl, topic
link <- postsReviews %>%
filter(artDate > as.Date('2014-01-01')) %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>15) %>%
ungroup() %>%
filter(topic == 1 | topic == 3 ) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))

# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role (poster vs. replyer)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color each edge by the topic of the article it belongs to
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred")
# Draw the social-network graph
set.seed(5432)
plot(reviewNetwork, vertex.size=1, edge.width=1, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 2, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("皮克斯","迪士尼"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)“filmwalker”較常討論關於『皮克斯』的主題。
“AACKball”,“xslayer”,“YanbinCao”較常討論關於『迪士尼』的主題。
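The best-connected accounts can be listed straight from the graph object (a minimal sketch):

sort(degree(reviewNetwork), decreasing = TRUE) %>% head(10) # most-connected users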
# PTT comments come in three kinds: 推 (upvote), 噓 (downvote), and → (neutral arrow).
# We only care about 推 and 噓, so drop the arrows
link <- postsReviews %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>5) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role (poster vs. replyer)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color each edge by comment type (推 vs. 噓)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# Draw the social-network graph
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 7, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)『迪士尼』與『皮克斯』討論文章大多為推文居多,代表大家對於兩個工作室的作品都抱有相當高正面的討論。
Beyond their different visual styles, Disney and Pixar films also convey different messages, and even films from the same studio differ in theme. Discussion therefore goes beyond the lead characters: audiences engage, from many different angles, with what each film is trying to say.