Many classic animated films come from two studios, Disney (迪士尼) and Pixar (皮克斯), whose signature works differ markedly in style. Here we compare the two through how much discussion their films generate on social media.
The raw CSV files of PTT articles and comments were obtained from the NSYSU College of Management text-analysis platform.
Through the platform we retrieved PTT data using Pixar-related keywords such as 『皮克斯』, 『Pixar』, and 『玩具總動員』 (Toy Story), and Disney-related keywords such as 『迪士尼』 and 『冰雪奇緣』 (Frozen). This yielded 711 Pixar-related articles and 822 Disney-related articles.
# Install the required packages (RColorBrewer added, since it is loaded below)
packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr","wordcloud2","scales","tm","reshape2","widyr","ggraph","tidyverse","RColorBrewer")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

# Load packages
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(scales)
library(tm)
library(reshape2)
library(widyr)
library(ggraph)
library(tidyverse)
library(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

# Read the data
# Disney posts
disney_posts <- read_csv("期末_迪士尼_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# Disney comments
disney_reviews <- read_csv("期末_迪士尼_articleReviews.csv")
# Pixar posts
pixar_posts <- read_csv("期末_皮克斯_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# Pixar comments
pixar_reviews <- read_csv("期末_皮克斯_articleReviews.csv")
# All posts
all_posts <- read_csv("all_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# All comments
all_reviews <- read_csv("all_articleReviews.csv")

# Convert the date column to Date
disney_posts$artDate <- disney_posts$artDate %>% as.Date("%Y/%m/%d")
disney_reviews$artDate <- disney_reviews$artDate %>% as.Date("%Y/%m/%d")
pixar_posts$artDate <- pixar_posts$artDate %>% as.Date("%Y/%m/%d")
pixar_reviews$artDate <- pixar_reviews$artDate %>% as.Date("%Y/%m/%d")
all_posts$artDate <- all_posts$artDate %>% as.Date("%Y/%m/%d")
all_reviews$artDate <- all_reviews$artDate %>% as.Date("%Y/%m/%d")

# Split Disney posts into sentences
disney_posts_sentences <- strsplit(disney_posts$sentence,"[.。!;?!?;]")
disney_posts_sentences <- data.frame(
artUrl = rep(disney_posts$artUrl, sapply(disney_posts_sentences, length)),
sentence = unlist(disney_posts_sentences),
artDate = rep(disney_posts$artDate, sapply(disney_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
disney_posts_sentences$sentence <- as.character(disney_posts_sentences$sentence)
# Split Pixar posts into sentences
pixar_posts_sentences <- strsplit(pixar_posts$sentence,"[.。!;?!?;]")
pixar_posts_sentences <- data.frame(
artUrl = rep(pixar_posts$artUrl, sapply(pixar_posts_sentences, length)),
sentence = unlist(pixar_posts_sentences),
artDate = rep(pixar_posts$artDate, sapply(pixar_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
pixar_posts_sentences$sentence <- as.character(pixar_posts_sentences$sentence)
# Split all posts into sentences
all_posts_sentences <- strsplit(all_posts$sentence, "[.。!;?!?;]")
all_posts_sentences <- data.frame(
artUrl = rep(all_posts$artUrl, sapply(all_posts_sentences, length)),
sentence = unlist(all_posts_sentences),
artDate = rep(all_posts$artDate, sapply(all_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
all_posts_sentences$sentence <- as.character(all_posts_sentences$sentence)

# Initialize a jieba segmentation engine,
# using our user dictionary and stop-word list
jieba_tokenizer = worker(user="all_dict/user_dict.txt", stop_word="all_dict/stop_words.txt")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# drop single-character tokens
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
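A quick way to confirm the tokenizer behaves as intended is to run it on a sample sentence (a minimal sketch; the sentence is made up and the exact tokens depend on the user dictionary):

chi_tokenizer("艾莎與安娜是冰雪奇緣的主角")
## e.g. [[1]] "艾莎" "安娜" "冰雪奇緣" "主角"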
# Tokenize each corpus and drop tokens containing digits or Latin letters
disney_posts_tokens <- disney_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
pixar_posts_tokens <- pixar_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
all_posts_tokens <- all_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))

# Disney: merge aliases into canonical tokens
# (note: tokens containing Latin letters were already filtered out above,
# so the English patterns below only matter if that filter is skipped)
disney_synonym <- function(i)
{
gsub("disney", "迪士尼", i) %>%
gsub("冰雪", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣2", "冰雪奇緣2", .) %>%
gsub("冰雪2", "冰雪奇緣2", .) %>%
gsub("frozen", "冰雪奇緣", .) %>%
gsub("frozen2", "冰雪奇緣2", .) %>%
gsub("Tangled", "魔法公主", .) %>%
gsub("WreckItRalph", "無敵破壞王", .) %>%
gsub("BigHero", "大英雄天團", .) %>%
gsub("Zootopia", "動物方程式", .) %>%
gsub("Moana", "海洋奇緣", .) %>%
gsub("RalphBreakstheInternet", "無敵破壞王2", .) %>%
gsub("Rapunzel", "樂佩", .) %>% #魔法公主女主角
gsub("elsa", "艾莎", .) %>% #冰雪奇緣女主角
gsub("anna", "安娜", .) %>% #冰雪奇緣女主角
gsub("Ralph", "雷夫", .) %>% #無敵破壞王主角
gsub("Vanellope", "雲妮露", .) %>% #無敵破壞王主角
gsub("Felix", "阿修", .) %>% #無敵破壞王主角
gsub("Hiro", "阿廣", .) %>% #大英雄天團主角
gsub("Baymax", "杯麵", .) %>% #大英雄天團主角
gsub("Tadashi", "阿正", .) %>% #大英雄天團主角
gsub("Judy", "哈茱蒂", .) %>% #動物方程式主角
gsub("艾爾莎", "艾莎", .)
}
## Apply the mapping to the tokenized posts
disney_posts_tokens$word <- disney_synonym(disney_posts_tokens$word)
# Pixar: merge aliases into canonical tokens
pixar_synonym <- function(i)
{
gsub("woody", "胡迪", i) %>%
gsub("dory", "多莉", .) %>%
gsub("toy story", "玩具總動員", .)
}
## Apply the mapping to the tokenized posts
pixar_posts_tokens$word <- pixar_synonym(pixar_posts_tokens$word)
# All posts: merge aliases (union of the two maps above)
all_synonym <- function(i)
{
gsub("disney", "迪士尼", i) %>%
gsub("冰雪", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣2", "冰雪奇緣2", .) %>%
gsub("冰雪2", "冰雪奇緣2", .) %>%
gsub("frozen", "冰雪奇緣", .) %>%
gsub("frozen2", "冰雪奇緣2", .) %>%
gsub("Tangled", "魔法公主", .) %>%
gsub("WreckItRalph", "無敵破壞王", .) %>%
gsub("BigHero", "大英雄天團", .) %>%
gsub("Zootopia", "動物方程式", .) %>%
gsub("Moana", "海洋奇緣", .) %>%
gsub("RalphBreakstheInternet", "無敵破壞王2", .) %>%
gsub("Rapunzel", "樂佩", .) %>% #魔法公主女主角
gsub("elsa", "艾莎", .) %>% #冰雪奇緣女主角
gsub("anna", "安娜", .) %>% #冰雪奇緣女主角
gsub("Ralph", "雷夫", .) %>% #無敵破壞王主角
gsub("Vanellope", "雲妮露", .) %>% #無敵破壞王主角
gsub("Felix", "阿修", .) %>% #無敵破壞王主角
gsub("Hiro", "阿廣", .) %>% #大英雄天團主角
gsub("Baymax", "杯麵", .) %>% #大英雄天團主角
gsub("Tadashi", "阿正", .) %>% #大英雄天團主角
gsub("Judy", "哈茱蒂", .) %>% #動物方程式主角
gsub("艾爾莎", "艾莎", .) %>%
gsub("woody", "胡迪", .) %>%
gsub("dory", "多莉", .) %>%
gsub("toy story", "玩具總動員", .)
}
# Apply the mapping to the tokenized posts
all_posts_tokens$word <- all_synonym(all_posts_tokens$word)

# Plot the number of posts per day
all_posts %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot(aes(artDate,count)) +
geom_line(color="skyblue", size=1.2)+
scale_x_date(labels = date_format("%Y-%m-%d"))+
theme_classic() -> p
p + geom_vline(xintercept = as.Date('2014-12-31'),color = "red", linetype = "dashed",size=0.5)+ # Big Hero 6
geom_vline(xintercept = as.Date('2019-11-21'),color = "red", linetype = "dashed",size=0.5)+ # Frozen 2
geom_vline(xintercept = as.Date('2015-08-07'),color = "darkgreen", linetype = "dashed",size=0.5)+ # Inside Out
geom_vline(xintercept = as.Date('2019-06-20'),color = "darkgreen", linetype = "dashed",size=0.5) # Toy Story 4

The plot shows sharp peaks and troughs: discussion surges while a film is in theaters and largely dies off once it leaves. We therefore focus on four films: Disney's Big Hero 6 (大英雄天團) and Frozen 2 (冰雪奇緣2), and Pixar's Inside Out (腦筋急轉彎) and Toy Story 4 (玩具總動員4).
Red dashed lines mark the Disney releases; green dashed lines mark the Pixar releases.
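To read off the exact dates behind the spikes, the daily post counts can be tallied directly; a minimal sketch, whose top rows should cluster around the four release dates marked above:

# Busiest discussion days
all_posts %>%
count(artDate, sort = TRUE) %>%
head(10)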
# Total term frequency across all articles
disney_posts_tokens_count <- disney_posts_tokens %>%
filter(nchar(.$word)>1) %>% # exclude single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 30) %>% # drop words that appear too rarely
arrange(desc(sum))
#head(disney_posts_tokens_count)
pixar_posts_tokens_count <- pixar_posts_tokens %>%
filter(nchar(.$word)>1) %>% # exclude single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 30) %>% # drop words that appear too rarely
arrange(desc(sum))
#disney_clean_words <- c("迪士尼","電影","動畫")
#disney_posts_tokens_count %>%
#filter(!(disney_posts_tokens_count$word %in% disney_clean_words)) %>%
#filter(sum > 100) #%>%
#wordcloud2()

Among Disney-related posts, Frozen (冰雪奇緣) draws the most discussion, and its two princesses, Elsa (艾莎) and Anna (安娜), are the most-discussed characters. Most other frequent terms are film titles and major character names. The word 歌曲 ("song") also appears often, reflecting the popularity of Disney's soundtracks.
#pixar_clean_words <- c("皮克斯","電影","動畫")
#pixar_posts_tokens_count %>%
#filter(!(pixar_posts_tokens_count$word %in% pixar_clean_words)) %>%
#filter(sum > 100) #%>%
#wordcloud2()

Among Pixar-related posts, Toy Story (玩具總動員) draws the most discussion, with Woody (胡迪) the most-discussed character. Most other frequent terms are again film titles and major character names.
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
elsa_posts_ngram <- disney_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
elsa_posts_ngram_separated <- elsa_posts_ngram %>%
separate(ngram, paste0("word", 1:11), sep = " ")
#head(elsa_posts_ngram_separated)
elsa_posts_check_words <- elsa_posts_ngram_separated %>%
filter((word6 == "艾莎"))
#elsa_posts_check_words
elsa_posts_check_words_count <- elsa_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word=value) %>%
filter(variable!="word6") %>% # drop the centre word itself
filter(!(word %in% stop_words$word), nchar(word)>1) %>% # tidytext's stop_words list
count(word, sort = TRUE)
elsa_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「艾莎」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Princess Anna (安娜), Elsa's younger sister, tops the list; other words such as 魔法 ("magic") and 公主 ("princess") are also strongly associated with Elsa.
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
baymax_posts_ngram <- disney_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
baymax_posts_ngram_separated <- baymax_posts_ngram %>%
separate(ngram, paste0("word", 1:11), sep = " ")
#head(baymax_posts_ngram_separated)
baymax_posts_check_words <- baymax_posts_ngram_separated %>%
filter((word6 == "杯麵"))
#baymax_posts_check_words
baymax_posts_check_words_count <- baymax_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word=value) %>%
filter(variable!="word6") %>% # drop the centre word itself
filter(!(word %in% stop_words$word), nchar(word)>1) %>% # tidytext's stop_words list
count(word, sort = TRUE)
baymax_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「杯麵」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Baymax (杯麵) is the 機器人 ("robot") invented by 阿正 (Tadashi), older brother of the protagonist 阿廣 (Hiro); nearby words such as 可愛 ("cute") and 照顧 ("care") capture his character.
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
woody_posts_ngram <- pixar_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
woody_posts_ngram_separated <- woody_posts_ngram %>%
separate(ngram, paste0("word", 1:11), sep = " ")
#head(woody_posts_ngram_separated)
woody_posts_check_words <- woody_posts_ngram_separated %>%
filter((word6 == "胡迪"))
#woody_posts_check_words
woody_posts_check_words_count <- woody_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word=value) %>%
filter(variable!="word6") %>% # drop the centre word itself
filter(!(word %in% stop_words$word), nchar(word)>1) %>% # tidytext's stop_words list
count(word, sort = TRUE)
woody_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「胡迪」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Woody (胡迪) is one of 安弟's (Andy's) 玩具 ("toys"); other nearby words name his companions (夥伴), such as 巴斯 (Buzz) and 牧羊女 (Bo Peep).
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
happy_posts_ngram <- pixar_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
happy_posts_ngram_separated <- happy_posts_ngram %>%
separate(ngram, paste0("word", 1:11), sep = " ")
#head(happy_posts_ngram_separated)
happy_posts_check_words <- happy_posts_ngram_separated %>%
filter((word6 == "樂樂"))
#happy_posts_check_words
happy_posts_check_words_count <- happy_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word=value) %>%
filter(variable!="word6") %>% # drop the centre word itself
filter(!(word %in% stop_words$word), nchar(word)>1) %>% # tidytext's stop_words list
count(word, sort = TRUE)
happy_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「樂樂」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Joy (樂樂) is one of the 情緒 ("emotion") characters inside 萊莉's (Riley's) mind, and Sadness (憂憂) is her opposite.
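The four blocks above repeat one pipeline with a different corpus and centre word. A reusable helper keeps them consistent; a sketch under the same assumptions (ngram_11, jieba_tokenizer, and tidytext's stop_words in scope):

# Count the words appearing within 5 tokens on either side of `target`
neighbor_word_counts <- function(posts, target) {
  posts %>%
    select(artUrl, sentence) %>%
    unnest_tokens(ngram, sentence, token = ngram_11) %>%
    filter(!str_detect(ngram, regex("[0-9a-zA-Z]"))) %>%
    separate(ngram, paste0("word", 1:11), sep = " ") %>%
    filter(word6 == target) %>% # word6 is the centre of each 11-gram
    melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
    rename(word = value) %>%
    filter(variable != "word6") %>%
    filter(!(word %in% stop_words$word), nchar(word) > 1) %>%
    count(word, sort = TRUE)
}
# e.g. neighbor_word_counts(pixar_posts, "樂樂") reproduces the counts above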
# Disney
# The earlier tokenization did not use the added dictionary,
# so we re-tokenize and count each word's occurrences in each article
disney_posts_words <- disney_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
# Count how often two words co-occur in the same article
disney_word_pairs <- disney_posts_words %>%
pairwise_count(word, artUrl, sort = TRUE)
# Compute pairwise correlations between words
disney_word_cors <- disney_posts_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
# Plot combinations with correlation above 0.5
set.seed(2020)
disney_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE,family='STHeitiTC-Light') + # set a Chinese font so labels render correctly
theme_void()

Posts sharing viewing impressions (觀影心得) also talk about whether a post is a spoiler (雷文) and whether it needs a spoiler divider, while the Big Hero 6 (大英雄天團) cluster centres on words such as robot, Baymax (杯麵), and invention.
# Pixar
# The earlier tokenization did not use the added dictionary,
# so we re-tokenize and count each word's occurrences in each article
pixar_posts_words <- pixar_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
# Count how often two words co-occur in the same article
pixar_word_pairs <- pixar_posts_words %>%
pairwise_count(word, artUrl, sort = TRUE)
# Compute pairwise correlations between words
pixar_word_cors <- pixar_posts_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
# Plot combinations with correlation above 0.5
set.seed(2020)
pixar_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE,family='STHeitiTC-Light') + # set a Chinese font so labels render correctly
theme_void()

Inside Out (腦筋急轉彎) discussion links the emotion characters with words about the brain, such as memory and the subconscious, while Toy Story (玩具總動員) discussion is often about spoilers, hence words about spoiler dividers.
# Positive-word dictionary (comma-separated txt file)
P <- read_file("liwc/positive.txt")
# Negative-word dictionary (comma-separated txt file)
N <- read_file("liwc/negative.txt")
# Split each string on commas;
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build a data frame with two columns, word and sentiment,
# where word holds the dictionary entries
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
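A quick look at the assembled dictionary (a sketch; the counts depend on the txt files):

table(LIWC$sentiment) # number of positive vs. negative entries
head(LIWC)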
# Join the corpus tokens against the LIWC dictionary to label each
# matched word as positive or negative
### Count word frequencies
disney_posts_tokens <- disney_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date <- disney_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
disney_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 199 x 2
## word sentiment
## <chr> <fct>
## 1 破壞 negative
## 2 破壞 negative
## 3 破壞 negative
## 4 破壞 negative
## 5 遊戲 positive
## 6 情緒 negative
## 7 勇氣 positive
## 8 遊戲 positive
## 9 遊戲 positive
## 10 破壞 negative
## # … with 189 more rows
pixar_posts_tokens <- pixar_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date <- pixar_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
pixar_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 237 x 2
## word sentiment
## <chr> <fct>
## 1 寶貝 positive
## 2 解決 positive
## 3 情緒 negative
## 4 情緒 negative
## 5 驚嚇 negative
## 6 情緒 negative
## 7 驚嚇 negative
## 8 情緒 negative
## 9 情緒 negative
## 10 情緒 negative
## # … with 227 more rows
all_posts_tokens <- all_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
all_tokens_by_date <- all_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
all_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 490 x 2
## word sentiment
## <chr> <fct>
## 1 寶貝 positive
## 2 解決 positive
## 3 破壞 negative
## 4 情緒 negative
## 5 情緒 negative
## 6 驚嚇 negative
## 7 遊戲 positive
## 8 情緒 negative
## 9 破壞 negative
## 10 驚嚇 negative
## # … with 480 more rows
# Count each day's positive and negative words
disney_sentiment_count = disney_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
pixar_sentiment_count = pixar_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
all_sentiment_count = all_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
all_sentiment_count %>%
ggplot() +
geom_line(aes(x=artDate,y=count,colour=sentiment)) +
scale_x_date(labels = date_format("%Y/%m/%d")) -> p
p + geom_vline(xintercept = as.Date('2014-12-31'),color = "red", linetype = "dashed",size=0.2)+ # Big Hero 6
geom_vline(xintercept = as.Date('2019-11-21'),color = "red", linetype = "dashed",size=0.2)+ # Frozen 2
geom_vline(xintercept = as.Date('2015-08-07'),color = "darkgreen", linetype = "dashed",size=0.2)+ # Inside Out
geom_vline(xintercept = as.Date('2019-06-20'),color = "darkgreen", linetype = "dashed",size=0.2) # Toy Story 4

Red dashed lines mark the Disney releases and green the Pixar releases. Toy Story 4 (玩具總動員4) shows a strikingly large spike in positive words, suggesting it was received as an especially positive film; the other three films show no clear difference between positive and negative counts.
disney_posts_select_frozen <- disney_posts %>%
filter(str_detect(disney_posts$artTitle, "冰雪奇緣2") | str_detect(disney_posts$artTitle, "frozen2"))
# Count word frequencies
disney_posts_tokens_frozen <- disney_posts_select_frozen %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date_frozen <- disney_posts_tokens_frozen %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count each day's positive and negative words
sentiment_count_frozen = disney_tokens_by_date_frozen %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
disney_posts_tokens_frozen %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『冰雪奇緣2』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

For Frozen 2 (冰雪奇緣2), frequent positive words include 喜歡 ("like"), 相信 ("trust"), 冒險 ("adventure"), and 愛情 ("love"); frequent negative words include 失去 ("loss"), 恐懼 ("fear"), and 害怕 ("afraid").
disney_posts_select_hero <- disney_posts %>%
filter(str_detect(disney_posts$artTitle, "大英雄天團") | str_detect(disney_posts$artTitle, "big hero 6"))
# Count word frequencies
disney_posts_tokens_hero <- disney_posts_select_hero %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date_hero <- disney_posts_tokens_hero %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count each day's positive and negative words
sentiment_count_hero = disney_tokens_by_date_hero %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
disney_posts_tokens_hero %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『大英雄天團』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

For Big Hero 6 (大英雄天團), frequent positive words include 英雄 ("hero"), 溫暖 ("warmth"), 溫馨 ("heart-warming"), and 分享 ("sharing"); frequent negative words include 反派 ("villain"), 復仇 ("revenge"), 戰鬥 ("fighting"), and 失去 ("loss").
pixar_posts_select_toy <- pixar_posts %>%
filter(str_detect(pixar_posts$artTitle, "玩具總動員4") | str_detect(pixar_posts$artTitle, "toy4"))
# Count word frequencies
pixar_posts_tokens_toy <- pixar_posts_select_toy %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date_toy <- pixar_posts_tokens_toy %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count each day's positive and negative words
sentiment_count_toy = pixar_tokens_by_date_toy %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
pixar_posts_tokens_toy %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『玩具總動員4』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

For Toy Story 4 (玩具總動員4), frequent positive words include 自由 ("freedom"), 朋友 ("friends"), 重要 ("important"), and 冒險 ("adventure"); frequent negative words include 失去 ("loss"), 放下 ("letting go"), and 反派 ("villain").
pixar_posts_select_brain <- pixar_posts %>%
filter(str_detect(pixar_posts$artTitle, "腦筋急轉彎") | str_detect(pixar_posts$artTitle, "inside out"))
# Count word frequencies
pixar_posts_tokens_brain <- pixar_posts_select_brain %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date_brain <- pixar_posts_tokens_brain %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count each day's positive and negative words
sentiment_count_brain = pixar_tokens_by_date_brain %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
pixar_posts_tokens_brain %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『腦筋急轉彎』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

For Inside Out (腦筋急轉彎), frequent positive words include 希望 ("hope"), 冒險 ("adventure"), and 相信 ("trust"); frequent negative words include 憂傷 ("sorrow"), 難過 ("sadness"), and 控制 ("control").
# Convert the tokens to a Document-Term Matrix (DTM)
#all_posts_tokens <- all_posts %>%
#unnest_tokens(word, sentence, token=chi_tokenizer) %>%
#filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
#count(artUrl, word) %>%
#rename(count=n)
#all_posts_dtm <- all_posts_tokens %>% cast_dtm(artUrl, word, count)
#inspect(all_posts_dtm[1:10,1:10])

# Fit LDA models for several candidate topic counts
# (slow, so the results are saved to disk and this block is left commented out)
#ldas_allPosts = c()
#topics = c(3, 5, 10, 25, 36)
#for(topic in topics){
#start_time <- Sys.time()
#lda_allPosts <- LDA(all_posts_dtm, k = topic, control = list(seed = 2020))
#ldas_allPosts = c(ldas_allPosts, lda_allPosts)
#print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
#save(ldas_allPosts, file = "ldas_result_allPosts")
#}

# Compare the models by perplexity
#topics = c(3, 5, 10, 25, 36)
#data_frame(k = topics,
#           perplex = map_dbl(ldas_allPosts, topicmodels::perplexity)) %>%
#ggplot(aes(k, perplex)) +
#geom_point() +
#geom_line() +
#labs(title = "Evaluating LDA topic models",
#subtitle = "Optimal number of topics (smaller is better)",
#x = "Number of topics",
#y = "Perplexity")

Using perplexity to pick the number of topics, we settle on 10 topics for the remaining analysis.
new_lda_allPosts = ldas_allPosts[[3]] ## the k = 10 model (third entry of topics)
topics_allPosts <- tidy(new_lda_allPosts, matrix = "beta") # "beta" extracts the Phi (per-topic word distribution) matrix
remove_words = c("冰雪奇緣", "迪士尼", "艾莎", "安娜", "無敵破壞王", "電影", "劇情", "動畫", "這部", "角色", "公主", "我們", "網路", "世界")
top_terms_allPosts <- topics_allPosts %>%
filter(!term %in% remove_words)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_allPosts %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()+
theme(text = element_text(family='STHeitiTC-Light'))

Topic 1 - Pixar
Topic 2 - Toy Story
Topic 3 - Disney
Topic 4 - Frozen
Topic 5 - Inside Out
Topic 6 - Box office
Topic 7 - Coco
Topic 8 - Big Hero 6
Topic 9 - Zootopia
Topic 10 - Wreck-It Ralph

We take Topic 1 (Pixar), Topic 3 (Disney), and Topic 6 (box office) as the main topics for further discussion.
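One way to sanity-check these labels is to pull the highest-gamma article for each topic (a minimal sketch; gamma is a document's topic proportion):

tidy(new_lda_allPosts, matrix = "gamma") %>%
group_by(topic) %>%
top_n(1, gamma) %>% # most representative article per topic
arrange(topic)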
# Name the topics
topic_name = c('皮克斯', '玩具總動員', '迪士尼', '冰雪奇緣', '腦筋急轉彎', '票房', '可可夜總會', '大英雄天團', '動物方程式', '無敵破壞王')

tmResult <- posterior(new_lda_allPosts)
doc_pro <- tmResult$topics
#dim(doc_pro)
allPosts_data <- all_posts %>%
select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics <- doc_pro[allPosts_data$artUrl,]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
news_topic = cbind(allPosts_data, document_topics_df)
#news_topic %>% head(10)

news_topic %>%
group_by(artDate = format(artDate,'%Y')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))

Pixar's share of the discussion runs slightly ahead of Disney's, and the box-office topic (票房) clearly gains share over time.
# Collect every user who has appeared, whether as an article poster or a commenter.
# (all_poster was not defined in this excerpt; we assume it is the union of
# posters and commenters, which matches the printed count below.)
all_poster <- unique(c(all_posts$artPoster, all_reviews$cmtPoster))
length(all_poster)
## [1] 19977
# Label a user "poster" if they have authored at least one article, otherwise "replyer"
userList <- data.frame(user=unique(all_poster)) %>%
mutate(type=ifelse(user%in%all_posts$artPoster, "poster", "replyer"))

## Build the social-network graph
postsReviews <- merge(x = all_posts, y = all_reviews, by = "artUrl")
#postsReviews
# Keep three columns: cmtPoster (commenter), artPoster (author), artUrl (article link)
link <- postsReviews %>%
select(cmtPoster, artPoster, artUrl)
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
# Draw the network
#plot(reviewNetwork)
# Shrink node size and edge width, and hide user names
#plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)

# Visualize the LDA topics on the network
new_lda_allPosts5 = ldas_allPosts[[3]] # again the k = 10 model
topics_allPosts5 <- tidy(new_lda_allPosts5, matrix = "beta")
# Assign each article its dominant topic
animate_topics <- tidy(new_lda_allPosts5, matrix="gamma") %>% # "gamma" extracts the theta (per-document topic) matrix
group_by(document) %>%
top_n(1, wt=gamma)
# Join the topic assignments onto the article/comment data
postsReviews <- merge(x = postsReviews, y = animate_topics, by.x = "artUrl", by.y="document")
# Keep articles dated after 2014/01/01,
# commenters with more than 15 comments on a single article,
# articles whose topic is 1 (Pixar) or 3 (Disney),
# and only the columns cmtPoster (commenter), artPoster (author), artUrl, topic
link <- postsReviews %>%
filter(artDate > as.Date('2014-01-01')) %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>15) %>%
ungroup() %>%
filter(topic == 1 | topic == 3 ) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))

# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role (poster vs. replyer)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color each edge by the topic of the article it belongs to
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred")
# Draw the social-network graph
set.seed(5432)
plot(reviewNetwork, vertex.size=1, edge.width=1, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 2, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("皮克斯","迪士尼"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)“filmwalker”較常討論關於『皮克斯』的主題。
“AACKball”,“xslayer”,“YanbinCao”較常討論關於『迪士尼』的主題。
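The best-connected accounts can be listed straight from the graph object (a minimal sketch):

sort(degree(reviewNetwork), decreasing = TRUE) %>% head(10) # most-connected users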
# PTT comments come in three kinds: 推 (upvote), 噓 (downvote), and → (neutral arrow).
# We only care about 推 and 噓, so drop the arrows
link <- postsReviews %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>5) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role (poster vs. replyer)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color each edge by comment type (推 vs. 噓)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# Draw the social-network graph
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 7, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)『迪士尼』與『皮克斯』討論文章大多為推文居多,代表大家對於兩個工作室的作品都抱有相當高正面的討論。
Beyond their different visual styles, Disney and Pixar films also convey different messages, and even films from the same studio differ in theme. Discussion therefore goes beyond the lead characters: audiences engage, from many different angles, with what each film is trying to say.