Many classic animated films come from the two big studios, Disney (迪士尼) and Pixar (皮克斯), whose signature works differ markedly in style. Below we compare the two studios through the volume and content of social-media discussion of their films.
The raw CSV files contain PTT articles and comments collected via the NSYSU College of Management text-analysis platform.
Through that platform we retrieved PTT posts matching Pixar-related keywords such as 『皮克斯』, 『Pixar』, and 『玩具總動員』 (and other Pixar film titles), and Disney-related keywords such as 『迪士尼』 and 『冰雪奇緣』 (and other Disney film titles). This yielded 711 Pixar-related articles and 822 Disney-related articles.
# Install any missing packages (RColorBrewer added here since it is loaded below)
packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr", "wordcloud2", "scales", "tm", "reshape2", "widyr", "ggraph", "tidyverse", "textstem", "e1071", "rpart.plot", "rpart", "caTools", "caret", "RColorBrewer")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

# Load the packages
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(scales)
library(tm)
library(reshape2)
library(widyr)
library(ggraph)
library(tidyverse)
library(RColorBrewer)
library(textstem)
library(e1071)
library(rpart)
library(rpart.plot)
library(caTools)
library(caret)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

# Read the data
# Disney posts
disney_posts <- read_csv("期末_迪士尼_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# Disney comments
disney_reviews <- read_csv("期末_迪士尼_articleReviews.csv")
# Pixar posts
pixar_posts <- read_csv("期末_皮克斯_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# Pixar comments
pixar_reviews <- read_csv("期末_皮克斯_articleReviews.csv")
# All posts
all_posts <- read_csv("all_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# All comments
all_reviews <- read_csv("all_articleReviews.csv")

# Convert the date columns to Date format
disney_posts$artDate <- disney_posts$artDate %>% as.Date("%Y/%m/%d")
disney_reviews$artDate <- disney_reviews$artDate %>% as.Date("%Y/%m/%d")
pixar_posts$artDate <- pixar_posts$artDate %>% as.Date("%Y/%m/%d")
pixar_reviews$artDate <- pixar_reviews$artDate %>% as.Date("%Y/%m/%d")
all_posts$artDate <- all_posts$artDate %>% as.Date("%Y/%m/%d")
all_reviews$artDate <- all_reviews$artDate %>% as.Date("%Y/%m/%d")

# Split Disney posts into sentences
disney_posts_sentences <- strsplit(disney_posts$sentence,"[.。!;?!?;]")
disney_posts_sentences <- data.frame(
artUrl = rep(disney_posts$artUrl, sapply(disney_posts_sentences, length)),
sentence = unlist(disney_posts_sentences),
artDate = rep(disney_posts$artDate, sapply(disney_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
disney_posts_sentences$sentence <- as.character(disney_posts_sentences$sentence)
# Split Pixar posts into sentences
pixar_posts_sentences <- strsplit(pixar_posts$sentence,"[.。!;?!?;]")
pixar_posts_sentences <- data.frame(
artUrl = rep(pixar_posts$artUrl, sapply(pixar_posts_sentences, length)),
sentence = unlist(pixar_posts_sentences),
artDate = rep(pixar_posts$artDate, sapply(pixar_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
pixar_posts_sentences$sentence <- as.character(pixar_posts_sentences$sentence)
# Split all posts into sentences
all_posts_sentences <- strsplit(all_posts$sentence, "[.。!;?!?;]")
all_posts_sentences <- data.frame(
artUrl = rep(all_posts$artUrl, sapply(all_posts_sentences, length)),
sentence = unlist(all_posts_sentences),
artDate = rep(all_posts$artDate, sapply(all_posts_sentences, length))
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
all_posts_sentences$sentence <- as.character(all_posts_sentences$sentence)

# Initialize a jieba tokenizer.
# Note: despite what the original comment claimed ("default parameters, no
# dictionary or stop words"), this worker does use a custom user dictionary
# and a stop-word list.
jieba_tokenizer = worker(user="all_dict/user_dict.txt", stop_word="all_dict/stop_words.txt")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# Drop single-character tokens
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}

# Tokenize each corpus and drop tokens containing digits or Latin letters
disney_posts_tokens <- disney_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
pixar_posts_tokens <- pixar_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
all_posts_tokens <- all_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))

# Disney synonym normalization
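As a quick sanity check, the tokenizer can be called directly on a sample sentence (a sketch; the exact segmentation depends on all_dict/user_dict.txt):

chi_tokenizer("艾莎是冰雪奇緣的女主角")
## e.g. [[1]] "艾莎" "冰雪奇緣" "女主角" (actual output depends on the user dictionary)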
disney_synonym <- function(i)
{
gsub("disney", "迪士尼", i) %>%
gsub("冰雪", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣2", "冰雪奇緣2", .) %>%
gsub("冰雪2", "冰雪奇緣2", .) %>%
gsub("frozen", "冰雪奇緣", .) %>%
gsub("frozen2", "冰雪奇緣2", .) %>%
gsub("Tangled", "魔髮奇緣", .) %>% # Tangled's Taiwan title is 魔髮奇緣 (魔法公主 is Princess Mononoke)
gsub("WreckItRalph", "無敵破壞王", .) %>%
gsub("BigHero", "大英雄天團", .) %>%
gsub("Zootopia", "動物方程式", .) %>%
gsub("Moana", "海洋奇緣", .) %>%
gsub("RalphBreakstheInternet", "無敵破壞王2", .) %>%
gsub("Rapunzel", "樂佩", .) %>% # female lead of Tangled
gsub("elsa", "艾莎", .) %>% # female lead of Frozen
gsub("anna", "安娜", .) %>% # female lead of Frozen
gsub("Ralph", "雷夫", .) %>% # lead of Wreck-It Ralph
gsub("Vanellope", "雲妮露", .) %>% # lead of Wreck-It Ralph
gsub("Felix", "阿修", .) %>% # lead of Wreck-It Ralph
gsub("Hiro", "阿廣", .) %>% # lead of Big Hero 6
gsub("Baymax", "杯麵", .) %>% # lead of Big Hero 6
gsub("Tadashi", "阿正", .) %>% # lead of Big Hero 6
gsub("Judy", "哈茱蒂", .) %>% # lead of Zootopia
gsub("艾爾莎", "艾莎", .)
}
## Apply the function to the post tokens
disney_posts_tokens$word <- disney_synonym(disney_posts_tokens$word)
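A quick check of the replacement rules and their ordering (a sketch; the expected values assume the rules above). Note that gsub is case-sensitive, and that tokens containing Latin characters were filtered out before this step, so the English-language rules only take effect if the function is applied to text that still contains them:

disney_synonym(c("elsa", "frozen2", "冰雪", "冰雪奇緣", "艾爾莎"))
## expected: "艾莎" "冰雪奇緣2" "冰雪奇緣" "冰雪奇緣" "艾莎"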
# Pixar synonym normalization
pixar_synonym <- function(i)
{
gsub("woody", "胡迪", i) %>%
gsub("dory", "多莉", .) %>%
gsub("toy story", "玩具總動員", .)
}
## Apply the function to the post tokens
pixar_posts_tokens$word <- pixar_synonym(pixar_posts_tokens$word)

# Combined synonym normalization for the full corpus
all_synonym <- function(i)
{
gsub("disney", "迪士尼", i) %>%
gsub("冰雪", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣", "冰雪奇緣", .) %>%
gsub("冰雪奇緣奇緣2", "冰雪奇緣2", .) %>%
gsub("冰雪2", "冰雪奇緣2", .) %>%
gsub("frozen", "冰雪奇緣", .) %>%
gsub("frozen2", "冰雪奇緣2", .) %>%
gsub("Tangled", "魔髮奇緣", .) %>% # Tangled's Taiwan title is 魔髮奇緣
gsub("WreckItRalph", "無敵破壞王", .) %>%
gsub("BigHero", "大英雄天團", .) %>%
gsub("Zootopia", "動物方程式", .) %>%
gsub("Moana", "海洋奇緣", .) %>%
gsub("RalphBreakstheInternet", "無敵破壞王2", .) %>%
gsub("Rapunzel", "樂佩", .) %>% # female lead of Tangled
gsub("elsa", "艾莎", .) %>% # female lead of Frozen
gsub("anna", "安娜", .) %>% # female lead of Frozen
gsub("Ralph", "雷夫", .) %>% # lead of Wreck-It Ralph
gsub("Vanellope", "雲妮露", .) %>% # lead of Wreck-It Ralph
gsub("Felix", "阿修", .) %>% # lead of Wreck-It Ralph
gsub("Hiro", "阿廣", .) %>% # lead of Big Hero 6
gsub("Baymax", "杯麵", .) %>% # lead of Big Hero 6
gsub("Tadashi", "阿正", .) %>% # lead of Big Hero 6
gsub("Judy", "哈茱蒂", .) %>% # lead of Zootopia
gsub("艾爾莎", "艾莎", .) %>%
gsub("woody", "胡迪", .) %>%
gsub("dory", "多莉", .) %>%
gsub("toy story", "玩具總動員", .)
}
# Apply the function to the post tokens
all_posts_tokens$word <- all_synonym(all_posts_tokens$word)

# Plot the daily article counts over time
all_posts %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot(aes(artDate,count)) +
geom_line(color="skyblue", size=1.2)+
scale_x_date(labels = date_format("%Y-%m-%d"))+
theme_classic() -> p
## `summarise()` ungrouping output (override with `.groups` argument)
p + geom_vline(xintercept = as.Date('2014-12-31'),color = "red", linetype = "dashed",size=0.5)+ # Big Hero 6
geom_vline(xintercept = as.Date('2019-11-21'),color = "red", linetype = "dashed",size=0.5)+ # Frozen 2
geom_vline(xintercept = as.Date('2015-08-07'),color = "darkgreen", linetype = "dashed",size=0.5)+ # Inside Out
geom_vline(xintercept = as.Date('2019-06-20'),color = "darkgreen", linetype = "dashed",size=0.5) # Toy Story 4

The plot shows sharp peaks and troughs in discussion volume: a film is discussed intensely while it is in theaters and hardly at all once it leaves. The analysis below focuses on four films: Disney's Big Hero 6 (大英雄天團) and Frozen 2 (冰雪奇緣2), and Pixar's Inside Out (腦筋急轉彎) and Toy Story 4 (玩具總動員4).
Red dashed lines mark the Disney releases; green dashed lines mark the Pixar releases.
# Total term frequency across all posts
disney_posts_tokens_count <- disney_posts_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 30) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
#head(disney_posts_tokens_count)
pixar_posts_tokens_count <- pixar_posts_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 30) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)

# Word cloud (commented out in the original)
#disney_clean_words <- c("迪士尼","電影","動畫")
#disney_posts_tokens_count %>%
#filter(!(disney_posts_tokens_count$word %in% disney_clean_words)) %>%
#filter(sum > 100) #%>%
#wordcloud2()

Among Disney-related posts, Frozen (冰雪奇緣) draws the most discussion, and its two princesses, Elsa (艾莎) and Anna (安娜), are the most-discussed characters; most other frequent terms are film titles and major character names. The word 歌曲 (songs) also appears often, reflecting the popularity of Disney's soundtracks.

#pixar_clean_words <- c("皮克斯","電影","動畫")
#pixar_posts_tokens_count %>%
#filter(!(pixar_posts_tokens_count$word %in% pixar_clean_words)) %>%
#filter(sum > 100) #%>%
#wordcloud2()

Among Pixar-related posts, Toy Story (玩具總動員) draws the most discussion, with Woody (胡迪) the most-discussed character; the other frequent terms are again film titles and major character names.
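For reference, a minimal runnable version of the commented-out Disney word cloud above (a sketch; wordcloud2 renders an HTML widget, so it is best run interactively):

disney_clean_words <- c("迪士尼","電影","動畫") # generic high-frequency terms to exclude
disney_posts_tokens_count %>%
filter(!(word %in% disney_clean_words)) %>%
filter(sum > 100) %>%
wordcloud2()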
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
elsa_posts_ngram <- disney_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
elsa_posts_ngram_separated <- elsa_posts_ngram %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
#head(elsa_posts_ngram_separated)
elsa_posts_check_words <- elsa_posts_ngram_separated %>%
filter((word6 == "艾莎"))
#elsa_posts_check_words
elsa_posts_check_words_count <- elsa_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
elsa_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「艾莎」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Princess Anna (安娜), Elsa's (艾莎) younger sister, has the highest count; other terms such as 魔法 (magic) and 公主 (princess) are also closely associated with her.
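The same 11-gram neighborhood analysis is repeated below for 杯麵, 胡迪, and 樂樂. A reusable helper consolidating the pattern might look like the following sketch (it filters with stop_words$word, which the inline chunks appear to intend with their stop_words filter):

# Count the words appearing within five tokens of `target` (a sketch)
neighbor_word_counts <- function(posts, target) {
posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]"))) %>%
separate(ngram, paste0("word", 1:11), sep = " ") %>%
filter(word6 == target) %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
rename(word = value) %>%
filter(variable != "word6") %>%
filter(!(word %in% stop_words$word), nchar(word) > 1) %>%
count(word, sort = TRUE)
}
# e.g. neighbor_word_counts(disney_posts, "艾莎") reproduces the counts above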
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
baymax_posts_ngram <- disney_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
baymax_posts_ngram_separated <- baymax_posts_ngram %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
#head(baymax_posts_ngram_separated)
baymax_posts_check_words <- baymax_posts_ngram_separated %>%
filter((word6 == "杯麵"))
#baymax_posts_check_words
baymax_posts_check_words_count <- baymax_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
baymax_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「杯麵」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Baymax (杯麵) is the robot (機器人) invented by Tadashi (阿正), older brother of the protagonist Hiro (阿廣); the nearby terms reflect his trademark cuteness (可愛) and caregiving (照顧).
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
woody_posts_ngram <- pixar_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
woody_posts_ngram_separated <- woody_posts_ngram %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
#head(woody_posts_ngram_separated)
woody_posts_check_words <- woody_posts_ngram_separated %>%
filter((word6 == "胡迪"))
#woody_posts_check_words
woody_posts_check_words_count <- woody_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
woody_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「胡迪」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Woody (胡迪) is one of Andy's (安弟) toys (玩具); other nearby terms include Woody's companions (夥伴) such as Buzz (巴斯) and Bo Peep (牧羊女).
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
happy_posts_ngram <- pixar_posts %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
happy_posts_ngram_separated <- happy_posts_ngram %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
#head(happy_posts_ngram_separated)
happy_posts_check_words <- happy_posts_ngram_separated %>%
filter((word6 == "樂樂"))
#happy_posts_check_words
happy_posts_check_words_count <- happy_posts_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
happy_posts_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("貼文出現在「樂樂」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))

Joy (樂樂) is one of the emotion (情緒) characters (角色) in Riley's (萊莉) brain (大腦); Sadness (憂憂) is her opposite number.
# Disney
# The earlier tokenization did not use the updated dictionary,
# so we re-tokenize and count each term's occurrences per article
disney_posts_words <- disney_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
# Count how often two terms co-occur in the same article
disney_word_pairs <- disney_posts_words %>%
pairwise_count(word, artUrl, sort = TRUE)
## Warning: `distinct_()` is deprecated as of dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Compute pairwise correlations between terms
disney_word_cors <- disney_posts_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
# Show pairs with correlation above 0.5
set.seed(2020)
disney_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE,family='STHeitiTC-Light') + # set a Chinese font so CJK labels render correctly
theme_void()

Posts sharing viewing impressions (觀影心得) also discuss whether a post is a spoiler (雷文) and whether a spoiler-warning divider (防雷分隔線) is needed, while Big Hero 6 (大英雄天團) threads revolve around the robot Baymax (杯麵) and invention-related terms.
# Pixar
# The earlier tokenization did not use the updated dictionary,
# so we re-tokenize and count each term's occurrences per article
pixar_posts_words <- pixar_posts_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
# Count how often two terms co-occur in the same article
pixar_word_pairs <- pixar_posts_words %>%
pairwise_count(word, artUrl, sort = TRUE)
# Compute pairwise correlations between terms
pixar_word_cors <- pixar_posts_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
# Show pairs with correlation above 0.5
set.seed(2020)
pixar_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE,family='STHeitiTC-Light') + # set a Chinese font so CJK labels render correctly
theme_void()

Threads on Inside Out (腦筋急轉彎) discuss not only the emotion characters but also memory (記憶) and the subconscious (潛意識); Toy Story (玩具總動員) threads apparently involve a lot of spoiler talk, which is why divider-line (分隔線) terms surface.
# Positive-word dictionary (txt)
# entries are comma-separated
P <- read_file("liwc/positive.txt")
# Negative-word dictionary (txt)
N <- read_file("liwc/negative.txt")
# Split the strings on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build a data frame with columns word and sentiment
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
# Join tokens against the LIWC dictionary to label
# each word in the corpus as positive or negative
### Count term frequencies
disney_posts_tokens <- disney_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date <- disney_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
disney_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Joining, by = "word"
## # A tibble: 199 x 2
## word sentiment
## <chr> <fct>
## 1 破壞 negative
## 2 破壞 negative
## 3 破壞 negative
## 4 破壞 negative
## 5 遊戲 positive
## 6 情緒 negative
## 7 勇氣 positive
## 8 遊戲 positive
## 9 遊戲 positive
## 10 破壞 negative
## # … with 189 more rows
pixar_posts_tokens <- pixar_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date <- pixar_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
pixar_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Joining, by = "word"
## # A tibble: 237 x 2
## word sentiment
## <chr> <fct>
## 1 寶貝 positive
## 2 解決 positive
## 3 情緒 negative
## 4 情緒 negative
## 5 驚嚇 negative
## 6 情緒 negative
## 7 驚嚇 negative
## 8 情緒 negative
## 9 情緒 negative
## 10 情緒 negative
## # … with 227 more rows
all_posts_tokens <- all_posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
all_tokens_by_date <- all_posts_tokens %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
all_tokens_by_date %>%
inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Joining, by = "word"
## # A tibble: 490 x 2
## word sentiment
## <chr> <fct>
## 1 寶貝 positive
## 2 解決 positive
## 3 破壞 negative
## 4 情緒 negative
## 5 情緒 negative
## 6 驚嚇 negative
## 7 遊戲 positive
## 8 情緒 negative
## 9 破壞 negative
## 10 驚嚇 negative
## # … with 480 more rows
# Count daily totals of positive and negative words
disney_sentiment_count = disney_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
pixar_sentiment_count = pixar_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
all_sentiment_count = all_tokens_by_date %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
all_sentiment_count %>%
ggplot() +
geom_line(aes(x=artDate,y=count,colour=sentiment)) +
scale_x_date(labels = date_format("%Y/%m/%d")) -> p
p + geom_vline(xintercept = as.Date('2014-12-31'),color = "red", linetype = "dashed",size=0.2)+ # Big Hero 6
geom_vline(xintercept = as.Date('2019-11-21'),color = "red", linetype = "dashed",size=0.2)+ # Frozen 2
geom_vline(xintercept = as.Date('2015-08-07'),color = "darkgreen", linetype = "dashed",size=0.2)+ # Inside Out
geom_vline(xintercept = as.Date('2019-06-20'),color = "darkgreen", linetype = "dashed",size=0.2) # Toy Story 4

Red marks the Disney releases, green the Pixar releases. Toy Story 4 (玩具總動員4) stands out for its volume of positive words, suggesting it was received as a distinctly positive film; the other three films show no clear difference.
disney_posts_select_frozen <- disney_posts %>%
filter(str_detect(disney_posts$artTitle, "冰雪奇緣2") | str_detect(disney_posts$artTitle, "frozen2"))
# Count term frequencies
disney_posts_tokens_frozen <- disney_posts_select_frozen %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date_frozen <- disney_posts_tokens_frozen %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count daily totals of positive and negative words
sentiment_count_frozen = disney_tokens_by_date_frozen %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
disney_posts_tokens_frozen %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『冰雪奇緣2』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

For Frozen 2 (冰雪奇緣2), frequent positive words include 喜歡 (like), 相信 (trust), 冒險 (adventure), and 愛情 (love); frequent negative words include 失去 (loss), 恐懼 (fear), and 害怕 (afraid).
disney_posts_select_hero <- disney_posts %>%
filter(str_detect(disney_posts$artTitle, "大英雄天團") | str_detect(disney_posts$artTitle, "big hero 6"))
# Count term frequencies
disney_posts_tokens_hero <- disney_posts_select_hero %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
disney_tokens_by_date_hero <- disney_posts_tokens_hero %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count daily totals of positive and negative words
sentiment_count_hero = disney_tokens_by_date_hero %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
disney_posts_tokens_hero %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『大英雄天團』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

For Big Hero 6 (大英雄天團), frequent positive words include 英雄 (hero), 溫暖 (warmth), 溫馨 (heartwarming), and 分享 (sharing); frequent negative words include 反派 (villain), 復仇 (revenge), 戰鬥 (fighting), and 失去 (loss).
pixar_posts_select_toy <- pixar_posts %>%
filter(str_detect(pixar_posts$artTitle, "玩具總動員4") | str_detect(pixar_posts$artTitle, "toy4"))
# Count term frequencies
pixar_posts_tokens_toy <- pixar_posts_select_toy %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date_toy <-pixar_posts_tokens_toy %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count daily totals of positive and negative words
sentiment_count_toy = pixar_tokens_by_date_toy %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
pixar_posts_tokens_toy %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『玩具總動員4』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

For Toy Story 4 (玩具總動員4), frequent positive words include 自由 (freedom), 朋友 (friends), 重要 (important), and 冒險 (adventure); frequent negative words include 失去 (loss), 放下 (letting go), and 反派 (villain).
pixar_posts_select_brain <- pixar_posts %>%
filter(str_detect(pixar_posts$artTitle, "腦筋急轉彎") | str_detect(pixar_posts$artTitle, "inside out"))
# Count term frequencies
pixar_posts_tokens_brain <- pixar_posts_select_brain %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
pixar_tokens_by_date_brain <-pixar_posts_tokens_brain %>%
count(artDate, word, sort = TRUE) %>%
filter(n > 5)
# Sentiment analysis with the LIWC dictionary
## Count daily totals of positive and negative words
sentiment_count_brain = pixar_tokens_by_date_brain %>%
select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
pixar_posts_tokens_brain %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "討論『腦筋急轉彎』文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

For Inside Out (腦筋急轉彎), frequent positive words include 希望 (hope), 冒險 (adventure), and 相信 (trust); frequent negative words include 憂傷 (sorrow), 難過 (sadness), and 控制 (control).
# Convert the tokens into a Document-Term Matrix (DTM)
#all_posts_tokens <- all_posts %>%
#unnest_tokens(word, sentence, token=chi_tokenizer) %>%
#filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
#count(artUrl, word) %>%
#rename(count=n)
#all_posts_dtm <- all_posts_tokens %>% cast_dtm(artUrl, word, count)
#inspect(all_posts_dtm[1:10,1:10])

# Fit LDA models with several topic counts and cache the results
#ldas_allPosts = c()
#topics = c(3, 5, 10, 25, 36)
#for(topic in topics){
#start_time <- Sys.time()
#lda_allPosts <- LDA(all_posts_dtm, k = topic, control = list(seed = 2020))
#ldas_allPosts = c(ldas_allPosts, lda_allPosts)
#print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
#save(ldas_allPosts, file = "ldas_result_allPosts")
#}

# Compare perplexity across topic counts
#topics = c(3, 5, 10, 25, 36)
#data_frame(k = topics,
# perplex = map_dbl(ldas_allPosts, topicmodels::perplexity)) %>%
#ggplot(aes(k, perplex)) +
#geom_point() +
#geom_line() +
#labs(title = "Evaluating LDA topic models",
#subtitle = "Optimal number of topics (smaller is better)",
# x = "Number of topics",
#y = "Perplexity")

Using perplexity to compare topic counts, we settle on 10 topics for the rest of the analysis.
new_lda_allPosts = ldas_allPosts[[3]] ## the k = 10 model (third entry of topics = c(3, 5, 10, 25, 36))
topics_allPosts <- tidy(new_lda_allPosts, matrix = "beta") # use "beta" in tidy() to extract the phi matrix
remove_words = c("冰雪奇緣", "迪士尼", "艾莎", "安娜", "無敵破壞王", "電影", "劇情", "動畫", "這部", "角色", "公主", "我們", "網路", "世界")
top_terms_allPosts <- topics_allPosts %>%
filter(!term %in% remove_words)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_allPosts %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()+
theme(text = element_text(family='STHeitiTC-Light'))

Topic 1 - Pixar (皮克斯)
Topic 2 - Toy Story (玩具總動員)
Topic 3 - Disney (迪士尼)
Topic 4 - Frozen (冰雪奇緣)
Topic 5 - Inside Out (腦筋急轉彎)
Topic 6 - Box office (票房)
Topic 7 - Coco (可可夜總會)
Topic 8 - Big Hero 6 (大英雄天團)
Topic 9 - Zootopia (動物方程式)
Topic 10 - Wreck-It Ralph (無敵破壞王)

We select Topic 1 (Pixar), Topic 3 (Disney), and Topic 6 (box office) as the main topics for further discussion.
# Topic labels
topic_name = c('皮克斯', '玩具總動員', '迪士尼', '冰雪奇緣', '腦筋急轉彎', '票房', '可可夜總會', '大英雄天團', '動物方程式', '無敵破壞王')

tmResult <- posterior(new_lda_allPosts)
doc_pro <- tmResult$topics
#dim(doc_pro)
allPosts_data <- all_posts %>%
select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics <- doc_pro[allPosts_data$artUrl,]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
news_topic = cbind(allPosts_data, document_topics_df)
#news_topic %>% head(10)

# Topic proportions by year
news_topic %>%
group_by(artDate = format(artDate,'%Y')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))

Pixar's share of discussion runs slightly higher than Disney's, and the box-office (票房) topic gains discussion share over time.
# The definition of all_poster is not shown in the original; presumably,
# mirroring the movie-board chunk further below:
all_poster <- c(all_posts$artPoster, all_reviews$cmtPoster)
# Total number of participants (presumably length(unique(all_poster))):
## [1] 19977
# Collect every user who appears;
# label users who have posted as "poster"
# and everyone else as "replyer"
userList <- data.frame(user=unique(all_poster)) %>%
mutate(type=ifelse(user%in%all_posts$artPoster, "poster", "replyer"))

## Build the social network graph
postsReviews <- merge(x = all_posts, y = all_reviews, by = "artUrl")
#postsReviews
# Keep three columns: cmtPoster (commenter), artPoster (author), artUrl (article URL)
link <- postsReviews %>%
select(cmtPoster, artPoster, artUrl)
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
# Draw the network graph
#plot(reviewNetwork)
# Shrink the nodes and edges and hide user labels
#plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)

# Visualize LDA topics on the network
new_lda_allPosts5 = ldas_allPosts[[3]]
topics_allPosts5 <- tidy(new_lda_allPosts5, matrix = "beta")
# Assign each article its dominant LDA topic
animate_topics <- tidy(new_lda_allPosts5, matrix="gamma") %>% # use "gamma" in tidy() to extract the theta matrix
group_by(document) %>%
top_n(1, wt=gamma)
# Join article metadata with topic assignments
postsReviews <- merge(x = postsReviews, y = animate_topics, by.x = "artUrl", by.y="document")
# Keep articles after 2014-01-01,
# commenters with more than 15 comments on a single article,
# and articles classified as topic 1 (Pixar) or topic 3 (Disney);
# keep only cmtPoster (commenter), artPoster (author), artUrl, and topic
link <- postsReviews %>%
filter(artDate > as.Date('2014-01-01')) %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>15) %>%
ungroup() %>%
filter(topic == 1 | topic == 3 ) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))

# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color edges by the topic of the article being commented on
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred")
# Draw the network
set.seed(5432)
plot(reviewNetwork, vertex.size=1, edge.width=1, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 2, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("皮克斯","迪士尼"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)

"filmwalker" most often joins Pixar (皮克斯) threads, while "AACKball", "xslayer", and "YanbinCao" most often join Disney (迪士尼) threads.
Next we examine these opinion leaders' posts and the comments they received.
disney_leader <- c("AACKball", "xslayer", "YanbinCao")
pixar_leader <- c("filmwalker", "mysmalllamb")
leader_posts <- all_posts %>%
filter(all_posts$artPoster %in% disney_leader)
leader_posts
## # A tibble: 4 x 10
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [普好雷]冰雪… 2019-11-22 22:07:32 https… xslayer movie 208 40 2
## 2 [討論]冰雪奇… 2019-11-30 23:15:47 https… AACKball movie 174 99 0
## 3 [好普負雷]冰… 2019-12-03 00:43:36 https… YanbinCao movie 500 100 2
## 4 Re:[好雷]… 2019-12-04 20:01:40 https… YanbinCao movie 371 63 15
## # … with 1 more variable: sentence <chr>
## # A tibble: 742 x 4
## artUrl cmtPoster cmtStatus cmtContent
## <chr> <chr> <chr> <chr>
## 1 https://www.ptt.cc/bbs/movi… AACKball 推 :歌是中文翻唱,中配的缺點是我覺得艾莎有點顯老,…
## 2 https://www.ptt.cc/bbs/movi… AACKball → :裡面的歌翻中文唱得還不錯,不過歌手唱腔畢竟不同,…
## 3 https://www.ptt.cc/bbs/movi… AACKball → :要看個人喜好了。
## 4 https://www.ptt.cc/bbs/movi… AACKball 推 :我沒看過冰雪一的中文也無從比較。不過我覺得二是…
## 5 https://www.ptt.cc/bbs/movi… AACKball → :不錯啦,當然也有可能是因為我前幾天才看過重甲機…
## 6 https://www.ptt.cc/bbs/movi… AACKball → :神......
## 7 https://www.ptt.cc/bbs/movi… AACKball 推 :艾莎都被困了,還有心情慢慢洩洪嗎XD…
## 8 https://www.ptt.cc/bbs/movi… AACKball 推 :艾莎進入洞窟深處發現水壩的真相,但是陷入迷惘,因…
## 9 https://www.ptt.cc/bbs/movi… AACKball → :為兩邊都是重要的人,她不知道要如何解決而被困在回…
## 10 https://www.ptt.cc/bbs/movi… AACKball → :憶裡,周圍也呼應了雪寶一直在提的「水有記憶」。…
## # … with 732 more rows
# Tokens from the Disney leaders' posts. The definition of
# disney_leader_posts_tokens is not shown in the original; presumably,
# mirroring the Pixar chunk below:
disney_leader_posts_tokens <- all_posts_tokens %>%
filter(all_posts_tokens$artPoster %in% disney_leader)
# Term frequencies
disney_leader_posts_tokens_count <- disney_leader_posts_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
# Word cloud (commented out)
#disney_leader_posts_tokens_count %>%
#filter(!(disney_leader_posts_tokens_count$word %in% disney_clean_words)) %>%
#filter(sum > 10) %>%
#wordcloud2()

The Disney opinion leaders' posts center on Frozen's characters and plot.
pixar_leader_posts_tokens <- all_posts_tokens %>%
filter(all_posts_tokens$artPoster %in% pixar_leader)
# Term frequencies
pixar_leader_posts_tokens_count <- pixar_leader_posts_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
# Word cloud (commented out)
#pixar_leader_posts_tokens_count %>%
#filter(!(pixar_leader_posts_tokens_count$word %in% pixar_clean_words)) %>%
#filter(sum > 10) %>%
#wordcloud2()

The more heavily discussed Pixar posts touch on life, education, adventure, and other themes close to everyday experience.
disney_leader_sentiment_count = disney_leader_posts_tokens_count %>%
select(word, sum) %>%
inner_join(LIWC) %>%
group_by(sentiment) %>%
summarise(count=sum(sum))
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## sentiment count
## <fct> <int>
## 1 positive 12
## 2 negative 26

The Disney opinion leaders' posts contain noticeably more negative terms than positive ones.
disney_leader_posts_tokens %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "迪士尼文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

Positive words include 希望 (hope), 拯救 (rescue), 願意 (willingness), and 順利 (going smoothly); negative words include 自我 (ego), 恐懼 (fear), 壓抑 (repression), and 害怕 (afraid).
pixar_leader_sentiment_count = pixar_leader_posts_tokens_count %>%
select(word, sum) %>%
inner_join(LIWC) %>%
group_by(sentiment) %>%
summarise(count=sum(sum))
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## sentiment count
## <fct> <int>
## 1 positive 49
## 2 negative 13

The Pixar opinion leaders' posts contain noticeably more positive terms than negative ones.
pixar_leader_posts_tokens %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "皮克斯意見領袖文章的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

Positive words include 冒險 (adventure), 鼓勵 (encouragement), 分享 (sharing), and 渴望 (longing); negative words include 問題 (problems), 恐懼 (fear), 焦慮 (anxiety), and 討厭 (dislike).
disney_leader_article_url <- disney_leader_posts_tokens %>%
group_by(disney_leader_posts_tokens$artUrl) %>%
select(artUrl)
## Adding missing grouping variables: `disney_leader_posts_tokens$artUrl`
disney_leader_article_url <- unique(disney_leader_article_url)
pixar_leader_article_url <- pixar_leader_posts_tokens %>%
group_by(pixar_leader_posts_tokens$artUrl) %>%
select(artUrl)
## Adding missing grouping variables: `pixar_leader_posts_tokens$artUrl`
## Joining, by = "artUrl"
## # A tibble: 10 x 5
## artUrl cmtPoster cmtStatus cmtContent `disney_leader_posts_…
## <chr> <chr> <chr> <chr> <chr>
## 1 https://www.pt… sunlockfi… 推 :你真的知道政治正確是什麼意思嗎… https://www.ptt.cc/bb…
## 2 https://www.pt… xslayer → :就是國王要配王后啊?不然咧?… https://www.ptt.cc/bb…
## 3 https://www.pt… yehsongyo… → :換個角度想,就是Anna成熟到… https://www.ptt.cc/bb…
## 4 https://www.pt… yehsongyo… → :Elsa也能放心當第五靈成為人… https://www.ptt.cc/bb…
## 5 https://www.pt… leoheart1… 噓 :我只覺得Elsa把自己不想背的… https://www.ptt.cc/bb…
## 6 https://www.pt… leoheart1… → :。這個結局讓我暴怒orz… https://www.ptt.cc/bb…
## 7 https://www.pt… valentian 推 :其實就只是想撈第3集的鋪陳Or… https://www.ptt.cc/bb…
## 8 https://www.pt… leoheart1… 推 :剛按到噓,補血orzzz… https://www.ptt.cc/bb…
## 9 https://www.pt… valentian → :女王沒有孩子也OK。伊莉莎白也… https://www.ptt.cc/bb…
## 10 https://www.pt… valentian → :也可繼位,不覺得是國王王后的窠… https://www.ptt.cc/bb…
The comments on the Disney opinion leaders' posts mostly debate the ending of Frozen (冰雪奇緣).
## Joining, by = "artUrl"
## # A tibble: 10 x 5
## artUrl cmtPoster cmtStatus cmtContent `pixar_leader_posts_t…
## <chr> <chr> <chr> <chr> <chr>
## 1 https://www.pt… minoru04 → :這樣有需要發新聞片稿費喔… https://www.ptt.cc/bb…
## 2 https://www.pt… s87269x 推 :藏成這樣要發現也太難了… https://www.ptt.cc/bb…
## 3 https://www.pt… toto3527 推 :這部老實說還好耶劇情太普通太好… https://www.ptt.cc/bb…
## 4 https://www.pt… sign0 推 :這是'大家來找碴'吧瞬間晃過去… https://www.ptt.cc/bb…
## 5 https://www.pt… kikielle 推 :為什麼大家都喜歡劇情難猜的?… https://www.ptt.cc/bb…
## 6 https://www.pt… faang 推 :劇情太複雜,小朋友在戲院又要跟… https://www.ptt.cc/bb…
## 7 https://www.pt… torukumato 推 :要動腦又要BOSS猜一猜請看柯… https://www.ptt.cc/bb…
## 8 https://www.pt… edwar 推 :娛樂片當推理片在看,比如冰雪的… https://www.ptt.cc/bb…
## 9 https://www.pt… stevexbuc… 推 :是有多好猜 https://www.ptt.cc/bb…
## 10 https://www.pt… aku192 推 :劇情太好猜的話電影就會變的很無… https://www.ptt.cc/bb…
The comments on the Pixar opinion leaders' posts mostly debate whether the films' themes are suitable for children.
# disney_leader_reviews is not defined in the shown code; presumably it is
# all_reviews restricted to disney_leader_article_url (hence the join message above)
disney_leader_reviews_tokens <- disney_leader_reviews %>%
unnest_tokens(word, cmtContent, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
disney_leader_reviews_tokens$word <- all_synonym(disney_leader_reviews_tokens$word)
#disney_leader_reviews_tokens

# Term frequencies
disney_leader_reviews_tokens_count <- disney_leader_reviews_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
# Word cloud (commented out)
#disney_leader_reviews_tokens_count %>%
#filter(!(disney_leader_reviews_tokens_count$word %in% disney_clean_words)) %>%
#filter(sum > 10) %>%
#wordcloud2()

Comments on the Disney leaders' posts again mostly discuss Frozen (冰雪奇緣).
disney_leader_reviews_tokens %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "迪士尼意見領袖發文留言內容的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

Positive words include 喜歡 (like), 同意 (agree), 無憂無慮 (carefree), and 信任 (trust); negative words include 問題 (problems), 不好 (bad), 指責 (blame), and 批評 (criticism).
# pixar_leader_reviews is likewise not defined in the shown code; presumably
# all_reviews restricted to pixar_leader_article_url
pixar_leader_reviews_tokens <- pixar_leader_reviews %>%
unnest_tokens(word, cmtContent, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
pixar_leader_reviews_tokens$word <- all_synonym(pixar_leader_reviews_tokens$word)
pixar_leader_reviews_tokens
## # A tibble: 642 x 5
## artUrl cmtPoster cmtStatus `pixar_leader_posts_tokens$… word
## <chr> <chr> <chr> <chr> <chr>
## 1 https://www.ptt.cc/bb… minoru04 → https://www.ptt.cc/bbs/movi… 需要
## 2 https://www.ptt.cc/bb… minoru04 → https://www.ptt.cc/bbs/movi… 新聞片…
## 3 https://www.ptt.cc/bb… minoru04 → https://www.ptt.cc/bbs/movi… 稿費
## 4 https://www.ptt.cc/bb… s87269x 推 https://www.ptt.cc/bbs/movi… 藏成
## 5 https://www.ptt.cc/bb… s87269x 推 https://www.ptt.cc/bbs/movi… 發現
## 6 https://www.ptt.cc/bb… s87269x 推 https://www.ptt.cc/bbs/movi… 太難
## 7 https://www.ptt.cc/bb… toto3527 推 https://www.ptt.cc/bbs/movi… 實說
## 8 https://www.ptt.cc/bb… toto3527 推 https://www.ptt.cc/bbs/movi… 還好
## 9 https://www.ptt.cc/bb… toto3527 推 https://www.ptt.cc/bbs/movi… 劇情
## 10 https://www.ptt.cc/bb… toto3527 推 https://www.ptt.cc/bbs/movi… 太普通…
## # … with 632 more rows
# Term frequencies
pixar_leader_reviews_tokens_count <- pixar_leader_reviews_tokens %>%
filter(nchar(.$word)>1) %>% # drop single-character tokens
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 1) %>% # drop rare terms
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
# Word cloud (commented out)
#pixar_leader_reviews_tokens_count %>%
#filter(!(pixar_leader_reviews_tokens_count$word %in% pixar_clean_words)) %>%
#filter(sum > 1) %>%
#wordcloud2()

The word cloud features a surprising number of vegetables, such as 花椰菜 (broccoli) and 苦瓜 (bitter melon). In Inside Out, Pixar localized certain shots for different markets: the broccoli Riley hates becomes green peppers in the Japanese release. The thread about this drew attention, with users chiming in about their own most-hated foods, which is an amusing phenomenon.
pixar_leader_reviews_tokens %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "皮克斯意見領袖發文留言內容的正負面詞",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)

The comment sentiment reveals an interesting pattern: the posts lean positive while the comments lean negative.
# PTT comments come in three kinds: push (推), boo (噓), and neutral arrows (→);
# we only care about push/boo, so drop the arrows
link <- postsReviews %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>5) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color edges by comment status (push vs. boo)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# Draw the network
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 7, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)

Discussion of both Disney (迪士尼) and Pixar (皮克斯) is dominated by push (推) comments, indicating broadly positive reception of both studios' work.
postsReviews %>%
filter( cmtPoster %in% c("S890127","Punisher","chirex","Howard61313","lovemelissa","KingKingCold")) -> test
test %>% select(-artUrl,-artDate,-artTime,-artCat,-sentence) -> test
test %>% head()
## artTitle artPoster commentNum push boo
## 1 [選片]柯南、怪獸大學還是白宮末日 anif 31 20 0
## 2 [分享]《天外奇蹟》電影真正令人感動的是故事 originbook 49 33 2
## 3 Fw:[新聞]迪士尼連續4年海外票房破20億 ooic 8 5 0
## 4 Fw:[新聞]迪士尼連續4年海外票房破20億 ooic 8 5 0
## 5 Fw:[新聞]迪士尼連續4年海外票房破20億 ooic 8 5 0
## 6 [討論]迪士尼將推出「庫伊拉」真人電影 beckseaton 17 14 0
## cmtPoster cmtStatus cmtContent topic
## 1 S890127 推 :今年的柯南在粉絲間評價明明比前幾年都好... 7
## 2 S890127 噓 :是NEMO不是NIMO,天外奇蹟片頭不是倒敘法 5
## 3 S890127 推 :漫威+皮克斯,躺著賺啊 6
## 4 S890127 推 :飛機根本沒宣傳,我連預告片都沒在戲院看過 6
## 5 S890127 → :當初根本沒要上院線只打算出DVD的 6
## 6 S890127 推 :梅莉史翠普不錯 6
## gamma
## 1 0.8373285
## 2 0.9993013
## 3 0.9071989
## 4 0.9071989
## 5 0.9071989
## 6 0.9276916
# Within the linked subset
test %>% filter(cmtStatus == "推") %>% group_by(cmtPoster) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 2
## # Groups: cmtPoster [6]
## cmtPoster n
## <chr> <int>
## 1 Howard61313 315
## 2 KingKingCold 107
## 3 S890127 99
## 4 lovemelissa 89
## 5 Punisher 8
## 6 chirex 4
# Across all posts
postsReviews %>% filter(cmtStatus == "推")%>% group_by(cmtPoster) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 2
## # Groups: cmtPoster [6]
## cmtPoster n
## <chr> <int>
## 1 Howard61313 315
## 2 vvvvvvvvvvv 258
## 3 mysmalllamb 236
## 4 DemonElf 219
## 5 RapunzElsa 204
## 6 pattda 191
## artTitle artPoster commentNum push boo cmtPoster
## 1 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 2 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 3 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 4 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 5 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## 6 Re:[非常普雷]真的很普通的冰雪奇緣 bernon 854 89 29 Howard61313
## cmtStatus cmtContent topic gamma
## 1 推 :算了明明有篇更好的普雷文值得我們回應或討論 7 0.8729672
## 2 推 :奇怪,又不是每篇給冰雪普雷的都言之無物...我就看到 7 0.8729672
## 3 推 :最好是啦....一篇文章自己的質量還會受閱讀者的影響喔 7 0.8729672
## 4 推 :一篇文章有沒有道理,才是閱讀者方的主觀意識 7 0.8729672
## 5 推 :OK,我是主觀的不認同別人的意見,但是用到了'錯'這個 7 0.8729672
## 6 推 :我現在發現我沒雅量的地方了,他還沒 7 0.8729672
# Within the linked subset
test %>% filter(cmtStatus == "噓") %>% group_by(cmtPoster) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 4 x 2
## # Groups: cmtPoster [4]
## cmtPoster n
## <chr> <int>
## 1 KingKingCold 6
## 2 chirex 4
## 3 Howard61313 2
## 4 S890127 2
# Across all posts
postsReviews %>% filter(cmtStatus == "噓") %>% group_by(cmtPoster) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 2
## # Groups: cmtPoster [6]
## cmtPoster n
## <chr> <int>
## 1 darkMood 56
## 2 toyamaK52 35
## 3 Alertme 34
## 4 PttGod 28
## 5 callcallABC 25
## 6 dreamtime09 25
## artTitle artPoster commentNum push
## 1 [討論]動物方城市字幕改編團隊的回應 S890127 671 233
## 2 Re:[討論]迪士尼要跟康卡斯特拼了,提713億買福斯 avdcd 124 44
## 3 [新聞]小S首度偕3女兒為【海底總動員2】配音 c84925025 111 21
## 4 [新聞]小S首度偕3女兒為【海底總動員2】配音 c84925025 111 21
## 5 Re:[討論]動物方城市字幕改編團隊的回應 pingtung5566 98 22
## 6 [討論]《冰雪奇緣2》艾莎為何被…?/樂高海報 CYKONGG 87 20
## boo cmtPoster cmtStatus cmtContent
## 1 33 chirex 噓 :冰的啦我也覺得翻得有夠爛,原來是辛普森家庭的翻譯
## 2 1 KingKingCold 噓 :樓下快靠北萬惡米老鼠壟斷電影產業
## 3 47 KingKingCold 噓 :天龍人總動員
## 4 47 KingKingCold 噓 :天龍人總動員
## 5 12 chirex 噓 :我沒有笑,謝謝。在地化跟竄改是兩回事
## 6 7 chirex 噓 :被X是什麼意思?
## topic gamma
## 1 9 0.9994406
## 2 6 0.9988877
## 3 10 0.9992542
## 4 10 0.9992542
## 5 9 0.9983714
## 6 7 0.6837956
Pulling out the heavily linked users "S890127", "Punisher", "chirex", "Howard61313", "lovemelissa", and "KingKingCold": by push count the ordering is Howard61313 > KingKingCold > S890127 > lovemelissa, and their pushes mostly express agreement with, or discussion of, box office, songs, and plot.
Boos are far rarer (single digits), ordered KingKingCold > chirex > Howard61313 > S890127, and mostly express disagreement over behind-the-scenes work, translation, and dubbing. Boos do occur overall, but presumably because they are so outnumbered by pushes, or too scattered, they form only weak links in the network.
# All movie-board posts (all films, past year)
all_movie_posts <- read_csv("期末_movie版_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# All movie-board comments (all films, past year)
all_movie_reviews <- read_csv("期末_movie版_articleReviews.csv")
all_movie_posts$artDate <- all_movie_posts$artDate %>% as.Date("%Y/%m/%d")
all_movie_reviews$artDate <- all_movie_reviews$artDate %>% as.Date("%Y/%m/%d")

# Select the needed columns
all_movie_reviews <- all_movie_reviews %>% select(artUrl, cmtPoster, cmtStatus, cmtContent)
# All participants
all_movie_poster <- c(all_movie_posts$artPoster, all_movie_reviews$cmtPoster)
# Collect every user who appears;
# label users who have posted as "poster"
# and everyone else as "replyer"
userList_movie <- data.frame(user=unique(all_movie_poster)) %>%
mutate(type=ifelse(user%in%all_movie_posts$artPoster, "poster", "replyer"))

## Build the social network graph
# The full data set is too large; since the aim is to find users who are both
# strongly connected and popular around Disney/Pixar over the past year, we
# filter on (commentNum >= 400 & push >= 150 & boo >= 5)
all_movie_posts %>% filter(commentNum >= 400 & push >= 150 & boo >= 5) -> all_movie_posts_sel
postsReviews_movie <- merge(x = all_movie_posts_sel, y = all_movie_reviews, by = "artUrl")
#postsReviews
# Keep three columns: cmtPoster (commenter), artPoster (author), artUrl (article URL)
link_movie <- postsReviews_movie %>%
select(cmtPoster, artPoster, artUrl)
# Build the network
reviewNetwork_movie <- graph_from_data_frame(d=link_movie, directed=T)
# Draw the network graph
#plot(reviewNetwork_movie)
# Shrink the nodes and edges and hide user labels
#plot(reviewNetwork_movie, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)

# PTT comments come in three kinds: push (推), boo (噓), and neutral arrows (→);
# we only care about push/boo, so drop the arrows
link_movie <- postsReviews_movie %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter(n() > 5) %>%
ungroup() %>%
filter(commentNum >= 400 & push >= 150 & boo >= 5) %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>% unique()
# Keep only users who appear in link
filtered_user_movie <- userList_movie %>%
filter(user%in%link_movie$cmtPoster | user%in%link_movie$artPoster) %>%
arrange(desc(type))
# Build the network
reviewNetwork_movie <- graph_from_data_frame(d=link_movie, v=filtered_user_movie, directed=F)
# Color nodes by user role
labels <- degree(reviewNetwork_movie)
V(reviewNetwork_movie)$label <- names(labels)
V(reviewNetwork_movie)$color <- ifelse(V(reviewNetwork_movie)$type=="poster", "gold", "lightblue")
# Color edges by comment status (push vs. boo)
E(reviewNetwork_movie)$color <- ifelse(E(reviewNetwork_movie)$cmtStatus == "推", "lightgreen", "palevioletred")
# Draw the network
set.seed(5432)
plot(reviewNetwork_movie, vertex.size=2, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork_movie) > 7, V(reviewNetwork_movie)$label, NA),vertex.label.font=2)
# Add legends
par(family = "Heiti TC Light")
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)

Because the full data set is too large, and the aim is to find users who are strongly connected and popular around Disney and Pixar relative to the past year, we applied the filter (commentNum >= 400 & push >= 150 & boo >= 5) before plotting.
Over the past year, the users active in Disney and Pixar threads clearly posted and commented on individual films rather infrequently.
postsReviews_movie <- postsReviews_movie %>%
filter(cmtStatus!="→") %>%
filter(artPoster %in% c("S890127","Punisher","chirex","Howard61313","lovemelissa","KingKingCold")) %>% select(artPoster,artTitle,commentNum,push,boo,cmtPoster,cmtStatus,cmtContent) %>% arrange(desc(push))

A closer look shows that among these users, KingKingCold is the only one whose own posts draw substantial discussion.
postsReviews_movie <- postsReviews_movie %>%
filter(cmtStatus!="→") %>%
filter( cmtPoster %in% c("S890127","Punisher","chirex","Howard61313","lovemelissa","KingKingCold")) %>% select(cmtPoster,cmtStatus,cmtContent,artTitle,commentNum,push,boo,artPoster) %>% arrange(desc(push))

As commenters, Punisher, KingKingCold, and chirex appear only rarely, but they do still comment on posts about other films on the movie board.
# Add a column marking each post as disney or pixar
is_disney_posts <- disney_posts %>%
mutate(title = "disney")
is_pixar_posts <- pixar_posts %>%
mutate(title = "pixar")
is_posts <- rbind(is_disney_posts,is_pixar_posts)

tidy_is_disney <- is_posts %>%
unnest_tokens(word, sentence) %>%
group_by(word) %>%
anti_join(stop_words) %>%
filter(n() > 10) %>% # keep only words appearing more than 10 times
ungroup()
## Joining, by = "word"
# Cast into a document-term matrix
# (the lemma column is not created in the shown code; presumably the tokens
# were lemmatized earlier, e.g. with textstem)
dtm = tidy_is_disney %>%
count(artUrl,lemma) %>%
cast_dtm(artUrl,lemma,n)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 14/86
## Sparsity : 86%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 安 般 半 伴 包容 包裝 保
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 1 1 1 1 1 1 2
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0 0 0 0 1 0 0
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0 0 0 0 1 0 0
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0 0 0 0 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0 1 0 0 0 0 0
## Terms
## Docs 保護 保守 悲傷
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 1 1 1
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0 0 0
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0 1 0
## Warning in as.integer(rownames(dtm)) %in% disney: NAs introduced by coercion

## Compute per-term TF-IDF, treating each artUrl as one document
document_words <- tidy_is_disney %>% # term frequency per document
count(artUrl, lemma, sort = TRUE)
total_words <- document_words %>%
group_by(artUrl) %>%
summarize(total = sum(n)) %>%
right_join(document_words) %>%
mutate(is_disney = artUrl %in% disney) # `disney` (a vector of Disney article URLs) comes from code not shown
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "artUrl"
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 75/25
## Sparsity : 25%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 但 的
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.0064325021 0.003217620
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.0066720485 0.002387637
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.0037513920 0.002850147
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.0000000000 0.004230182
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.0026795657 0.004602722
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.0027970905 0.004527408
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.0018812290 0.003790692
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.0062831196 0.003528347
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.0019252427 0.004229160
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.0005395403 0.005168552
## Terms
## Docs 電影 昆
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.0050997145 0.04946648
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.0013224070 0.00000000
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.0044611788 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.0050763079 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.0000000000 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.0055438626 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.0014914462 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.0149438503 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.0007631702 0.00000000
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.0059884973 0.00000000
## Terms
## Docs 了 裡
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.003635134 0.0185995042
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.001099731 0.0067522525
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.001854988 0.0037964870
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.001266457 0.0025919791
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.004858301 0.0000000000
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.004149314 0.0000000000
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.004651149 0.0000000000
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.003106876 0.0063586482
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.002617982 0.0009741929
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.006669795 0.0043682084
## Terms
## Docs 他 與
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.015984792 0.015912509
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.007033967 0.003703065
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.023729296 0.004996960
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.000000000 0.006823158
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.005649832 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.002211612 0.014903214
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.000000000 0.005011700
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.000000000 0.004184646
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.009133540 0.006411194
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.004266054 0.005030780
## Terms
## Docs 在 car
## https://www.ptt.cc/bbs/movie/M.1150400175.A.4C5.html 0.006561669 0.043199324
## https://www.ptt.cc/bbs/movie/M.1153342211.A.612.html 0.002268675 0.003267255
## https://www.ptt.cc/bbs/movie/M.1185280676.A.50E.html 0.003826721 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186364093.A.CF1.html 0.003265776 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186383424.A.026.html 0.001822248 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186385537.A.227.html 0.003566571 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186427524.A.80D.html 0.002878507 0.005527347
## https://www.ptt.cc/bbs/movie/M.1186462552.A.27A.html 0.003204643 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186591324.A.436.html 0.003436829 0.000000000
## https://www.ptt.cc/bbs/movie/M.1186749012.A.CF4.html 0.006054119 0.000000000
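The data frames dtm and dtm.tfidf used for model training below are not constructed in the code shown. A sketch of what they presumably look like, built from document_words (the wide layout with an is_disney column is an assumption, as is the `disney` URL vector):

# A sketch (assumed, not from the original): wide document-term data frames
# with a 0/1 is_disney label, as consumed by sample.split() below
dtm <- document_words %>%
mutate(is_disney = as.numeric(artUrl %in% disney)) %>% # `disney`: Disney article URLs (assumed)
pivot_wider(names_from = lemma, values_from = n, values_fill = 0)

dtm.tfidf <- document_words %>%
bind_tf_idf(lemma, artUrl, n) %>% # tidytext TF-IDF weighting
mutate(is_disney = as.numeric(artUrl %in% disney)) %>%
select(artUrl, lemma, tf_idf, is_disney) %>%
pivot_wider(names_from = lemma, values_from = tf_idf, values_fill = 0)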
Split the data into training and test sets.
set.seed(123)
spl = sample.split(dtm$is_disney, 0.7) # 70/30 split, preserving the is_disney class ratio
TR = subset(dtm, spl == TRUE)
TS = subset(dtm, spl == FALSE)

set.seed(123)
spl = sample.split(dtm.tfidf$is_disney, 0.7)
TR.tfidf = subset(dtm.tfidf, spl == TRUE)
TS.tfidf = subset(dtm.tfidf, spl == FALSE)

# Fit an SVM with dtm.tfidf as input
svm.fit.tfidf = svm(is_disney~.,TR.tfidf,kernel = "linear",cost = 10,scale = F)
p.svm.tfidf = pred = predict(svm.fit.tfidf,TS.tfidf)
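The call that produced the confusion matrix below is not shown in the original; presumably caret's confusionMatrix on the held-out labels, along these lines (an assumption):

confusionMatrix(p.svm.tfidf, factor(TS.tfidf$is_disney), positive = "1") # presumed call, not in the original
## Confusion Matrix and Statistics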
##
## Reference
## Prediction 0 1
## 0 206 5
## 1 6 242
##
## Accuracy : 0.976
## 95% CI : (0.9575, 0.988)
## No Information Rate : 0.5381
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9518
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9798
## Specificity : 0.9717
## Pos Pred Value : 0.9758
## Neg Pred Value : 0.9763
## Prevalence : 0.5381
## Detection Rate : 0.5272
## Detection Prevalence : 0.5403
## Balanced Accuracy : 0.9757
##
## 'Positive' Class : 1
##
Accuracy: 0.976, Sensitivity: 0.9798, Specificity: 0.9717
Decision-tree visualization
With high-dimensional data such as these document-term features, no single feature cleanly separates the two classes, so the decision tree performs noticeably worse than the SVM; its advantage is that it is easy to interpret.
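The tree-fitting code itself is not included above; a minimal sketch of what it presumably looked like, using the loaded rpart, rpart.plot, and caret packages on the same tf-idf split (the exact call is an assumption):

# A sketch (assumed): fit and visualize a CART tree on the TF-IDF features
tree.fit.tfidf <- rpart(is_disney ~ ., TR.tfidf, method = "class")
rpart.plot(tree.fit.tfidf) # draw the fitted tree
p.tree.tfidf <- predict(tree.fit.tfidf, TS.tfidf, type = "class")
confusionMatrix(p.tree.tfidf, factor(TS.tfidf$is_disney), positive = "1")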
Accuracy: 0.9521, Sensitivity: 0.9838, Specificity: 0.9151
Beyond their distinct visual styles, the films Disney and Pixar release convey different messages, and even films from the same studio differ in theme. Discussions therefore range beyond the protagonists to the varied messages each film carries. It is also an interesting finding that discussions of Pixar tend to go deeper than those of Disney.