系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"

安裝需要的packages

packages = c("dplyr", "tidytext", "stringr", "wordcloud2","gutenbergr", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales',"data.tree", "NLP", "igraph")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(readr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(gutenbergr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(data.tree)

下載書籍

scarlet <- gutenberg_download(244)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
scarlet$text[1:32] = ""
scarlet <- scarlet %>% filter(text!="")
scarlet <- scarlet %>% 
  mutate(book = "A Study in Scarlet",
         part = cumsum(str_detect(.$text, regex("^PART (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\."))),
         chapter = cumsum(str_detect(.$text, regex("^CHAPTER (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\..*"))),
         linenumber = row_number())
  
hound <- gutenberg_download(2852) %>% filter(text!="")
hound <- hound %>% 
  mutate(book = "The Hound of the Baskervilles",
         chapter = cumsum(str_detect(.$text, regex("^Chapter [0-9][0-9]?\\..*"))),
         linenumber = row_number())

sign <- gutenberg_download(2097) %>% filter(text!="")
sign <- sign %>% 
  mutate(book = "The Sign of the Four",
         chapter = cumsum(str_detect(.$text, regex("^Chapter (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"))),
         linenumber = row_number())

valley <- gutenberg_download(3289) %>% filter(text!="")
valley <- valley %>% 
  mutate(book = "The Valley of Fear",
         part = cumsum(str_detect(.$text, regex("^Part [0-9].*"))),
         chapter = cumsum(str_detect(.$text, regex("^Chapter [0-9].*"))),
         linenumber = row_number())

斷詞

tokens_scar <- scarlet %>% unnest_tokens(word, text)
tokens_hound <- hound %>% unnest_tokens(word, text)
tokens_sign <- sign %>% unnest_tokens(word, text)
tokens_valley <- valley %>% unnest_tokens(word, text)
tokens_all <- bind_rows(tokens_scar, tokens_hound, tokens_sign, tokens_valley)

用Bing詞庫分析情緒

scar_bing <- tokens_scar %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
hound_bing <- tokens_hound %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
sign_bing <- tokens_sign %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
valley_bing <- tokens_valley %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
book_bing <- bind_rows(scar_bing, hound_bing, sign_bing, valley_bing) %>% 
  mutate(method = "Bing")
Bing_compare <- ggplot(book_bing, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(title = "Bing", x = NULL, y =NULL)
Bing_compare

此四篇皆偏向負向分數,尤以血字的研究與巴斯克維爾的獵犬十分明顯。

由於前項結果的發現,因此統計出各篇負向詞出現的前十名,試圖解釋負向分數高的原因

scar_tokens_count <- tokens_scar %>%
  inner_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>%
  count(book, word, sort = TRUE) %>% 
  head(., 10)
## Joining, by = "word"
hound_tokens_count <- tokens_hound %>%
  inner_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>%
  count(book, word, sort = TRUE) %>% 
  head(., 10)
## Joining, by = "word"
sign_tokens_count <- tokens_sign %>%
  inner_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>%
  count(book, word, sort = TRUE) %>% 
  head(., 10)
## Joining, by = "word"
valley_tokens_count <- tokens_valley %>% 
  inner_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>% 
  count(book, word, sort = TRUE) %>% 
  head(., 10)
## Joining, by = "word"
all_tokens_count <- bind_rows(scar_tokens_count, hound_tokens_count, sign_tokens_count, valley_tokens_count)
all_tokens_count %>% 
  ggplot(aes(reorder(word, n), n, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_y", ncol = 2) +
  geom_text(aes(label=n, hjust = 1.1))+
  labs(x = "字詞", y = NULL) +
  #scale_fill_manual(values = c("gray60", "gray60", "gray60", "gray60")) +
  theme(text=element_text(size=12)) +
  theme(text = element_text(family = "Heiti TC Light")) +
  coord_flip()

結果中可看出death(死), dead(死), murder(謀殺), fear(恐懼), dark(黑暗), danger(危險)等字出現多次,符合偵探小說的特性,因此導致通篇小說的分數偏向負向。

用AFINN詞庫分析情緒

scar_afinn <- tokens_scar %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN", book = "A Study in Scarlet")
## Joining, by = "word"
hound_afinn <- tokens_hound %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN", book = "The Hound of the Baskervilles")
## Joining, by = "word"
sign_afinn <- tokens_sign %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN", book = "The Sign of the Four")
## Joining, by = "word"
valley_afinn <- tokens_valley %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN", book = "The Valley of Fear")
## Joining, by = "word"
book_afinn <- bind_rows(scar_afinn, hound_afinn, sign_afinn, valley_afinn)
ggplot(book_afinn, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(title = "AFINN", x = NULL, y = NULL)

結果血字的研究負向十分明顯,巴斯克維爾的獵犬、四個簽名、恐怖谷傾向負向。

用NRC詞庫分析情緒

scar_nrc <- tokens_scar %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
hound_nrc <- tokens_hound %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
sign_nrc <- tokens_sign %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
valley_nrc <- tokens_valley %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
book_nrc <- bind_rows(scar_nrc, hound_nrc, sign_nrc, valley_nrc) %>% 
  mutate(method = "NRC")
ggplot(book_nrc, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs("NRC", x = NULL, y = NULL)

血字的研究、四個簽名、恐怖谷分數皆轉正,巴斯克維爾的獵犬末段負向明顯。

四本書與三種詞庫分析出來的總結果

book_all <- bind_rows(book_bing, book_afinn, book_nrc)
book_all %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_grid(method~book, scales = "free_x") +
  labs(x = NULL, y = NULL) 

可明顯看出 NRC詞庫 的正向明顯較其他兩個詞庫高。

分析各詞庫差異

get_sentiments("afinn") %>% 
  summarise(n = sum(score))
get_sentiments("bing") %>% 
  count(sentiment)
get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)

正負面字詞文字雲

tokens_scar %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

每本小說負面字詞比例最高的章節

wordcounts_chapter <- tokens_all %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tokens_all %>% 
  semi_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>% 
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts_chapter, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup() 
## Joining, by = "word"
## Selecting by ratio

可看出各本小說負面詞比例最高的章節,可以推斷該章節可能常出現死亡或恐怖場景。

查看負面詞比例最高的章節,所出現的負面詞最高前兩個

tokens_all %>% 
  semi_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>% 
  filter((book == "A Study in Scarlet" & chapter == 12) | 
           (book == "The Hound of the Baskervilles" & chapter == 12) |
           (book == "The Sign of the Four" & chapter == 3) |
           (book == "The Valley of Fear" & chapter == 6)) %>%
  group_by(book, word) %>% 
  summarise(count = n()) %>% 
  top_n(2)
## Joining, by = "word"
## Selecting by count

分析整篇內容出現正負面詞的比例,與正面減除負面比例

wordcounts_all <- tokens_all %>%
  group_by(book) %>%
  summarize(words = n())

all_ratio <- tokens_all %>% 
  inner_join(get_sentiments("bing")) %>% 
  group_by(book, sentiment) %>%
  summarize(count = n()) %>%
  spread(sentiment, count, fill = 0) %>%
  ungroup() %>% 
  left_join(wordcounts_all, by = c("book")) %>%
  mutate(positive_ratio = positive/words, negative_ratio = negative/words) %>%
  mutate(sentiment_ratio = positive_ratio - negative_ratio)
## Joining, by = "word"
all_ratio

將此圖以圖形化呈現

all_ratio %>% 
  select(book, positive_ratio, negative_ratio, sentiment_ratio) %>% 
  gather(key = class, value = ratio, 2:4) %>% 
  ggplot(aes(x=book, y=ratio, fill = class)) + 
  geom_bar(position=position_dodge(width = 1), stat="identity") +
  theme(text = element_text(size=10), axis.text.x = element_text(vjust = 0.5, hjust = 0.5, angle = 15))

將此圖與前述的內容情緒走向一同分析,可發現結果吻合,巴斯克維爾的獵犬負向最明顯。

Bing_compare

# book_all %>% filter(.$book == "A Study in Scarlet") %>% 
#   ggplot(aes(index, sentiment, fill= method)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~method, ncol = 1, scales = "free") +
#   labs(x = NULL, y = NULL) +
#   ggtitle("A Study in Scarlet")
# 
# book_all %>% filter(.$book == "The Hound of the Baskervilles") %>% 
#   ggplot(aes(index, sentiment, fill= method)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~method, ncol = 1, scales = "free") +
#   labs(x = NULL, y = NULL) +
#   ggtitle("The Hound of the Baskervilles")
# 
# book_all %>% filter(.$book == "The Sign of the Four") %>% 
#   ggplot(aes(index, sentiment, fill= method)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~method, ncol = 1, scales = "free") +
#   labs(x = NULL, y = NULL) +
#   ggtitle("The Sign of the Four")
# 
# book_all %>% filter(.$book == "The Valley of Fear") %>% 
#   ggplot(aes(index, sentiment, fill= method)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~method, ncol = 1, scales = "free") +
#   labs(x = NULL, y = NULL) +
#   ggtitle("The Valley of Fear")