Among seasoned detective-fiction fans, Agatha Christie's standing is unshakable, yet when the general public thinks of detective stories, the names that come to mind are usually "Sherlock Holmes" or even "Arsène Lupin".
We therefore use text-analysis techniques to explore whether this gap in popularity stems from differences in the authors' writing styles or plot construction.
Before the analysis, we summarize the authors' styles based on our past reading experience:
1. Arthur Conan Doyle: excels at and focuses on characterization; his depictions of crime scenes tend to be bloody and violent.
2. Maurice Leblanc: leans toward an adventurous, chivalrous, and playful style; the overall mood is lighthearted.
3. Agatha Christie: focuses on plot construction with fine-grained narration, laying the case's clues openly before the reader, who pieces the truth together like a jigsaw puzzle.
We expect the analysis results to reflect these impressions.
From Project Gutenberg, we pick one work by each of the three authors for analysis (see the ID-lookup sketch after the list):
1. Arthur Conan Doyle:
Title: The Adventures of Sherlock Holmes by Arthur Conan Doyle
Contents: 12 cases, 12 chapters
2. Maurice Leblanc:
Title: The Extraordinary Adventures of Arsene Lupin, Gentleman-Burglar by Maurice Leblanc
Contents: 9 cases, 9 chapters
3. Agatha Christie:
Title: The Mysterious Affair at Styles by Agatha Christie
Contents: 1 case, 13 chapters
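The Gutenberg IDs used below (1661, 6133, 863) can be located through gutenbergr's metadata table; a small sketch, where the author strings follow Gutenberg's "Last, First" convention:

# Optional: look up the Gutenberg IDs via the gutenbergr metadata table
library(gutenbergr)
gutenberg_works(author == "Doyle, Arthur Conan")  # includes 1661, The Adventures of Sherlock Holmes
gutenberg_works(author == "Leblanc, Maurice")     # includes 6133, The Extraordinary Adventures of Arsene Lupin
gutenberg_works(author == "Christie, Agatha")     # includes 863, The Mysterious Affair at Styles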
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales","widyr","igraph", "ggraph","xml2", "httr", "jsonlite")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(readr)
library(tidyr)
library(dplyr)
library(stringr)
library(tidytext)
library(gutenbergr)
library(ggplot2)
library(scales)
library(reshape2)   # acast() for the comparison clouds
library(wordcloud)  # comparison.cloud()
library(wordcloud2)
library(widyr)
library(igraph)
library(ggraph)
library(magrittr)
library(httr)
library(jsonlite)
# The Adventures of Sherlock Holmes, by Arthur Conan Doyle
Sherlock = gutenberg_download(1661)
# Prefix a space to line 16 (presumably a stray heading match) so the chapter regex below
# does not count it, then strip the "ADVENTURE " prefix so headings start with the Roman numeral
Sherlock$text[16] = paste0(" ", Sherlock$text[16])
Sherlock$text = sub(pattern = 'ADVENTURE ', replacement = '', x = Sherlock$text)
tidy_Sherlock <- Sherlock %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^[ivx]+[.][:space:]", ignore_case = TRUE))),
author = "Conan Doyle")
# The Extraordinary Adventures of Arsene Lupin, Gentleman-Burglar, by Maurice Leblanc
LUPIN = gutenberg_download(6133)
tidy_LUPIN <- LUPIN %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^[ivx]+[.][:space:]", ignore_case = TRUE))),
author = "Maurice Leblanc")
# The Mysterious Affair at Styles, by Agatha Christie
Mysterious = gutenberg_download(863)
tidy_Mysterious <- Mysterious%>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",ignore_case = TRUE))),
author = "Agatha Christie")custom_stop_words <- bind_rows(data_frame(word = c("miss" , "sir" , "john", "mother" , "de"),
lexicon= c("customer" , "customer" , "customer" , "customer" , "customer")),
stop_words)## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
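The tokenized, stop-word-filtered versions of the three books (tidy_Sherlock2, tidy_LUPIN2, tidy_Mysterious2) are built in a chunk that is not shown; the four `Joining, by = "word"` messages in the original output suggest unnest_tokens() followed by an anti_join() against custom_stop_words. A minimal sketch under that assumption:

# Assumed construction of the *_2 data frames (not shown in the original chunk)
tidy_Sherlock2 <- tidy_Sherlock %>%
  unnest_tokens(word, text) %>%
  anti_join(custom_stop_words, by = "word")
tidy_LUPIN2 <- tidy_LUPIN %>%
  unnest_tokens(word, text) %>%
  anti_join(custom_stop_words, by = "word")
tidy_Mysterious2 <- tidy_Mysterious %>%
  unnest_tokens(word, text) %>%
  anti_join(custom_stop_words, by = "word")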
In all three books, the most frequent words are character names.
tidy_Sherlock2 %>%
count(word, sort = TRUE) %>%
filter(n > 100) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()

tidy_LUPIN2 %>%
count(word, sort = TRUE) %>%
filter(n > 60) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()

tidy_Mysterious2 %>%
count(word, sort = TRUE) %>%
filter(n > 100) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()

books <- bind_rows(tidy_Sherlock2, tidy_LUPIN2, tidy_Mysterious2)
frequency <- bind_rows(mutate(tidy_Sherlock2, book = "Sherlock"),
mutate(tidy_LUPIN2, book = "LUPIN"),
mutate(tidy_Mysterious2, book = "Mysterious")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(book, word) %>%
group_by(book) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(book, proportion) %>%
gather(book, proportion, `LUPIN`:`Mysterious`)

In the plots below, the dashed diagonal marks words the two books share at similar frequencies; the farther up a word sits, the more frequent it is. Words leaning left are relatively more frequent in the Sherlock collection, words leaning right in the other book.
# Comparing the word frequencies of Sherlock, LUPIN, and Mysterious
ggplot(frequency, aes(x = proportion, y = `Sherlock`, color = abs(`Sherlock` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~book, ncol = 2) +
theme(legend.position="none") +
labs(y = "Sherlock", x = NULL)## Warning: Removed 17132 rows containing missing values (geom_point).
## Warning: Removed 17134 rows containing missing values (geom_text).
We use three sentiment lexicons (AFINN, Bing, NRC) to analyze each book's sentiment chapter by chapter.
afinn <- books %>%
inner_join(get_sentiments("afinn")) %>%
group_by(author, chapter) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")## Joining, by = "word"
bing_and_nrc <- bind_rows(books %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
books %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive","negative"))) %>%
mutate(method = "NRC")) %>%
count(method,author, chapter, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
bind_rows(afinn, bing_and_nrc) %>%
ggplot(aes(chapter,sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_grid(method ~ author, scales = "fixed")

# Words each book uses to build "anticipation", "fear", and "sadness"
books %>%
inner_join(get_sentiments("nrc")) %>%
filter(sentiment %in% c("anticipation","fear", "sadness" )) %>%
count(author,word, sentiment, sort = TRUE) %>%
ungroup() %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = F) +
facet_grid(author~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment", x = NULL) +
coord_flip()
tidy_Sherlock2 %>%
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment %in% c("positive", "negative")) %>% # %in%, not ==, avoids the vector-recycling bug
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray30", "gray80"), max.words = 100)
tidy_LUPIN2 %>%
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray30", "gray80"), max.words = 100)
tidy_Mysterious2 %>%
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray30", "gray80"), max.words = 100)
The vast majority of words in each of the three books occur at very low frequencies.
Sherlock= gutenberg_download(1661)
LUPIN = gutenberg_download(6133)
Agatha = gutenberg_download(863)
Sherlock$text[16] = paste0(" ", Sherlock$text[16])
Sherlock$text = sub(pattern = 'ADVENTURE ',replacement = '',x = Sherlock$text)
tidy_Sherlock <- Sherlock %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^[ivx]+[.][:space:]", ignore_case = TRUE))))
tidy_LUPIN <- LUPIN %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^[ivx]+[.][:space:]", ignore_case = TRUE))))
temp1 = Sherlock %>% mutate(book = "Sherlock") %>% select(-gutenberg_id) %>% unnest_tokens(word, text)
temp2 = LUPIN %>% mutate(book = "LUPIN") %>% select(-gutenberg_id) %>% unnest_tokens(word, text)
temp3 = Agatha %>% mutate(book = "Agatha") %>% select(-gutenberg_id) %>% unnest_tokens(word, text)
book_words = bind_rows(temp1,temp2, temp3) %>% count(book, word, sort = T)
total_words <- book_words %>%
group_by(book) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words, by = "book")
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
Rank and term frequency are inversely related: the higher a word's rank (closer to 1), the higher its term frequency, in line with Zipf's law.
freq_by_rank <- book_words %>%
group_by(book) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank
## # A tibble: 19,193 x 6
## # Groups: book [3]
## book word n total rank `term frequency`
## <chr> <chr> <int> <int> <int> <dbl>
## 1 Sherlock the 5630 105414 1 0.0534
## 2 LUPIN the 3955 54337 1 0.0728
## 3 Sherlock and 3018 105414 2 0.0286
## 4 Sherlock i 3003 105414 3 0.0285
## 5 Sherlock to 2743 105414 4 0.0260
## 6 Agatha the 2657 57272 1 0.0464
## 7 Sherlock of 2655 105414 5 0.0252
## 8 Sherlock a 2641 105414 6 0.0251
## 9 Sherlock in 1766 105414 7 0.0168
## 10 Sherlock that 1744 105414 8 0.0165
## # ... with 19,183 more rows
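The dashed reference line in the next plot uses intercept -0.62 and slope -1.1; those constants match the example fit in the tidytext book, so they may simply have been copied from there. A sketch of the corresponding power-law fit on this data, where the rank cutoffs are assumptions:

# Assumed fit behind the abline constants: log-log regression over the middle ranks
rank_subset <- freq_by_rank %>%
  filter(rank < 500, rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
# Coefficients near -0.62 (intercept) and -1.1 (slope) would match the dashed line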
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()

The highest tf-idf words in each book are all character names.
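The tf, idf, and tf_idf columns in the two tables below come from a step not shown in the chunk; presumably tidytext's bind_tf_idf(), sketched here (the second table additionally drops total and sorts by descending tf_idf):

# Assumed step: add tf, idf, and tf-idf for each word within each book
book_words <- book_words %>%
  bind_tf_idf(word, book, n)
book_words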
## # A tibble: 19,193 x 7
## book word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Sherlock the 5630 105414 0.0534 0 0
## 2 LUPIN the 3955 54337 0.0728 0 0
## 3 Sherlock and 3018 105414 0.0286 0 0
## 4 Sherlock i 3003 105414 0.0285 0 0
## 5 Sherlock to 2743 105414 0.0260 0 0
## 6 Agatha the 2657 57272 0.0464 0 0
## 7 Sherlock of 2655 105414 0.0252 0 0
## 8 Sherlock a 2641 105414 0.0251 0 0
## 9 Sherlock in 1766 105414 0.0168 0 0
## 10 Sherlock that 1744 105414 0.0165 0 0
## # ... with 19,183 more rows
## # A tibble: 19,193 x 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Agatha poirot 389 0.00679 1.10 0.00746
## 2 LUPIN lupin 311 0.00572 1.10 0.00629
## 3 LUPIN arsene 285 0.00525 1.10 0.00576
## 4 Agatha inglethorp 263 0.00459 1.10 0.00504
## 5 Agatha cavendish 147 0.00257 1.10 0.00282
## 6 Agatha cynthia 102 0.00178 1.10 0.00196
## 7 LUPIN ganimard 90 0.00166 1.10 0.00182
## 8 Agatha lawrence 93 0.00162 1.10 0.00178
## 9 Agatha howard 92 0.00161 1.10 0.00176
## 10 Agatha mrs 234 0.00409 0.405 0.00166
## # ... with 19,183 more rows
book_words = anti_join(book_words, stop_words, by = "word")
book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
Define helper functions to tidy the data.
# Split each line into bigrams, then separate into word1/word2 columns
separate_bigrams <- function(dataset) {
dataset %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ")
}
# Count bigrams, dropping any pair that contains a stop word
count_bigrams <- function(dataset) {
dataset %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
}
# Draw the bigram network; edge opacity scales with the bigram count
visualize_bigrams <- function(bigrams) {
set.seed(2017)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
bigrams %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
Sherlock_separate <- tidy_Sherlock %>% separate_bigrams %>% mutate(book = "Sherlock")
LUPIN_separate <- tidy_LUPIN %>% separate_bigrams %>% mutate(book = "LUPIN")
Mysterious_separate <- tidy_Mysterious %>% separate_bigrams %>% mutate(book = "Mysterious")
Sherlock_biagram <- tidy_Sherlock %>% count_bigrams %>% mutate(book = "Sherlock")
LUPIN_biagram <- tidy_LUPIN %>% count_bigrams %>% mutate(book = "LUPIN")
Mysterious_biagram <- tidy_Mysterious %>% count_bigrams %>% mutate(book = "Mysterious")

After adding negation words such as "no" and "not" to the analysis, we inspect how much sentiment each book mis-scores when negation is ignored.
AFINN <- get_sentiments("afinn")
negation_words1 <- c("not")
negation_words2 <- c("no")
all_separate <- bind_rows(Sherlock_separate, LUPIN_separate, Mysterious_separate)
negated_words1 <- all_separate %>%
group_by(book) %>%
filter(word1 %in% negation_words1) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, value, sort = TRUE)
negated_words2 <- all_separate %>%
group_by(book) %>%
filter(word1 %in% negation_words2) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, value, sort = TRUE)
negated_words1 %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * value, fill = word1)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment value * number of occurrences") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
negated_words2 %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * value, fill = word1)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"no\"") +
ylab("Sentiment value * number of occurrences") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
Count how often two words appear together on the same line.
library(widyr)
Sherlock2_pairs <- tidy_Sherlock2 %>%
pairwise_count(word, linenumber, sort = TRUE)
Sherlock2_pairs
## # A tibble: 85,948 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 holmes sherlock 92
## 2 sherlock holmes 92
## 3 simon st 39
## 4 st simon 39
## 5 lord st 29
## 6 st lord 29
## 7 simon lord 28
## 8 lord simon 28
## 9 street baker 27
## 10 baker street 27
## # ... with 85,938 more rows
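The code producing the next table is not shown; presumably the same pairwise_count() call applied to tidy_LUPIN2:

# Assumed step, mirroring the Sherlock call above
LUPIN2_pairs <- tidy_LUPIN2 %>%
  pairwise_count(word, linenumber, sort = TRUE)
LUPIN2_pairs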
## # A tibble: 56,282 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 lupin arsene 236
## 2 arsene lupin 236
## 3 andermatt mon 30
## 4 mon andermatt 30
## 5 holmes sherlock 26
## 6 sherlock holmes 26
## 7 pearl black 19
## 8 black pearl 19
## 9 francs thousand 18
## 10 thousand francs 18
## # ... with 56,272 more rows
Mysterious2_pairs <- tidy_Mysterious2 %>%
pairwise_count(word, linenumber, sort = TRUE)
Mysterious2_pairs
## # A tibble: 48,224 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 inglethorp alfred 45
## 2 alfred inglethorp 45
## 3 bauerstein dr 40
## 4 dr bauerstein 40
## 5 mary cavendish 29
## 6 cavendish mary 29
## 7 ami_ _mon 25
## 8 _mon ami_ 25
## 9 monsieur poirot 20
## 10 mademoiselle cynthia 20
## # ... with 48,214 more rows
Words that most strongly co-occur on the same line, measured by pairwise correlation.
Sherlock2_cors <- tidy_Sherlock2 %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, linenumber, sort = TRUE)
LUPIN2_cors <- tidy_LUPIN2 %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, linenumber, sort = TRUE)
Mysterious2_cors <- tidy_Mysterious2 %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, linenumber, sort = TRUE)
Sherlock2_cors %>%
filter(item1 %in% c("angel", "baker")) %>%
group_by(item1) %>%
top_n(6) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
LUPIN2_cors %>%
filter(item1 %in% c("lupin", "black")) %>%
group_by(item1) %>%
top_n(6) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
# murder, evidence
Mysterious2_cors %>%
filter(item1 %in% c("murder", "evidence")) %>%
group_by(item1) %>%
top_n(6) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
set.seed(2016)
Sherlock2_cors %>%
filter(correlation > .09) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()

LUPIN2_cors %>%
filter(correlation > .09) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()

Mysterious2_cors %>%
filter(correlation > .09) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()

# Build the CoreNLP API URL; the tokenization behavior and annotation tasks are configurable
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){ # do not split tokens on whitespace alone
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}',
                 host, port, tokenize.whitespace, annotators)
  URLencode(url) # return the encoded URL
}
generate_API_url("127.0.0.1")
# Call the CoreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                         tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # Assume two CoreNLP servers: one for English (port 9000), one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001);
  # Build the API URL
  url <- generate_API_url(server_host, port=port,
                          tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
host = "127.0.0.1"
coreNLP <- function(data,host){
# Send each document to CoreNLP in turn; each returns JSON,
# which we keep as an R object alongside the original row
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}
# 1.3.2 Data-tidying functions
# Extract the tokenization results from the returned objects into tidy-data format
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first (and, given ssplit.eolonly, only) sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first (and only) sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first (and only) sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# Visualize the dependency-parse result as a tree
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph) && require(data.tree)) # data.tree provides FromDataFrameNetwork()
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}

Feed the data into CoreNLP for processing.
t0 = Sys.time()
Sherlock= gutenberg_download(1661)
Sherlock$text[16] = paste0(" ", Sherlock$text[16])
Sherlock$text = sub(pattern = 'ADVENTURE ',replacement = '',x = Sherlock$text)
LUPIN = gutenberg_download(6133)
Agatha = gutenberg_download(863)
temp1 = Sherlock %>% mutate(book = "Sherlock") %>% select(-gutenberg_id) %>% mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^[ivx]+[.][:space:]", ignore_case = TRUE))))
temp2 = LUPIN %>% mutate(book = "LUPIN") %>% select(-gutenberg_id) %>% mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^[ivx]+[.][:space:]", ignore_case = TRUE))))
temp3 = Agatha %>% mutate(book = "Agatha") %>% select(-gutenberg_id) %>% mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))))
book_words_NLP = bind_rows(temp1,temp2, temp3)
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
#obj = book_words_NLP %>% filter(text != "") %>% coreNLP(host)
# The object passed into coreNLP must be a data.frame with a text column
Sys.time() - t0 # elapsed time
## Time difference of 1.532451 secs
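The tokens and sentiment data frames used from here on are produced in a chunk that is not shown; a plausible sketch, assuming the parser helpers defined above were applied to the commented-out obj:

# Assumed steps (the obj call stays commented out because the CoreNLP server must be running):
# obj <- book_words_NLP %>% filter(text != "") %>% coreNLP(host)
# tokens <- coreNLP_tokens_parser(obj)
# sentiment <- coreNLP_sentiment_parser(obj)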
Plot the words in the three books that carry an NER tag.
## book linenumber chapter word lemma pos ner
## 1 Sherlock 1 0 THE the DT O
## 2 Sherlock 1 0 ADVENTURES adventure NNS O
## 3 Sherlock 1 0 OF of IN O
## 4 Sherlock 1 0 SHERLOCK SHERLOCK NNP PERSON
## 5 Sherlock 1 0 HOLMES HOLMES NNP PERSON
## 6 Sherlock 3 0 by by IN O
## [1] "O" "PERSON" "COUNTRY"
## [4] "MISC" "LOCATION" "CITY"
## [7] "NUMBER" "TITLE" "ORDINAL"
## [10] "DURATION" "SET" "DATE"
## [13] "CAUSE_OF_DEATH" "TIME" "NATIONALITY"
## [16] "ORGANIZATION" "STATE_OR_PROVINCE" "MONEY"
## [19] "CRIMINAL_CHARGE" "RELIGION" "IDEOLOGY"
## [22] "URL"
tokens %>%
  filter(ner != "O") %>%
  group_by(ner) %>%           # group by NER tag
  summarize(count = n()) %>%  # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(ner = reorder(ner, count)) %>%
  ggplot(aes(ner, count)) +
  geom_col() +
  ggtitle("NER") +
  theme(text = element_text(size = 14)) +
  coord_flip()

tokens %>%
filter(ner != "O") %>%
group_by(book,ner) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(ner = reorder(ner, count)) %>%
ggplot(aes(ner, count)) +
geom_col()+
ggtitle("NER") +
theme(text=element_text(size=14))+
facet_wrap(~book)+
coord_flip() ## 【篩選NER為PERSION詞】
tokens %>%
  filter(ner == "PERSON") %>% # keep only PERSON tokens
  group_by(word) %>%          # group by word
  summarize(count = n()) %>%  # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text = element_text(size = 14)) +
  coord_flip()

tokens %>%
filter(ner == "CAUSE_OF_DEATH") %>%
group_by(word) %>%
summarize(count = n()) %>%
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is CAUSE_OF_DEATH)") +
theme(text=element_text(size=14))+
coord_flip()

a = tokens %>% filter(word == "fire" & ner == "CAUSE_OF_DEATH") %>% select(book, linenumber)
a$linenumber = as.numeric(as.character(a$linenumber))
book_words_NLP %>% inner_join(a, by = c("book", "linenumber")) %>% select(text)
## # A tibble: 27 x 1
## text
## <chr>
## 1 "stood before the fire and looked me over in his singular"
## 2 "front of the fire and laughed heartily for some minutes."
## 3 "The alarm of fire was admirably done. The smoke and shouting were"
## 4 "of the fire in his lodgings at Baker Street, \"life is infinitely"
## 5 "\"I beg that you will draw your chair up to the fire and favour me"
## 6 "said he with an oath. 'Tell Mary that I shall want a fire in my"
## 7 "step up to the room. The fire was burning brightly, and in the"
## 8 "eyes had regained their fire, and there, sitting by the fire and"
## 9 "\"The fire looks very seasonable in this weather. You look cold,"
## 10 "front of us, spouting fire at every chink and window, while in"
## # ... with 17 more rows
tokens %>%
  filter(ner == "CRIMINAL_CHARGE") %>% # keep only CRIMINAL_CHARGE tokens
  group_by(word) %>%                   # group by word
  summarize(count = n()) %>%           # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is CRIMINAL_CHARGE)") +
  theme(text = element_text(size = 14)) +
  coord_flip()

a = tokens %>% filter(word == "train" & ner == "CRIMINAL_CHARGE") %>% select(book, linenumber)
a$linenumber = as.numeric(as.character(a$linenumber))
book_words_NLP %>% inner_join(a, by = c("book", "linenumber")) %>% select(book, text)
## # A tibble: 34 x 2
## book text
## <chr> <chr>
## 1 Sherlock "left this morning with her husband by the 5:15 train from Charing"
## 2 Sherlock "still time to take a train to Hereford and see him to-night?\""
## 3 Sherlock "them present such singular features as the strange train of"
## 4 Sherlock "\"By train from Waterloo.\""
## 5 Sherlock "the last train from Waterloo Station, and that in his haste and"
## 6 Sherlock "doubt. You have come in by train this morning, I see.\""
## 7 Sherlock "twenty past, and came in by the first train to Waterloo. Sir, I"
## 8 Sherlock "At Waterloo we were fortunate in catching a train for"
## 9 Sherlock "her by the morning train to the care of her good aunt at Harrow,"
## 10 Sherlock "train this morning, and on inquiring at Paddington as to where I"
## # ... with 24 more rows
## text book linenumber chapter sentiment
## 1 THE ADVENTURES OF SHERLOCK HOLMES Sherlock 1 0 Neutral
## 2 by Sherlock 3 0 Neutral
## 3 SIR ARTHUR CONAN DOYLE Sherlock 5 0 Neutral
## 4 I. A Scandal in Bohemia Sherlock 9 0 Neutral
## 5 II. The Red-headed League Sherlock 10 0 Neutral
## 6 III. A Case of Identity Sherlock 11 0 Neutral
## sentimentValue
## 1 2
## 2 2
## 3 2
## 4 2
## 5 2
## 6 2
## [1] "Neutral" "Negative" "Positive" "Verypositive" "Verynegative"
sentiment$sentimentValue = as.numeric(as.character(sentiment$sentimentValue))
sentiment$chapter = as.numeric(as.character(sentiment$chapter))
sentiment %>%
filter(book == "LUPIN" & chapter == 5) %>%
summarise(n = n())
## n
## 1 566
# Take chapter 8 of the Holmes collection ("The Adventure of the Speckled Band") and split it
# into 13 sections to track sentiment across the case; runs of 79 sentences yield 13 sections,
# matching the 13 chapters of the Christie novel
sherlock_section = sentiment %>%
filter(book == "Sherlock" & chapter == 8) %>%
mutate(section = row_number() %/% 79 + 1) %>%
select(-chapter)
names(sherlock_section)[6] <- c("chapter")
# Take chapter 5 of the Lupin collection ("The Queen's Necklace") and split it into 13 sections;
# its 566 sentences in runs of 47 likewise give 13 sections (566 %/% 47 + 1 = 13)
LUPIN_section = sentiment %>%
filter(book == "LUPIN" & chapter == 5) %>%
mutate(section = row_number() %/% 47 + 1) %>%
select(-chapter)
names(LUPIN_section)[6] <- c("chapter")
sentiment_section = sentiment %>%
filter(book == "Agatha") %>%
bind_rows(sherlock_section, LUPIN_section)
sentiment_section %>%
filter(chapter != 0) %>%
group_by(book, chapter) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=as.factor(chapter),y=avg_sentiment, col = book,group = book)) +
xlab("Section")+
geom_line()

Holmes's final section dips because it is devoted to explaining the crime, ending with the murderer reaping what he sowed and dying of a snakebite. Lupin's first section scores higher because it praises the queen and her necklace. Agatha's third section dips because that is where the murder takes place.
## # A tibble: 1,149 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
## 7 accordingly SMART
## 8 across SMART
## 9 actually SMART
## 10 after SMART
## # ... with 1,139 more rows
custom_stop_words <- bind_rows(tibble(word = c("I","\"i"), lexicon = c("custom","custom")), stop_words)
custom_stop_words
## # A tibble: 1,151 x 2
## word lexicon
## <chr> <chr>
## 1 "I" custom
## 2 "\"i" custom
## 3 "a" SMART
## 4 "a's" SMART
## 5 "able" SMART
## 6 "about" SMART
## 7 "above" SMART
## 8 "according" SMART
## 9 "accordingly" SMART
## 10 "across" SMART
## # ... with 1,141 more rows
tokens$word = as.character(tokens$word)
sentiment %>%
  merge(tokens) %>%
  anti_join(custom_stop_words) %>%
  filter(sentiment == "Verypositive" | sentiment == 'Positive') %>%
  group_by(lemma) %>%         # group by lemma
  summarize(count = n()) %>%
  filter(count > 5) %>%
  wordcloud2()
my_graph = sentiment %>%
  merge(tokens) %>%
  anti_join(custom_stop_words) %>%
  filter(sentiment == "Verynegative" | sentiment == 'Negative') %>%
  group_by(lemma) %>%         # group by lemma
  summarize(count = n()) %>%
  filter(count > 10) %>%
  wordcloud2()
#saveWidget(my_graph, "tmp.html", selfcontained = F)
#webshot("tmp.html", "wc1.png", delay = 5, vwidth = 2000, vheight = 2000)
The results of the analyses above diverge from what we expected: the three authors' plot pacing and emotional arcs turn out to be quite similar.
We suspect two reasons:
1. To compare writing styles convincingly, several works per author should be analyzed together, which would give stronger evidence.
2. When selecting documents, short-story collections should not be analyzed alongside a single full-length novel, since the difference in text length introduces bias.