讀取資料

pacman::p_load(tidytext,readr, tidyverse, data.table, tibble, stringr, textdata, 
               tidyr, reshape2, plotly, topicmodels, dplyr, text2vec, udpipe,ggplot2,wordcloud,wordcloud2,gutenbergr,igraph,ggraph,xml2,httr,jsonlite,magrittr,data.tree,webshot,htmlwidgets,imager)
## 
## The downloaded binary packages are in
##  /var/folders/dt/brfqfqpx2nx6x96txdg151hh0000gn/T//RtmpbBqtYv/downloaded_packages
text = read_file("./grimm.txt")

查看斷詞結果

text_tidy = tibble(text) %>%
  unnest_tokens(word, text) %>%
  mutate(linenumber = row_number(),story = cumsum(str_detect(word, regex("^story([1-9]|[1-6][0-9])$", ignore_case = T)))) 
# 標註集數
data(stop_words) 
text_tidy = text_tidy %>%
  anti_join(stop_words)
## Joining, by = "word"

統計字頻

text_tidy %>%
  count(word, sort = T) %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n)) +
  geom_col(fill = "#00bfc4") +
  coord_flip()

總體文字雲

text_tidy %>% 
  group_by(word) %>% 
  summarise(count = n()) %>% 
  filter(count > 50) %>% 
  arrange(desc(count)) %>% 
  wordcloud2( size = 0.7, shape = 'circle')

區分正、負面情緒的文字雲(採用Bing字典)

text_tidy  %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

情緒分析

NRC字典

準備資料

nrc <- get_sentiments("nrc")
grimmlist <- text_tidy %>%
  filter(nchar(.$word)>1) %>%
  mutate(rown = row_number())
grimmnrc<-grimmlist %>%
  inner_join(nrc, by = "word") 
tt_neg<-filter(grimmnrc,sentiment == 'negative'| sentiment== 'anger'| sentiment== 'disgust'| sentiment == 'fear'| sentiment == 'sadness')
tt_neg_dis<-distinct(tt_neg,word,rown,story)
neg_re<-tt_neg_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()
neg_re %>% 
  top_n(10,wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label=n))+
  theme(text=element_text(size=14))+
  coord_flip()

NRC最常出現的正面情緒字

tt_pos<-filter(grimmnrc,sentiment == 'anticipation'| sentiment== 'joy'| sentiment== 'surprise'| sentiment == 'trust'| sentiment == 'positive')

tt_pos_dis<-distinct(tt_pos,word,rown,story)

pos_re<-tt_pos_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()


pos_re %>% 
  top_n(10,wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label=n))+
  theme(text=element_text(size=14))+
  coord_flip()

找出“king”最常出現的前三大集數

pos_love<-tt_pos_dis%>%
  filter(word == 'king')%>%
  group_by(story)%>%
  count(word, sort = TRUE)%>%
  ungroup()
pos_love %>% top_n(3)
## Selecting by n
## # A tibble: 3 x 3
##   story word      n
##   <int> <chr> <int>
## 1    61 king     31
## 2    60 king     24
## 3    59 king     19

Afinn字典

準備資料

afinn = get_sentiments("afinn")

storyafinn <- text_tidy %>% 
  inner_join(afinn) %>%
  group_by(story) %>% 
  summarise(sentiment = sum(value)) # 以每集為單位將情緒值加總
## Joining, by = "word"
storyafinn %>%
  ggplot(aes(story, sentiment, fill = "#F7766D")) +
  geom_col(show.legend = FALSE) +
  scale_x_continuous(breaks=seq(1,62,5)) 

Afinn最常出現的負面情緒字

tt_neg <- text_tidy %>% 
  inner_join(afinn) %>% 
  filter(value < 0)
## Joining, by = "word"
tt_neg_dis<-distinct(tt_neg,word,rown,story)
## Warning: Trying to compute distinct() for variables not found in the data:
## - `rown`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - word
## - story
neg_re<-tt_neg_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()

neg_re %>% 
  top_n(10,wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label=n))+
  theme(text=element_text(size=14))+
  coord_flip()

Afinn最常出現的正面情緒字

tt_pos <- text_tidy %>% 
  inner_join(afinn) %>%
  filter(value > 0)
## Joining, by = "word"
tt_pos_dis <- distinct(tt_pos,word,rown,story)
## Warning: Trying to compute distinct() for variables not found in the data:
## - `rown`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - word
## - story
pos_re <- tt_pos_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()
pos_re %>% 
  top_n(10,wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label=n))+
  theme(text=element_text(size=14))+
  coord_flip()

Bing字典

準備資料

bing = get_sentiments("bing")
grimm_bing <- text_tidy %>%
  filter(nchar(.$word)>1) %>%
  mutate(nrow = row_number())%>%
  inner_join(bing, by = "word")
grimm_bing_all <- grimm_bing%>%
  group_by(sentiment)%>%
  count(word, sort = TRUE)%>%
  ungroup()
grimm_bing_all%>%
  group_by(sentiment) %>%
  top_n(10,wt = n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

Loughran字典

Loughran最常出現的正/負面情緒字

loughran = get_sentiments("loughran")
loughran_wc = get_sentiments("loughran") %>%
  filter(sentiment == "negative" | sentiment == "positive") %>%
  inner_join(text_tidy) %>%
  count(word, sentiment, sort = T)
## Joining, by = "word"
loughran_wc %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

四字典各集比較分析

正/負面情緒詞總數差異

grimm_cnt = text_tidy %>% 
  group_by(story, word) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count))

每集情緒詞總數比較

grimm_cnt %>% 
  inner_join(afinn) %>% 
  group_by(story, value) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_boxplot(aes(x=story, y=value, colour=as.factor(story)),show.legend = FALSE) +
  ggtitle("Afinn-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,62,5)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5))
## Joining, by = "word"

grimm_cnt %>% 
  inner_join(nrc) %>% 
  group_by(story, sentiment) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_line(aes(x=story, y=cnt, colour=sentiment)) +
  ggtitle("NRC-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,62,5)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5)) 
## Joining, by = "word"

grimm_cnt %>% 
  inner_join(bing) %>% 
  group_by(story, sentiment) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_line(aes(x=story, y=cnt, colour=sentiment)) +
  ggtitle("Bing-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,62,5)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5)) 
## Joining, by = "word"

grimm_cnt %>% 
  inner_join(loughran) %>% 
  group_by(story, sentiment) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_line(aes(x=story, y=cnt, colour=sentiment)) +
  ggtitle("Loughran-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,62,5)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5)) 
## Joining, by = "word"

正面-負面情緒差異

#整理字典"bing"正負面情緒各級出現次數與差值
bing_neg_pos <-  text_tidy  %>%
  dplyr::select(word,story) %>%
  inner_join(get_sentiments("bing"))  %>% 
  group_by(story,sentiment) %>%
  summarise(count=sum(story))
## Joining, by = "word"
bing_trend<-bing_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "bing")


#整理字典"nrc"正負面情緒各級出現次數與差值
nrc_count_pos<-tt_pos_dis%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "positive")
nrc_count_neg<-tt_neg_dis%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "negative")
nrc_neg_pos<-bind_rows(nrc_count_pos,nrc_count_neg)%>%arrange(story)%>%mutate(method='nrc') 

nrc_trend<-nrc_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative)


#整理字典"afinn"正負面情緒各級出現次數與差值
afinn_pos <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value>0)%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "positive")
## Joining, by = "word"
afinn_neg <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value<0)%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "negative")
## Joining, by = "word"
afinn_neg_pos<-bind_rows(afinn_pos,afinn_neg)%>%arrange(story)%>%mutate(method='nrc') 

afinn_trend<-afinn_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "AFINN")


#整理字典"loughran"正負面情緒各級出現次數與差值
loughran_data <-  text_tidy  %>%
  dplyr::select(word,story) %>%
  inner_join(get_sentiments("loughran"))  %>% 
  group_by(story,sentiment) %>%
  summarise(count=sum(story))
## Joining, by = "word"
loughran_neg <- subset( loughran_data, sentiment == "negative")
loughran_pos <- subset( loughran_data, sentiment == "positive")

loughran_neg_pos<-bind_rows(loughran_neg,loughran_pos)%>%arrange(story)%>%mutate(method='nrc') 

loughran_trend<-loughran_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "loughran")

合併四個字典並劃出折線圖

bind_rows(bing_trend,nrc_trend,afinn_trend,loughran_trend) %>%
  ggplot(aes(x= story,y=Deviation,col=method)) +
  geom_line() +facet_wrap(~method, ncol = 1, scales = "fixed")

計算term frequency

book_words = text_tidy %>% count( word,story, sort = T)

total_words <- book_words %>% 
  group_by(story) %>% 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)
## Joining, by = "story"
freq_by_rank <- book_words %>% 
  group_by(story) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank
## # A tibble: 15,718 x 6
## # Groups:   story [62]
##    word     story     n total  rank `term frequency`
##    <chr>    <int> <int> <int> <int>            <dbl>
##  1 hans        42    71   343     1           0.207 
##  2 tailor      18    57   991     1           0.0575
##  3 hansel      19    45   905     1           0.0497
##  4 gretel      42    44   343     2           0.128 
##  5 peasant     28    42   589     1           0.0713
##  6 bird        40    39   987     1           0.0395
##  7 wife        10    37   592     1           0.0625
##  8 gretel      19    36   905     2           0.0398
##  9 shudder     58    34  1024     1           0.0332
## 10 snowdrop    31    33   701     1           0.0471
## # … with 15,708 more rows

計算TF-IDF

book_words <- book_words %>%
  bind_tf_idf(word, story, n)

book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 15,718 x 6
##    word       story     n     tf   idf tf_idf
##    <chr>      <int> <int>  <dbl> <dbl>  <dbl>
##  1 hans          42    71 0.207   2.74  0.567
##  2 gretel        42    44 0.128   3.03  0.388
##  3 doctor        54    18 0.0807  4.13  0.333
##  4 elsie         33    27 0.0714  4.13  0.295
##  5 sparrow        8    28 0.0650  4.13  0.268
##  6 fox           56    28 0.115   2.18  0.251
##  7 sultan         5    15 0.0584  4.13  0.241
##  8 rapunzel      16    24 0.0562  4.13  0.232
##  9 fundevogel    17    13 0.0544  4.13  0.224
## 10 peasant       28    42 0.0713  3.03  0.216
## # … with 15,708 more rows
book_words = anti_join(book_words, stop_words, by = "word")

同一行裡最常一起出現的詞彙

library(widyr)
text_tidy_cors <- text_tidy %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, linenumber, sort = TRUE)

set.seed(2016)
text_tidy_cors %>%
  filter(abs(correlation) > .01) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

# 故事背景圍繞在皇室貴族、平民家庭、等不同接階級的故事

LDA分析

主題-文字分析

word_counts <- text_tidy %>%
  count(story, word, sort = TRUE) %>%
  ungroup()

episode_dtm <- word_counts %>%
  cast_dtm(story, word, n)

episode_dtm
## <<DocumentTermMatrix (documents: 62, terms: 4397)>>
## Non-/sparse entries: 15718/256896
## Sparsity           : 94%
## Maximal term length: 15
## Weighting          : term frequency (tf)

4個主題 Beta值

episode_lda <- LDA(episode_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(episode_lda, matrix = "beta")
text_topics %>% head(10)
## # A tibble: 10 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 hans   3.27e-  3
##  2     2 hans   9.92e-  3
##  3     3 hans   3.60e-  3
##  4     4 hans   2.76e-164
##  5     1 tailor 1.31e-  4
##  6     2 tailor 2.25e-118
##  7     3 tailor 8.17e-  3
##  8     4 tailor 8.22e-  4
##  9     1 hansel 1.74e-171
## 10     2 hansel 6.28e-  3

選出各主題前5高的文字

top_terms <- text_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>% head(10)
## # A tibble: 10 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 king     0.0144 
##  2     1 fox      0.00982
##  3     1 father   0.00865
##  4     1 princess 0.00852
##  5     1 till     0.00691
##  6     2 gretel   0.0133 
##  7     2 wife     0.0111 
##  8     2 hans     0.00992
##  9     2 day      0.00868
## 10     2 mother   0.00862
remove_word = c("day","time","boy","hans","son","father","mother","women","wife","door","set","gretel","hans","hansel","king","elsie","beautiful","woman")
episode_lda <- LDA(episode_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(episode_lda, matrix = "beta")

top_terms <- text_topics %>%
  filter(!term  %in% remove_word)%>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
  
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

  • 第一個主題由於是鄉野傳說常以野獸精怪作為主體去敘述故事
  • 第二個則是因為故事的搜集並非都通過沒有受過教育的農民,其中的大部分倒是經由受過良好教育的中產階級女性的講述,所以故事多會跟家庭相關
  • 第三個則是跟當時的產業結構有所相關
  • 第四個是因為當時的歷史背景仍是19世紀,所以當時蒐集的故事難免都會與貴族王室有所關聯

文件-主題分析

使用4個主題gamma值,算出每個主題故事數量

episode_gamma <- tidy(episode_lda, matrix = "gamma")
episode_gamma2<-episode_gamma %>%group_by(document)%>%filter(gamma==(max(gamma)))

table(episode_gamma2$topic)
## 
##  1  2  3  4 
## 16 16 14 16