1. 載入package

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
require(tidytext)
## Loading required package: tidytext
library(wordcloud2)
require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require(ggplot2)
## Loading required package: ggplot2
require(reshape2)
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
require(wordcloud)
## Loading required package: wordcloud
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'wordcloud'
require(tidyr)
## Loading required package: tidyr
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
require(readr)
## Loading required package: readr
require(scales)
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor

1.1 匯入檔案&資料前處理

# Split one tale into sentences and tokenise it into one word per row.
#
# raw_text: the whole tale as a single character string.
# title:    label stored in the `book` column of the result.
# Returns a data frame with the columns `linenumber` (position of the
# sentence the word came from), `word` and `book`.
tokenize_tale <- function(raw_text, title) {
  # A literal full stop is the sentence delimiter.
  sentences <- strsplit(raw_text, "[.]")[[1]]
  tokens <- data.frame(text = sentences, stringsAsFactors = FALSE) %>%
    mutate(linenumber = row_number()) %>%
    unnest_tokens(word, text)
  # rep() keeps the assignment valid even when the tale produced no tokens.
  tokens$book <- rep(title, nrow(tokens))
  tokens
}

# The label keeps the spelling "Cinderalla" because later filters on `book`
# compare against exactly this string.
Cinderalla <- read_file("Cinderella.txt")
C <- tokenize_tale(Cinderalla, "Cinderalla")

Beauty <- read_file("Beauty and the beast.txt")
B <- tokenize_tale(Beauty, "Beauty and the Beast")

Sleep <- read_file("Sleeping beauty in the wood.txt")
S <- tokenize_tale(Sleep, "Sleeping Beauty in the Wood")

Aladdin <- read_file("ALADDIN AND THE WONDERFUL LAMP.txt")
A <- tokenize_tale(Aladdin, "Aladdin and the Wonderful Lamp")

1.2 合併四本書

# Stack the four tokenised tales into one table (row order: B, C, S, A).
Fairy_Tales <- do.call(rbind, list(B, C, S, A))

2. NRC 字典過濾出“joy”字詞

# NRC lexicon entries whose sentiment label is "joy".
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words found in "Beauty and the Beast", most frequent first.
count(
  inner_join(
    filter(Fairy_Tales, book == "Beauty and the Beast"),
    nrc_joy
  ),
  word,
  sort = TRUE
)
## Joining, by = "word"
## # A tibble: 77 x 2
##    word         n
##    <chr>    <int>
##  1 beauty      80
##  2 found       24
##  3 good        19
##  4 happy       11
##  5 splendid     8
##  6 journey      6
##  7 love         6
##  8 promise      6
##  9 glad         5
## 10 save         5
## # ... with 67 more rows

3.1 用bing字典比較四本童話故事的情緒分布

library(tidyr)

# Net bing sentiment per sentence: count the positive and negative words of
# every (book, sentence) pair and subtract negative from positive.
bing_Fairy_Tales <- Fairy_Tales %>%
  inner_join(get_sentiments("bing")) %>%
  # Integer division by 1 keeps every sentence number as its own bin.
  count(book, index = linenumber %/% 1, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    method = "bing"
  )
## Joining, by = "word"
bing_Fairy_Tales
## # A tibble: 526 x 6
##    book                           index negative positive sentiment method
##    <chr>                          <dbl>    <dbl>    <dbl>     <dbl> <chr> 
##  1 Aladdin and the Wonderful Lamp     1        4        1        -3 bing  
##  2 Aladdin and the Wonderful Lamp     2        2        0        -2 bing  
##  3 Aladdin and the Wonderful Lamp     3        1        0        -1 bing  
##  4 Aladdin and the Wonderful Lamp     4        1        0        -1 bing  
##  5 Aladdin and the Wonderful Lamp     5        2        1        -1 bing  
##  6 Aladdin and the Wonderful Lamp     8        1        0        -1 bing  
##  7 Aladdin and the Wonderful Lamp    10        1        0        -1 bing  
##  8 Aladdin and the Wonderful Lamp    11        1        0        -1 bing  
##  9 Aladdin and the Wonderful Lamp    12        1        0        -1 bing  
## 10 Aladdin and the Wonderful Lamp    13        0        3         3 bing  
## # ... with 516 more rows
  • 因為文本字數不多所以index跟原本的行數一樣

3.1.1 bing畫圖

library(ggplot2)

# One panel per book; each bar is the net bing sentiment of one sentence.
bing_Fairy_Tales %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_x", ncol = 2)

  • 童話故事最後的結局幾乎都是正面的,尤其是灰姑娘及美女與野獸。(補充:睡美人最後面幾乎都在講王后的結局,因此負面情緒較高)
  • Beauty and the Beast 在大約第27-30行之間負面情緒很高,主要是變成野獸的時候十分無助,因為他的朋友都以為他已經死了
  • Sleeping Beauty在第58行時獲得高分的正面情緒,正是王子親完公主時,對公主表達愛慕之意

3.2 用Afinn字典比較四本童話故事的情緒分數

# AFINN contribution per (book, sentence, score): the number of matched words
# with a given score multiplied by that score.
# NOTE(review): this relies on the lexicon exposing a `score` column; newer
# tidytext releases appear to call it `value` -- confirm the installed version.
Afinn_Fairy_Tales <- Fairy_Tales %>%
  inner_join(get_sentiments("afinn")) %>%
  count(book, index = linenumber %/% 1, score) %>%
  mutate(
    sentiment = score * n,
    method = "Afinn"
  )
## Joining, by = "word"
Afinn_Fairy_Tales
## # A tibble: 913 x 6
##    book                           index score     n sentiment method
##    <chr>                          <dbl> <int> <int>     <int> <chr> 
##  1 Aladdin and the Wonderful Lamp     1    -2     2        -4 Afinn 
##  2 Aladdin and the Wonderful Lamp     1     2     1         2 Afinn 
##  3 Aladdin and the Wonderful Lamp     2    -3     1        -3 Afinn 
##  4 Aladdin and the Wonderful Lamp     2    -2     2        -4 Afinn 
##  5 Aladdin and the Wonderful Lamp     4    -3     1        -3 Afinn 
##  6 Aladdin and the Wonderful Lamp     8    -3     1        -3 Afinn 
##  7 Aladdin and the Wonderful Lamp     9     1     1         1 Afinn 
##  8 Aladdin and the Wonderful Lamp    11    -2     1        -2 Afinn 
##  9 Aladdin and the Wonderful Lamp    12    -1     1        -1 Afinn 
## 10 Aladdin and the Wonderful Lamp    13     2     2         4 Afinn 
## # ... with 903 more rows

3.2.1 Afinn畫圖

# Afinn_Fairy_Tales still holds one row per distinct score within a sentence,
# so the contributions are summed per (book, index); otherwise the plotted
# values would be wrong.
Afinn_Fairy_Tales1 <- Afinn_Fairy_Tales %>%
  group_by(book, index) %>%
  summarise(sentiment = sum(sentiment)) %>%
  # summarise() only removes the innermost grouping level; drop the remaining
  # `book` grouping so later steps work on a plain table.
  ungroup() %>%
  mutate(method = "Afinn")

# One panel per book; each bar is the summed AFINN score of one sentence.
Afinn_Fairy_Tales1 %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

3.3 用NRC字典比較四本童話故事的情緒分數

# Net NRC sentiment per sentence, using only the lexicon's "positive" and
# "negative" labels and ignoring its other categories.
NRC_Fairy_Tales <- Fairy_Tales %>%
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(book, index = linenumber %/% 1, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    method = "NRC"
  )
## Joining, by = "word"
NRC_Fairy_Tales
## # A tibble: 566 x 6
##    book                           index negative positive sentiment method
##    <chr>                          <dbl>    <dbl>    <dbl>     <dbl> <chr> 
##  1 Aladdin and the Wonderful Lamp     1        1        0        -1 NRC   
##  2 Aladdin and the Wonderful Lamp     2        1        0        -1 NRC   
##  3 Aladdin and the Wonderful Lamp     3        1        1         0 NRC   
##  4 Aladdin and the Wonderful Lamp     4        0        1         1 NRC   
##  5 Aladdin and the Wonderful Lamp     5        2        2         0 NRC   
##  6 Aladdin and the Wonderful Lamp     6        1        1         0 NRC   
##  7 Aladdin and the Wonderful Lamp     7        1        2         1 NRC   
##  8 Aladdin and the Wonderful Lamp     8        0        2         2 NRC   
##  9 Aladdin and the Wonderful Lamp     9        1        1         0 NRC   
## 10 Aladdin and the Wonderful Lamp    10        2        1        -1 NRC   
## # ... with 556 more rows

3.3.1 NRC畫圖

library(ggplot2)

# One panel per book; each bar is the net NRC sentiment of one sentence.
NRC_Fairy_Tales %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_x", ncol = 2)

3.4 用四本童話故事與三本字典比較的結果

# Stack the per-sentence results of the three lexicons so that they can be
# compared in one grid: rows are lexicons (`method`), columns are books.
book_all <- bind_rows(
  bing_Fairy_Tales,
  Afinn_Fairy_Tales1,
  NRC_Fairy_Tales
)
ggplot(book_all, aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_grid(method ~ book, scales = "free_x") +
  labs(y = NULL, x = NULL)

  • 結論:可發現三種字典對於童話故事的正負面字詞比較都差不多,但是Afinn字典所計算出的情緒起伏比較極端,可能是因為Afinn是計算score,bing跟NRC是統計negative與positive次數,所以差距較大

4.最常出現的正面字及負面字

# Frequency of every bing-lexicon word over all four tales, together with its
# sentiment label, most frequent first.
bing_word_counts <- count(
  inner_join(Fairy_Tales, get_sentiments("bing")),
  word, sentiment,
  sort = TRUE
)
## Joining, by = "word"
bing_word_counts
## # A tibble: 463 x 3
##    word      sentiment     n
##    <chr>     <chr>     <int>
##  1 beauty    positive     83
##  2 good      positive     39
##  3 great     positive     36
##  4 beautiful positive     22
##  5 well      positive     20
##  6 fell      negative     18
##  7 gold      positive     17
##  8 like      positive     15
##  9 dead      negative     14
## 10 fine      positive     14
## # ... with 453 more rows

5.1 畫圖

# Horizontal bar chart of the ten highest-ranked words for each sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # No weight column is named, so top_n() ranks by the last column (`n`).
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip() +
  labs(x = NULL, y = "Contribution to sentiment")
## Selecting by n

  • 童話故事書常見的正負面字詞,負面字詞常出現兇猛、貧窮、死亡、失去、恐懼;正面字詞則有漂亮、美麗、愛、滿足等

5.2 統計四本書負向字前十名

# Negative entries of the bing lexicon, filtered once and reused for every tale.
bing_negative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

# Most frequent negative words of a single tale.
#
# book_title: value of `book` in Fairy_Tales that selects the tale.
# top:        maximum number of rows to return (default 10).
# Returns the first `top` rows of the word counts sorted by descending `n`,
# with the columns `book`, `word` and `n`.
top_negative_words <- function(book_title, top = 10) {
  Fairy_Tales %>%
    filter(book == book_title) %>%
    inner_join(bing_negative) %>%
    count(book, word, sort = TRUE) %>%
    head(top)
}

# Sleeping Beauty
Sleep_negative <- top_negative_words("Sleeping Beauty in the Wood")
## Joining, by = "word"
# Aladdin
Aladdin_negative <- top_negative_words("Aladdin and the Wonderful Lamp")
## Joining, by = "word"
# Beauty and the Beast
Beauty_negative <- top_negative_words("Beauty and the Beast")
## Joining, by = "word"
# Cinderella (the `book` label is spelled "Cinderalla" in the data)
Cinderalla_negative <- top_negative_words("Cinderalla")
## Joining, by = "word"

5.2.1 畫圖

# Combine the four per-book tables and draw one panel of horizontal bars per
# book, each bar labelled with its count.
# NOTE(review): reorder() ranks a word by its mean `n` over all rows, so a word
# shared by several books may not appear in sorted position inside a panel.
all_negative <- bind_rows(
  Sleep_negative,
  Beauty_negative,
  Aladdin_negative,
  Cinderalla_negative
)
ggplot(all_negative, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, hjust = -0.5)) +
  facet_wrap(~book, ncol = 2, scales = "free_y") +
  coord_flip() +
  labs(x = "字詞", y = NULL) +
  theme(text = element_text(size = 12))

  • 阿拉丁代表字:slave奴隸;睡美人:fell睡著;美女與野獸:野獸本身有許多負面的形容詞
  • 負面的字詞幾乎都與貧窮、死亡有關,這也是童話故事中常見的背景描述

5.3 統計四本書正向字前十名

# Positive entries of the bing lexicon, filtered once and reused for every tale.
bing_positive <- get_sentiments("bing") %>%
  filter(sentiment == "positive")

# Most frequent positive words of a single tale.
#
# book_title: value of `book` in Fairy_Tales that selects the tale.
# top:        maximum number of rows to return (default 10).
# Returns the first `top` rows of the word counts sorted by descending `n`,
# with the columns `book`, `word` and `n`.
top_positive_words <- function(book_title, top = 10) {
  Fairy_Tales %>%
    filter(book == book_title) %>%
    inner_join(bing_positive) %>%
    count(book, word, sort = TRUE) %>%
    head(top)
}

# Sleeping Beauty
Sleep_positive <- top_positive_words("Sleeping Beauty in the Wood")
## Joining, by = "word"
# Aladdin
Aladdin_positive <- top_positive_words("Aladdin and the Wonderful Lamp")
## Joining, by = "word"
# Beauty and the Beast
Beauty_positive <- top_positive_words("Beauty and the Beast")
## Joining, by = "word"
# Cinderella (the `book` label is spelled "Cinderalla" in the data)
Cinderalla_positive <- top_positive_words("Cinderalla")
## Joining, by = "word"

5.3.1 畫圖

# Combine the four per-book tables and draw one panel of horizontal bars per
# book, each bar labelled with its count.
# NOTE(review): reorder() ranks a word by its mean `n` over all rows, so a word
# shared by several books may not appear in sorted position inside a panel.
all_positive <- bind_rows(
  Sleep_positive,
  Beauty_positive,
  Aladdin_positive,
  Cinderalla_positive
)
ggplot(all_positive, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, hjust = 0)) +
  facet_wrap(~book, ncol = 2, scales = "free_y") +
  coord_flip() +
  labs(x = "字詞", y = NULL) +
  theme(text = element_text(size = 12))

  • 正面字幾乎都有beauty,good,great,gold
  • 比較特別的字詞例如阿拉丁有出現magic

6.1 正負面字詞比率

bing <- get_sentiments("bing")

# Total number of tokens in each book; the denominator of the ratio below.
wordcounts <- Fairy_Tales %>%
  group_by(book) %>%
  summarize(words = n())

# For every book and bing sentiment: number of matched words and their share
# of all tokens in that book.
plot_Fairt <- Fairy_Tales %>%
  inner_join(bing) %>%
  group_by(book, sentiment) %>%
  summarize(count = n()) %>%
  left_join(wordcounts, by = "book") %>%
  mutate(ratio = count / words)
## Joining, by = "word"
plot_Fairt
## # A tibble: 8 x 5
## # Groups:   book [4]
##   book                           sentiment count words  ratio
##   <chr>                          <chr>     <int> <int>  <dbl>
## 1 Aladdin and the Wonderful Lamp negative    156  5317 0.0293
## 2 Aladdin and the Wonderful Lamp positive    150  5317 0.0282
## 3 Beauty and the Beast           negative    241  7196 0.0335
## 4 Beauty and the Beast           positive    360  7196 0.0500
## 5 Cinderalla                     negative     48  2475 0.0194
## 6 Cinderalla                     positive     96  2475 0.0388
## 7 Sleeping Beauty in the Wood    negative     85  3654 0.0233
## 8 Sleeping Beauty in the Wood    positive    133  3654 0.0364

6.2 四本童話故事書正負面比率比較

# Side-by-side bars comparing the negative and positive word ratio per book;
# the x-axis labels are tilted by 15 degrees.
ggplot(plot_Fairt, aes(x = book, y = ratio, fill = sentiment)) +
  geom_col(position = "dodge") +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 15, hjust = 0.5, vjust = 0.5)
  )

  • 童話故事比較常出現正面字詞,以四本書來看,只有阿拉丁這本書出現的負面字詞比率較正面字詞高