SMS_HW2

1. 載入package

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
require(tidytext)

## Loading required package: tidytext

library(wordcloud2)
require(data.table)

## Loading required package: data.table

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

require(ggplot2)

## Loading required package: ggplot2

require(reshape2)

## Loading required package: reshape2

## 
## Attaching package: 'reshape2'

## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

require(wordcloud)

## Loading required package: wordcloud

## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'wordcloud'

require(tidyr)

## Loading required package: tidyr

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

require(readr)

## Loading required package: readr

require(scales)

## Loading required package: scales

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

1.1 匯入檔案&資料前處理

資料來源：http://www.mythfolklore.net/andrewlang/blue.htm

# Turn the raw text of one story into a one-word-per-row table.
#
# The text is split on literal periods, each resulting piece is numbered
# (`linenumber`), and every piece is tokenised into words.
#
# raw_text: a single string holding the whole story.
# title:    label written to the `book` column of every row.
# Returns a data frame with the columns `linenumber`, `word` and `book`.
tokenize_tale <- function(raw_text, title) {
  pieces <- strsplit(raw_text, "[.]")[[1]]
  # stringsAsFactors = FALSE keeps `text` as character regardless of the
  # R version's default, so no separate as.character() step is needed.
  tokens <- data.frame(text = pieces, stringsAsFactors = FALSE) %>%
    mutate(linenumber = row_number()) %>%
    unnest_tokens(word, text)
  # rep() keeps the assignment valid even when no tokens were produced.
  tokens$book <- rep(title, nrow(tokens))
  tokens
}

# The label keeps the spelling "Cinderalla" because later filters on
# `book` match this exact string.
Cinderalla <- read_file("Cinderella.txt")
C <- tokenize_tale(Cinderalla, "Cinderalla")

Beauty <- read_file("Beauty and the beast.txt")
B <- tokenize_tale(Beauty, "Beauty and the Beast")

Sleep <- read_file("Sleeping beauty in the wood.txt")
S <- tokenize_tale(Sleep, "Sleeping Beauty in the Wood")

Aladdin <- read_file("ALADDIN AND THE WONDERFUL LAMP.txt")
A <- tokenize_tale(Aladdin, "Aladdin and the Wonderful Lamp")

1.2 合併四本書

# Stack the four tokenised stories into one table (row order: B, C, S, A).
Fairy_Tales <- do.call(rbind, list(B, C, S, A))

2. NRC 字典過濾出“joy”字詞

# Keep only the NRC lexicon entries whose sentiment is "joy".
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words that occur in "Beauty and the Beast", most frequent first.
Fairy_Tales %>%
  filter(book == "Beauty and the Beast") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))

## Joining, by = "word"

## # A tibble: 77 x 2
##    word         n
##    <chr>    <int>
##  1 beauty      80
##  2 found       24
##  3 good        19
##  4 happy       11
##  5 splendid     8
##  6 journey      6
##  7 love         6
##  8 promise      6
##  9 glad         5
## 10 save         5
## # ... with 67 more rows

3.1 用bing字典比較四本童話故事的情緒分布

library(tidyr)

# Net bing sentiment per book and line: match words against the bing
# lexicon, count positive/negative hits on each line, then subtract.
bing_Fairy_Tales <- Fairy_Tales %>%
  inner_join(get_sentiments("bing")) %>%
  # %/% 1 leaves the line number unchanged, so each line is its own bin.
  count(book, index = linenumber %/% 1, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    method = "bing"
  )

## Joining, by = "word"

bing_Fairy_Tales

## # A tibble: 526 x 6
##    book                           index negative positive sentiment method
##    <chr>                          <dbl>    <dbl>    <dbl>     <dbl> <chr> 
##  1 Aladdin and the Wonderful Lamp     1        4        1        -3 bing  
##  2 Aladdin and the Wonderful Lamp     2        2        0        -2 bing  
##  3 Aladdin and the Wonderful Lamp     3        1        0        -1 bing  
##  4 Aladdin and the Wonderful Lamp     4        1        0        -1 bing  
##  5 Aladdin and the Wonderful Lamp     5        2        1        -1 bing  
##  6 Aladdin and the Wonderful Lamp     8        1        0        -1 bing  
##  7 Aladdin and the Wonderful Lamp    10        1        0        -1 bing  
##  8 Aladdin and the Wonderful Lamp    11        1        0        -1 bing  
##  9 Aladdin and the Wonderful Lamp    12        1        0        -1 bing  
## 10 Aladdin and the Wonderful Lamp    13        0        3         3 bing  
## # ... with 516 more rows

因為文本字數不多，這裡以每一行（以句點切分出的一段文字）作為一個區段（linenumber %/% 1），所以index跟原本的行數一樣。

3.1.1 bing畫圖

library(ggplot2)

# One panel per book: net bing sentiment of every line, coloured by book.
bing_Fairy_Tales %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_x", ncol = 2)

童話故事最後的結局幾乎都是正面的，尤其是灰姑娘及美女與野獸。(補充：睡美人最後面幾乎都在講王后的結局，因此負面情緒較高)
Beauty and the Beast 在大約第27至30行之間負面情緒很高，主要是變成野獸的時候十分無助，因為他朋友都以為他已經死了。
Sleeping Beauty在第58行時獲得高分的正面情緒，正是王子親完公主時，對公主表達愛慕之意

3.2 用Afinn字典比較四本童話故事的情緒分數

# AFINN sentiment per book, line and score value: count how many words
# carry each score on a line and weight the count by that score.
# NOTE(review): newer tidytext releases reportedly name the AFINN column
# `value` instead of `score` -- confirm before upgrading the package.
Afinn_Fairy_Tales <- Fairy_Tales %>%
  inner_join(get_sentiments("afinn")) %>%
  # %/% 1 leaves the line number unchanged, so each line is its own bin.
  count(book, index = linenumber %/% 1, score) %>%
  mutate(
    sentiment = score * n,
    method = "Afinn"
  )

## Joining, by = "word"

Afinn_Fairy_Tales

## # A tibble: 913 x 6
##    book                           index score     n sentiment method
##    <chr>                          <dbl> <int> <int>     <int> <chr> 
##  1 Aladdin and the Wonderful Lamp     1    -2     2        -4 Afinn 
##  2 Aladdin and the Wonderful Lamp     1     2     1         2 Afinn 
##  3 Aladdin and the Wonderful Lamp     2    -3     1        -3 Afinn 
##  4 Aladdin and the Wonderful Lamp     2    -2     2        -4 Afinn 
##  5 Aladdin and the Wonderful Lamp     4    -3     1        -3 Afinn 
##  6 Aladdin and the Wonderful Lamp     8    -3     1        -3 Afinn 
##  7 Aladdin and the Wonderful Lamp     9     1     1         1 Afinn 
##  8 Aladdin and the Wonderful Lamp    11    -2     1        -2 Afinn 
##  9 Aladdin and the Wonderful Lamp    12    -1     1        -1 Afinn 
## 10 Aladdin and the Wonderful Lamp    13     2     2         4 Afinn 
## # ... with 903 more rows

3.2.1 Afinn畫圖

# Add up the sentiment values belonging to each index; otherwise the
# plotted values would be wrong. The result stays grouped by `book`.
Afinn_Fairy_Tales1 <- Afinn_Fairy_Tales %>%
  group_by(book, index) %>%
  summarise(
    sentiment = sum(sentiment),
    method = "Afinn"
  )

# One panel per book: summed AFINN score of every line, coloured by book.
ggplot(Afinn_Fairy_Tales1, aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_x", ncol = 2)

3.3 用NRC字典比較四本童話故事的情緒分數

# Net NRC sentiment per book and line, using only the lexicon's
# "positive" and "negative" categories.
NRC_Fairy_Tales <- Fairy_Tales %>%
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  # %/% 1 leaves the line number unchanged, so each line is its own bin.
  count(book, index = linenumber %/% 1, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    method = "NRC"
  )

## Joining, by = "word"

NRC_Fairy_Tales

## # A tibble: 566 x 6
##    book                           index negative positive sentiment method
##    <chr>                          <dbl>    <dbl>    <dbl>     <dbl> <chr> 
##  1 Aladdin and the Wonderful Lamp     1        1        0        -1 NRC   
##  2 Aladdin and the Wonderful Lamp     2        1        0        -1 NRC   
##  3 Aladdin and the Wonderful Lamp     3        1        1         0 NRC   
##  4 Aladdin and the Wonderful Lamp     4        0        1         1 NRC   
##  5 Aladdin and the Wonderful Lamp     5        2        2         0 NRC   
##  6 Aladdin and the Wonderful Lamp     6        1        1         0 NRC   
##  7 Aladdin and the Wonderful Lamp     7        1        2         1 NRC   
##  8 Aladdin and the Wonderful Lamp     8        0        2         2 NRC   
##  9 Aladdin and the Wonderful Lamp     9        1        1         0 NRC   
## 10 Aladdin and the Wonderful Lamp    10        2        1        -1 NRC   
## # ... with 556 more rows

3.3.1 NRC畫圖

library(ggplot2)

# One panel per book: net NRC sentiment of every line, coloured by book.
NRC_Fairy_Tales %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_x", ncol = 2)

3.4 四本童話故事在三種字典下的比較結果

# Combine the per-line results of the three lexicons and draw them as a
# grid: one row per lexicon (`method`), one column per book.
book_all <- bind_rows(
  bing_Fairy_Tales,
  Afinn_Fairy_Tales1,
  NRC_Fairy_Tales
)
ggplot(book_all, aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_grid(method ~ book, scales = "free_x") +
  labs(x = NULL, y = NULL)

結論：可發現三種字典對於童話故事的正負面字詞判斷都差不多，但是Afinn字典所計算出的情緒起伏比較極端。可能是因為Afinn是將每個字詞的分數（score）加總，而bing跟NRC只統計negative與positive字詞的出現次數，所以數值差距較大。

4.最常出現的正面字及負面字

# Occurrences of every bing-lexicon word across all four books,
# together with its sentiment label, most frequent first.
bing_word_counts <- inner_join(Fairy_Tales, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

## Joining, by = "word"

bing_word_counts

## # A tibble: 463 x 3
##    word      sentiment     n
##    <chr>     <chr>     <int>
##  1 beauty    positive     83
##  2 good      positive     39
##  3 great     positive     36
##  4 beautiful positive     22
##  5 well      positive     20
##  6 fell      negative     18
##  7 gold      positive     17
##  8 like      positive     15
##  9 dead      negative     14
## 10 fine      positive     14
## # ... with 453 more rows

5.1 畫圖

# Horizontal bar chart of the top 10 words (by count) within each
# sentiment, one panel per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  # Order the word factor by count so the bars are sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip() +
  labs(x = NULL, y = "Contribution to sentiment")

## Selecting by n

童話故事書常見的正負面字詞，負面字詞常出現兇猛、貧窮、死亡、失去、恐懼；正面字詞則有漂亮、美麗、愛、滿足等

5.2 統計四本書負向字前十名

# Sleeping Beauty: first ten bing-negative words after sorting by count.
Sleep_negative <- Fairy_Tales %>%
  filter(book == "Sleeping Beauty in the Wood") %>%
  inner_join(filter(get_sentiments("bing"), sentiment == "negative")) %>%
  count(book, word, sort = TRUE) %>%
  head(n = 10)

## Joining, by = "word"

# Aladdin: first ten bing-negative words after sorting by count.
Aladdin_negative <- Fairy_Tales %>%
  filter(book == "Aladdin and the Wonderful Lamp") %>%
  inner_join(filter(get_sentiments("bing"), sentiment == "negative")) %>%
  count(book, word, sort = TRUE) %>%
  head(n = 10)

## Joining, by = "word"

# Beauty and the Beast: first ten bing-negative words after sorting by count.
Beauty_negative <- Fairy_Tales %>%
  filter(book == "Beauty and the Beast") %>%
  inner_join(filter(get_sentiments("bing"), sentiment == "negative")) %>%
  count(book, word, sort = TRUE) %>%
  head(n = 10)

## Joining, by = "word"

# Cinderella (labelled "Cinderalla" in the data): first ten bing-negative
# words after sorting by count.
Cinderalla_negative <- Fairy_Tales %>%
  filter(book == "Cinderalla") %>%
  inner_join(filter(get_sentiments("bing"), sentiment == "negative")) %>%
  count(book, word, sort = TRUE) %>%
  head(n = 10)

## Joining, by = "word"

5.2.1 畫圖

# Gather the four per-book negative top lists and draw one horizontal
# bar chart per book, with the count printed next to each bar.
all_negative <- bind_rows(
  Sleep_negative,
  Beauty_negative,
  Aladdin_negative,
  Cinderalla_negative
)
ggplot(all_negative, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, hjust = -0.5)) +
  facet_wrap(~book, ncol = 2, scales = "free_y") +
  coord_flip() +
  labs(x = "字詞", y = NULL) +
  theme(text = element_text(size = 12))

阿拉丁代表字：slave奴隸；睡美人：fell睡著；美女與野獸：野獸本身有許多負面的形容詞
負面的字詞幾乎都有談到貧窮、死亡有關，也是在童話故事中常見的背景描述

5.3 統計四本書正向字前十名

# Sleeping Beauty: first ten bing-positive words after sorting by count.
Sleep_positive <- Fairy_Tales %>%
  filter(book == "Sleeping Beauty in the Wood") %>%
  inner_join(filter(get_sentiments("bing"), sentiment == "positive")) %>%
  count(book, word, sort = TRUE) %>%
  head(n = 10)

## Joining, by = "word"

# Aladdin: first ten bing-positive words after sorting by count.
Aladdin_positive <- Fairy_Tales %>%
  filter(book == "Aladdin and the Wonderful Lamp") %>%
  inner_join(filter(get_sentiments("bing"), sentiment == "positive")) %>%
  count(book, word, sort = TRUE) %>%
  head(n = 10)

## Joining, by = "word"

# Beauty and the Beast: first ten bing-positive words after sorting by count.
Beauty_positive <- Fairy_Tales %>%
  filter(book == "Beauty and the Beast") %>%
  inner_join(filter(get_sentiments("bing"), sentiment == "positive")) %>%
  count(book, word, sort = TRUE) %>%
  head(n = 10)

## Joining, by = "word"

# Cinderella (labelled "Cinderalla" in the data): first ten bing-positive
# words after sorting by count.
Cinderalla_positive <- Fairy_Tales %>%
  filter(book == "Cinderalla") %>%
  inner_join(filter(get_sentiments("bing"), sentiment == "positive")) %>%
  count(book, word, sort = TRUE) %>%
  head(n = 10)

## Joining, by = "word"

5.3.1 畫圖

# Gather the four per-book positive top lists and draw one horizontal
# bar chart per book, with the count printed at the end of each bar.
all_positive <- bind_rows(
  Sleep_positive,
  Beauty_positive,
  Aladdin_positive,
  Cinderalla_positive
)
ggplot(all_positive, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, hjust = 0)) +
  facet_wrap(~book, ncol = 2, scales = "free_y") +
  coord_flip() +
  labs(x = "字詞", y = NULL) +
  theme(text = element_text(size = 12))

正面字幾乎都有beauty,good,great,gold
比較特別的字詞例如阿拉丁有出現magic

6.1 正負面字詞比率

bing <- get_sentiments("bing")

# Total number of tokens in each book; used as the ratio denominator.
wordcounts <- Fairy_Tales %>%
  group_by(book) %>%
  summarise(words = n())

# Share of each book's tokens that the bing lexicon marks as positive or
# negative. The result stays grouped by `book`.
plot_Fairt <- Fairy_Tales %>%
  inner_join(bing) %>%
  group_by(book, sentiment) %>%
  summarise(count = n()) %>%
  left_join(wordcounts, by = "book") %>%
  mutate(ratio = count / words)

## Joining, by = "word"

plot_Fairt

## # A tibble: 8 x 5
## # Groups:   book [4]
##   book                           sentiment count words  ratio
##   <chr>                          <chr>     <int> <int>  <dbl>
## 1 Aladdin and the Wonderful Lamp negative    156  5317 0.0293
## 2 Aladdin and the Wonderful Lamp positive    150  5317 0.0282
## 3 Beauty and the Beast           negative    241  7196 0.0335
## 4 Beauty and the Beast           positive    360  7196 0.0500
## 5 Cinderalla                     negative     48  2475 0.0194
## 6 Cinderalla                     positive     96  2475 0.0388
## 7 Sleeping Beauty in the Wood    negative     85  3654 0.0233
## 8 Sleeping Beauty in the Wood    positive    133  3654 0.0364

6.2 四本童話故事書正負面比率比較

# Side-by-side bars of the positive and negative word ratio for each book;
# the x-axis labels are tilted by 15 degrees.
ggplot(plot_Fairt, aes(x = book, y = ratio, fill = sentiment)) +
  geom_col(position = "dodge") +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 15, hjust = 0.5, vjust = 0.5)
  )

童話故事比較常出現正面字詞；以四本書來看，只有阿拉丁這本書出現的負面字詞比率較正面字詞高。