情緒分析_童話故事:快樂王子、灰姑娘

快樂王子是悲慘的童話故事，灰姑娘結尾比較歡樂，欲比較兩種故事類型的用詞、情緒

# Download Project Gutenberg book 30120 (the source of "The Happy Prince";
# per the original note the story sits in rows 53-352 of the result).
# Empty lines are dropped and duplicated (gutenberg_id, text) rows collapsed.
book_prince <- gutenberg_download(30120) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)

## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

# Keep rows 53-352 of the filtered, de-duplicated download; per the note on the
# download step these rows hold the Happy Prince story. The range is hard-coded,
# so it is only valid for this exact filter/distinct result -- TODO confirm if
# the upstream text or the filtering ever changes.
prince <- book_prince[53:352,]

# Download Project Gutenberg book 10830 (the Cinderella text), then drop empty
# lines and collapse duplicated (gutenberg_id, text) rows.
cinderalla <- gutenberg_download(10830) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)

# The Happy Prince
# Tokenizing would split "Happy Prince" into "happy" + "prince", inflating the
# count of the sentiment word "happy" and skewing the analysis. The name is
# therefore fused into the single token "HappyPrince" before tokenizing.

# Join every line into one string (space separated) so the name is matched
# even when it is split across two source lines, then substitute the name.
text <- gsub(
  pattern = "(H|h)appy (P|p)rince",
  replacement = "HappyPrince",
  x = paste0(prince$text, collapse = " ")
)

# Wrap the single string in a one-row tibble so it can be tokenized.
df <- tibble(text = text)

# The Happy Prince: one word per row, stop words removed.
# `linenumber` is the running row index of the remaining words (the text was
# collapsed into a single string, so it is a word position, not a source line).
tidy_prince <- df %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(stop_words) %>%
  mutate(linenumber = seq_len(n()))

## Joining, by = "word"

# Cinderella: one word per row, stop words removed.
# `linenumber` is the running row index of the remaining words, i.e. a word
# position rather than a source line number.
tidy_Cinderalla <- cinderalla %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(stop_words) %>%
  mutate(linenumber = seq_len(n()))

## Joining, by = "word"

# Cinderella: net Bing sentiment along the story.
# linenumber %/% 30 buckets the rows into consecutive bins of 30; each bin is
# scored as (positive count - negative count) and colored by the sign.
tidy_Cinderalla %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 30) %>%
  group_by(index, sentiment) %>%
  summarise(hits = n()) %>%
  spread(key = sentiment, value = hits, fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    # A net score of exactly 0 falls into the "positive" color class.
    color = ifelse(sentiment < 0, "negative", "positive")
  ) %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col(aes(fill = color)) +
  geom_text(aes(label = sentiment))

## Joining, by = "word"

劇情先悲後喜

# The Happy Prince: net Bing sentiment along the story.
# linenumber %/% 30 buckets the rows into consecutive bins of 30; each bin is
# scored as (positive count - negative count) and colored by the sign.
tidy_prince %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 30) %>%
  group_by(index, sentiment) %>%
  summarise(hits = n()) %>%
  spread(key = sentiment, value = hits, fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    # A net score of exactly 0 falls into the "positive" color class.
    color = ifelse(sentiment < 0, "negative", "positive")
  ) %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col(aes(fill = color)) +
  geom_text(aes(label = sentiment))

## Joining, by = "word"

劇情最後很悲慘，所以負面詞也多很多

# The Happy Prince: compare the three sentiment lexicons.
# AFINN part: sum the numeric AFINN scores inside bins of 5 rows.
prince_afinn <- tidy_prince %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 5) %>%
  group_by(index) %>%
  summarise(sentiment = sum(score)) %>%
  mutate(method = "AFINN")

## Joining, by = "word"

# Bing and NRC part: tag every matched word with its lexicon, stack both
# results, then compute (positive - negative) per lexicon and bin.
# Only the positive/negative categories of NRC are used.
prince_bing_and_nrc <- bind_rows(
  tidy_prince %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tidy_prince %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  # The text is short, so a finer bin width (5 rows) is used.
  mutate(index = linenumber %/% 5) %>%
  count(method, index, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"
## Joining, by = "word"

# One panel per lexicon, stacked vertically; each panel has its own y scale
# because AFINN sums scores while the other two count words.
ggplot(
  bind_rows(prince_afinn, prince_bing_and_nrc),
  aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, scales = "free_y", ncol = 1)

原本以為負面詞會很多，但其實還好，只有用Bing分析正負面的比例看起來差不多，可能跟字典收錄的詞有關

# Cinderella: compare the three sentiment lexicons.
# AFINN part: sum the numeric AFINN scores inside bins of 5 rows.
afinn <- tidy_Cinderalla %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 5) %>%
  group_by(index) %>%
  summarise(sentiment = sum(score)) %>%
  mutate(method = "AFINN")

## Joining, by = "word"

# Bing and NRC part: tag every matched word with its lexicon, stack both
# results, then compute (positive - negative) per lexicon and 5-row bin.
# Only the positive/negative categories of NRC are used.
bing_and_nrc <- bind_rows(
  tidy_Cinderalla %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tidy_Cinderalla %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 5) %>%
  count(method, index, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"
## Joining, by = "word"

# One panel per lexicon, stacked vertically; each panel has its own y scale
# because AFINN sums scores while the other two count words.
ggplot(
  bind_rows(afinn, bing_and_nrc),
  aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, scales = "free_y", ncol = 1)

灰姑娘用三個辭典分析的結果都是正面詞較多；故事剛開始雖然比較悲慘，到中後段就歡樂得多

# The Happy Prince: which words drive the positive and negative totals.
# Count every word found in the Bing lexicon together with its polarity and
# order the result from most to least frequent.
prince_bing_word_counts <- tidy_prince %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  ungroup()

## Joining, by = "word"

# Horizontal bars of the 10 most frequent words per polarity (ties can add
# extra rows), ordered by count, one panel per polarity.
prince_bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  xlab(NULL) +
  ylab("Contribution to sentiment") +
  coord_flip()

## Selecting by n

快樂王子負面詞描述生活環境的較多，與故事內描述幫助窮人的情況相符

# Cinderella: which words drive the positive and negative totals.
# Count every word found in the Bing lexicon together with its polarity and
# order the result from most to least frequent.
bing_word_counts <- tidy_Cinderalla %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  ungroup()

## Joining, by = "word"

# Horizontal bars of the 10 most frequent words per polarity (ties can add
# extra rows), ordered by count, one panel per polarity.
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  xlab(NULL) +
  ylab("Contribution to sentiment") +
  coord_flip()

## Selecting by n

灰姑娘的正負面詞大多用在描述人物上

poor、beautiful在兩本書中出現頻率都很高；在灰姑娘中poor多用來表示「可憐」，在快樂王子中則多指「窮人」。觀察發現童話故事書的用字都較簡單、直接

# Word cloud of the words in The Happy Prince (at most 100 words drawn).
# tidy_prince already had stop words removed when it was built, so the words
# are counted directly; the former second anti_join(stop_words) was a no-op
# that only repeated the join and its "Joining, by" message.
tidy_prince %>%
  count(word, sort = TRUE) %>%
  with(wordcloud(word, n, max.words = 100))

# Word cloud of the word counts in Cinderella (at most 100 words drawn).
# tidy_Cinderalla already had stop words removed when it was built, so the
# words are counted directly; the former second anti_join(stop_words) was a
# no-op that only repeated the join and its "Joining, by" message.
tidy_Cinderalla %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# The Happy Prince: positive vs. negative words in one comparison cloud.
# acast() turns the (word, sentiment, n) counts into a word-by-sentiment
# matrix (missing combinations filled with 0), the input comparison.cloud() gets.
tidy_prince %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, fill = 0, value.var = "n") %>%
  comparison.cloud(max.words = 100, colors = c("gray20", "gray80"))

## Joining, by = "word"

# Cinderella: positive vs. negative words in one comparison cloud.
# acast() turns the (word, sentiment, n) counts into a word-by-sentiment
# matrix (missing combinations filled with 0), the input comparison.cloud() gets.
tidy_Cinderalla %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, fill = 0, value.var = "n") %>%
  comparison.cloud(max.words = 100, colors = c("gray20", "gray80"))

## Joining, by = "word"

## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words =
## 100): stranger could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words =
## 100): troubles could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words =
## 100): tyrannical could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words =
## 100): uneasiness could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words =
## 100): unhappy could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words =
## 100): unworthy could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words =
## 100): wretched could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words =
## 100): yawn could not be fit on page. It will not be plotted.

情緒分析_童話故事:快樂王子、灰姑娘

第六組

2019年4月1日

安裝需要的packages

快樂王子是悲慘的童話故事，灰姑娘結尾比較歡樂，欲比較兩種故事類型的用詞、情緒

劇情先悲後喜

劇情最後很悲慘，所以負面詞也多很多

原本以為負面詞會很多，但其實還好，只有用Bing分析正負面的比例看起來差不多，可能跟字典收錄的詞有關

灰姑娘用三個辭典分析的結果都是正面詞較多；故事剛開始雖然比較悲慘，到中後段就歡樂得多

快樂王子負面詞描述生活環境的較多，與故事內描述幫助窮人的情況相符

灰姑娘的正負面詞大多用在描述人物上

poor、beautiful在兩本書中出現頻率都很高；在灰姑娘中poor多用來表示「可憐」，在快樂王子中則多指「窮人」。觀察發現童話故事書的用字都較簡單、直接

灰姑娘的分析結果符合預期；快樂王子的分析結果則不算太負面，可能是最後成為天使讓情緒被拉上去，也可能是快樂王子使用的正負面情緒詞比較少。推測童話書都不想讓小孩子接觸太負面的內容吧。