sentiment analysis in Harry Potter I~III

資料來源

Harry Potter 1 Harry Potter and the Sorcerer’s Stone
- 資料來源： https://reurl.cc/KdMEy
- 中文對照 : https://reurl.cc/L5NV3
Harry Potter 2 Harry Potter and the Chamber of Secrets
- 資料來源： https://reurl.cc/xploe
- 中文對照 : https://reurl.cc/G5eQx
Harry Potter 3 Harry Potter and the Prisoner of Azkaban
- 資料來源： https://reurl.cc/kMlk3
- 中文對照 : https://reurl.cc/RLv8e
role
- 資料來源： https://reurl.cc/Nb2an

系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # Request a Traditional Chinese UTF-8 locale so Chinese text is not garbled

Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
回報無法實現設定語區為 "zh_TW.UTF-8" 的要求

[1] ""

載入package

library(data.table)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(stringr)
library(tidytext)
library(janeaustenr)
library(tidyr)
library(plotly)

Warning: package 'plotly' was built under R version 3.5.3

Loading required package: ggplot2


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

library(wordcloud2)
library(ggplot2)

載入哈利波特第一到第三集文本

# Read each book and the character list with "\n" as the only separator, so
# every line of the file becomes one row of a one-column table, then name
# that column "text".
# NOTE(review): `header` is left to fread's auto-detection, so the first line
# of each file may be consumed as the column name instead of being kept as
# data -- confirm against the input files.
harryI <- fread("Harry Potter and the Sorcerer.txt", sep = "\n")
colnames(harryI) <- "text"

harryII <- fread("Harry Potter and the Chamber of Secrets.txt", sep = "\n")
colnames(harryII) <- "text"

harryIII <- fread("Harry Potter and the Prisoner of Azkaban.txt", sep = "\n")
colnames(harryIII) <- "text"

harry_role <- fread("harry role list.txt", sep = "\n")
colnames(harry_role) <- "text"

刪除每一集的空行

# Keep only the non-empty lines of a book, then keep one copy of each
# distinct line.
# NOTE(review): distinct() also removes repeated non-blank lines, not just
# blank ones -- confirm that this is intended.
drop_blank_lines <- function(book) {
  distinct(filter(book, text != ""), text)
}

harryI <- drop_blank_lines(harryI)
harryII <- drop_blank_lines(harryII)
harryIII <- drop_blank_lines(harryIII)

標記集數以及切割章節

根據上方整理出來的規則，我們可以使用正規表示式，將句子區分章節並且過濾stop word

# Episode labels, in reading order.
episode <- c("I", "II", "III")

# Turn one book into a one-row-per-word table.
#
#   book          : table with a character column `text`, one row per line
#   episode_label : value stored in the `episode` column of the result
#
# Each line is tagged with its row number (`linenumber`), the episode label
# and a chapter number before being split into words. The chapter number is
# the running count of lines that start with "CHAPTER ", so any line before
# the first such heading gets chapter 0. Words found in `stop_words$word`
# are dropped from the result.
tokenize_episode <- function(book, episode_label) {
  book %>%
    mutate(
      linenumber = row_number(),
      episode = episode_label,
      # Use the `text` column of the table being piped, not a global copy.
      chapter = cumsum(str_detect(text, regex("^CHAPTER ")))
    ) %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word)
}

harryI <- tokenize_episode(harryI, episode[1])     # episode I has 17 chapters
harryII <- tokenize_episode(harryII, episode[2])   # episode II has 18 chapters
harryIII <- tokenize_episode(harryIII, episode[3]) # episode III has 22 chapters

# Stack the three episodes into one token table.
harryI_III <- rbind(harryI, harryII, harryIII)

# Split the character list into one row per word as well.
harry_role <- harry_role %>%
  unnest_tokens(word, text)

情緒字典

我們共使用四個情緒字典，分別為老師介紹的 afinn、bing、nrc，以及我們在 GitHub 找到的正負面詞彙表（positive-words.txt 與 negative-words.txt）

# Lexicons obtained through get_sentiments() further below:
# get_sentiments("afinn")
# get_sentiments("bing")
# get_sentiments("nrc")

# Additional lexicon built from two local word lists. Each file is read
# without a header row, labelled with its polarity, and its first column is
# renamed to "word" so it can be joined against the token tables.
dic_other_p <- fread("./positive-words.txt", sep = " ", header = FALSE) %>%
  mutate(sentiment = "positive")
colnames(dic_other_p)[1] <- "word"

dic_other_n <- fread("./negative-words.txt", sep = " ", header = FALSE) %>%
  mutate(sentiment = "negative")
colnames(dic_other_n)[1] <- "word"

# Positive rows first, then negative rows.
dic_other <- rbind(dic_other_p, dic_other_n)
head(dic_other, 10)

          word sentiment
1       abound  positive
2      abounds  positive
3    abundance  positive
4     abundant  positive
5   accessable  positive
6   accessible  positive
7      acclaim  positive
8    acclaimed  positive
9  acclamation  positive
10    accolade  positive

依照chapter做情緒分析，並且畫出各集每個章節的情緒圖圖

# Net Bing sentiment per chapter: keep the tokens found in the Bing lexicon,
# count them per episode / chapter / polarity, spread the polarities into
# `positive` and `negative` columns (0 where a polarity is absent) and take
# their difference.
harryI_III_sentiment <- inner_join(harryI_III, get_sentiments("bing")) %>%
  count(episode, chapter, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)

Joining, by = "word"

# Plot the net sentiment of every chapter, one panel per episode.
# The fill aesthetic is named explicitly; the previous call passed `chapter`
# as an unnamed third aesthetic. The legend stays hidden because the panels
# already identify the episode.
harryI_III_sentiment %>%
  ggplot(aes(chapter, sentiment, fill = episode)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Plot of Harry Potter sentiment") +
  xlab("Chapter") +
  ylab("sentiment") +
  facet_wrap(~ episode, ncol = 2, scales = "free_x")

比較四個字典

# AFINN: sum the numeric `score` of the matched words per episode and
# chapter, and label the rows with the lexicon name.
# NOTE(review): newer releases of the AFINN lexicon are believed to name this
# column `value` instead of `score` -- confirm against the installed version.
afinn <- inner_join(harryI_III, get_sentiments("afinn")) %>%
  group_by(episode, chapter) %>%
  summarise(sentiment = sum(score)) %>%
  mutate(method = "AFINN")

Joining, by = "word"

# Match the tokens against the three polarity lexicons, label each match with
# the lexicon it came from, then compute positive minus negative counts per
# lexicon, episode and chapter. Only the "positive" / "negative" rows of NRC
# are kept before joining.
bing_and_nrc_and_other <- bind_rows(
  inner_join(harryI_III, dic_other) %>%
    mutate(method = "OTHER"),
  inner_join(harryI_III, get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  inner_join(
    harryI_III,
    filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
  ) %>%
    mutate(method = "NRC")
) %>%
  count(method, episode, chapter, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)

Joining, by = "word"
Joining, by = "word"
Joining, by = "word"

# Compare the four lexicons: one row of panels per episode, one column per
# lexicon, with the same axes in every panel.
all_plot <- ggplot(
  bind_rows(afinn, bing_and_nrc_and_other),
  aes(chapter, sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_grid(episode ~ method, scales = "fixed")
all_plot

分析結果

哈利波特I:

整體上都蠻負面的，較明顯的章節為ch4 ,15
- ch4 巨人海格——前往小屋為哈利祝賀11歲生日，告訴哈利他是個巫師。哈利也知道了他父母的真正死因和他額頭上傷疤的由來，對他姨父很氣憤。海格在描述他父母死因時使用了許多難過以及殺人的修飾，導致較為負面。
- ch15哈利、赫敏、馬爾福和奈威被懲罰，和海格到禁林裡去尋找獨角獸，哈利遇到了神秘的人。過程中遇到獨角獸以及提到狼人，該章節有許多負面詞彙討論獨角獸
ch1,5,13相較其他的章節較為正面
- ch1德思禮一家登場了。一場由門前台階上“從天而降”的哈利.波特以及鄧布利多與麥格教授的一段談話展開的奇幻故事拉起了帷幕。只有描述角色的出現。沒有使用相較負面或是正面的詞彙。
- ch5海格帶哈利去了倫敦的對角巷，哈利領回了他父親留給他的財產，還買了些學校需要的東西，一支魔杖，和一隻名叫海德薇的雪白的貓頭鷹。同時，海格從古靈閣拿到了一件神秘的包裹。哈利遇見了德拉科.馬爾福，海格把霍格沃茨特快的車票給了哈利。沒有發生什麼特殊事件，只有描述劇情
- ch13哈利找到尼可勒梅的名字，赫敏發現了路威看守的是魔法石，並告訴哈利和羅恩。也是交待劇情走向沒有特殊的事件發生

哈利波特II：

整體上都蠻負面的，較明顯的章節為ch10, 11, 15, 16
- ch10 這賽季的第一場魁地奇比賽,哈利也因此受傷。
- ch11 密室內居住了可怕的怪物。由於哈利會跟蛇講話，讓大家一度認為哈利波特是兇手。
- ch15 哈利和榮恩去森林裡,森林黑暗恐怖,並且遇到了蜘蛛阿拉哥跟他們說密室的事。
- ch16 找到密室入口,並遇到當時死亡的麥朵,也前往危險的密室去拯救金妮。
ch5相較前後兩章節特別負面
- ch4描述哈利被接到榮恩家中;去購買課本時，遇上知名作家吉德羅·洛哈，並遇上了馬份父子，魯休斯還趁大家不注意時，把湯姆·瑞斗的日記塞進金妮的大釜中;
- ch6 也是在寫一些日常上課的事;
- ch5 因為這段是開學當天，哈利和榮恩無法搭上特快車，只好偷開榮恩爸爸的飛天魔法車趕往學園，結果卻撞壞了珍貴的渾拼柳，讓石內卜大發雷霆，差點將他們趕出校園。整章較為緊張刺激，且包含偷，撞壞，退學等狀況，因此ch5(驚險)相較ch4,ch6(日常)較為負面。
最不負面的ch18
- 救完金妮的快樂結局！

哈利波特III：

整體上都蠻負面的，較明顯的章節為ch3, 10, 17, 19, 21。
- Ch3 剛逃離威農姨丈家，充滿恐懼不安，擔心被懲罰;加上介紹天狼星用了許多外界給予天狼星的負面評價。
- Ch12 較為正面情緒，可能與招喚守護神中哈利找尋快樂記憶有關。
- Ch17 和天狼星對峙，心中掙扎要不要殺他的過程中，有許多負面情緒。

哈利波特I~III:

哈利波特整體由於是奇幻小說，會時常出現可怕的怪物或是遇見怪誕的事物，作者用了許多方式去形容，導致 sentiment 的分數都偏低

計算各個字的情緒貢獻

# How often each Bing-lexicon word occurs in each episode, most frequent
# first.
bing_word_counts <- inner_join(harryI_III, get_sentiments("bing")) %>%
  count(episode, word, sentiment, sort = TRUE) %>%
  ungroup()

Joining, by = "word"

bing_word_counts

# A tibble: 3,324 x 4
   episode word   sentiment     n
   <chr>   <chr>  <chr>     <int>
 1 III     fudge  negative    100
 2 II      dark   negative     86
 3 III     dark   negative     79
 4 III     magic  positive     64
 5 I       dark   negative     62
 6 II      magic  positive     58
 7 I       magic  positive     48
 8 III     slowly negative     47
 9 I       fell   negative     46
10 II      hard   negative     46
# ... with 3,314 more rows

繪製成圖表

# Horizontal bar chart of the words that occur more than 30 times within an
# episode: rows are episodes, columns are polarities. The threshold does not
# depend on any grouping, so the rows are filtered directly.
# (A top_n(10) cut-off was tried here as an alternative to the threshold.)
bing_word_counts %>%
  filter(n > 30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(episode ~ sentiment, scales = "free_y") +
  labs(x = NULL, y = "Contribution to sentiment") +
  coord_flip()

計算種類數

# Episode I rows of the word counts (one row per word / polarity pair), and
# the number of such rows for each polarity.
bing_I <- filter(bing_word_counts, episode == "I")
table(bing_I$sentiment)


negative positive 
     602      295

# Episode II rows of the word counts (one row per word / polarity pair), and
# the number of such rows for each polarity.
bing_II <- filter(bing_word_counts, episode == "II")
table(bing_II$sentiment)


negative positive 
     815      379

# Episode III rows of the word counts (one row per word / polarity pair), and
# the number of such rows for each polarity.
bing_III <- filter(bing_word_counts, episode == "III")
table(bing_III$sentiment)


negative positive 
     844      389

# Stack the three per-episode tables and draw the word counts `n` as columns
# per episode, one panel per polarity, with shared axes.
bing_I_III <- ggplot(rbind(bing_I, bing_II, bing_III), aes(episode, n)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "fixed")
bing_I_III

繪製成文字雲

# Word cloud for episode I: drop stop words, count each word, keep those
# occurring more than 20 times, most frequent first.
count_word_harryI <- wordcloud2(
  anti_join(harryI, stop_words) %>%
    count(word) %>%
    filter(n > 20) %>%
    arrange(desc(n))
)

Joining, by = "word"

count_word_harryI

# Word cloud for episode II: drop stop words, count each word, keep those
# occurring more than 20 times, most frequent first.
count_word_harryII <- wordcloud2(
  anti_join(harryII, stop_words) %>%
    count(word) %>%
    filter(n > 20) %>%
    arrange(desc(n))
)

Joining, by = "word"

count_word_harryII

# Word cloud for episode III: drop stop words, count each word, keep those
# occurring more than 20 times, most frequent first.
count_word_harryIII <- wordcloud2(
  anti_join(harryIII, stop_words) %>%
    count(word) %>%
    filter(n > 20) %>%
    arrange(desc(n))
)

Joining, by = "word"

count_word_harryIII

哈利波特三集挑出negative特別高的章節，找出該章節出現角色的次數，與劇情連結

# Count how often each character-name word occurs in the selected chapters of
# one tokenised episode.
#
#   tokens   : one-row-per-word table with `chapter` and `word` columns
#   chapters : chapter numbers to keep
#   roles    : table with a `word` column of character-name words
#
# Returns one row per chapter / word with its count `n`, sorted by chapter and
# then by descending count, limited to the 7 highest counts per chapter
# (top_n() keeps ties).
# A filtering join is used so that a name word listed several times in
# `roles` does not duplicate the matching tokens and inflate the counts.
count_roles_in_chapters <- function(tokens, chapters, roles = harry_role) {
  tokens %>%
    filter(chapter %in% chapters) %>%
    semi_join(roles, by = "word") %>%
    count(chapter, word) %>%
    arrange(chapter, desc(n)) %>%
    group_by(chapter) %>%
    top_n(n = 7, wt = n) %>%
    ungroup()
}

# Interactive stacked column chart of the counts: one column per chapter, one
# colour per character-name word, without a legend.
plot_role_counts <- function(role_counts) {
  ggplotly(
    ggplot(role_counts, aes(chapter, n, fill = word)) +
      geom_col(show.legend = FALSE)
  )
}

harryI_choose_chapter <- count_roles_in_chapters(harryI, c(4, 15))
plot_role_counts(harryI_choose_chapter)

harryII_choose_chapter <- count_roles_in_chapters(harryII, c(10, 11, 15, 16))
plot_role_counts(harryII_choose_chapter)

harryIII_choose_chapter <- count_roles_in_chapters(
  harryIII,
  c(3, 10, 17, 19, 21)
)
plot_role_counts(harryIII_choose_chapter)

分析結果

episode I

ch4 巨人hagrid——前往小屋為哈利祝賀11歲生日，告訴哈利他是個巫師。哈利也知道了他父母的真正死因和他額頭上傷疤的由來，對他姨父（dursley family）很氣憤。海格在描述他父母死因時使用了許多難過以及殺人的修飾，導致較為負面。
ch15 harry、Hermione、malfoy和neville被懲罰，和hagrid到禁忌森林裡去尋找獨角獸，harry遇到了神秘的人馬(ronan)。過程中遇到獨角獸以及提到狼人，該章節有許多負面詞彙討論獨角獸

episode II

ch10 dobby在這章出現次數高是因為他希望哈利波特受傷並從霍格沃茨回家，從而避開密室和斯萊特林蛇的開放,所以偷偷竄改了魁地奇比賽。另外因為在講魁地奇球賽,madam pince出現的次數也較多。
ch11 在決鬥社上malfoy用咒語變出毒蛇攻擊哈利，snape表示要救哈利，被試圖表現的洛哈阻止，但最後仍由他除掉毒蛇。
ch15 這章特別出現aragog,是因為他們進入森林,遇到了蜘蛛aragog跟他們說密室的事。(hagrid叫主角去找它,fang跟主角們一起進去)相較於第一部跟第三部,這部出現了很多次ginny weasley (因為她在此書中是這一年中,一連串對麻瓜學生的神秘襲擊的關鍵),尤其是在本書ch16,就是在描述harry跟ron(也是金妮的哥哥)為拯救ginny weasley進入密室的故事,所以ron,weasley,ginny的數量都比其他章節更多一些。

episode III

ch3 harry從姨丈家逃出來，並從車掌聽說black逃獄，開啟了整集的序幕
ch10 harry在酒吧聽到教授們談論black以前的故事，並且以為是black背叛harry父親進而害死他
ch17 harry、hermione、ron發現lupin教授與black是舊識，並說到是black害死harry父親且殺害pettigrew
ch19 pettigrew從一隻寵物鼠現形，lupin與black向harry等人述說pettigrew加入佛地魔並背叛harry爸爸的事
ch21 dumbledore示意harry與hermione回到過去，拯救無辜的black與鷹馬

社群媒體分析_HW2(group8)

bolun lin

2019-04-01 21:03:57

sentiment analysis in Harry Potter I~III

系統參數設定

載入package

載入哈利波特第一到第三集文本

刪除每一集的空行

標記集數以及切割章節

情緒字典

依照chapter做情緒分析，並且畫出各集每個章節的情緒圖圖

比較四個字典

分析結果

哈利波特I:

哈利波特II：

哈利波特III：

哈利波特I~III:

計算各個字的情緒貢獻

繪製成圖表

計算種類數

繪製成文字雲

哈利波特三集挑出negative特別高的章節，找出該章節出現角色的次數，與劇情連結

分析結果

episode I

episode II

episode III