load package

library(data.table)
## Warning: package 'data.table' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.2
library(tidytext)
library(janeaustenr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.2
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(wordcloud2)
library(ggplot2)
library(igraph)
## Warning: package 'igraph' was built under R version 3.5.2
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:plotly':
## 
##     groups
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)

load HARRY POTTER full text file

harry1 = fread("Harry Potter and the Sorcerer.txt", sep = "\n", header = F)
harry2 = fread("Harry Potter and the Chamber of Secrets.txt", sep = "\n" ,header = F)
harry3 = fread("Harry Potter and the Prisoner of Azkaban.txt", sep = "\n", header = F)
harry4 = fread("Harry Potter and the Goblet of Fire.txt", sep = "\n", header = F)
harry5 = fread("Harry Potter and the Order of the Phoenix.txt", sep = "\n", header = F)
harry6 = fread("Harry Potter and the Half-Blood Prince.txt", sep = "\n", header = F)
harry7 = fread("Harry Potter and the Deathly Hallows.txt", sep = "\n", header = F)
colnames(harry1) = "text"
colnames(harry2) = "text"
colnames(harry3) = "text"
colnames(harry4) = "text"
colnames(harry5) = "text"
colnames(harry6) = "text"
colnames(harry7) = "text"

刪除空行、加入書名、標註章節

harry1 = harry1 %>%
  subset(text != "") %>%
  distinct(text) %>%
  .[-1, ] %>%
  mutate(book = "1the Sorcerer",chapter = cumsum(str_detect(.$text, regex("^CHAPTER.*( | |$)"))))
harry2 = harry2 %>%
  subset(text != "") %>%
  distinct(text) %>%
  .[-c(1,2), ] %>%
  mutate(book = "2the Chamber of Secrets",chapter = cumsum(str_detect(.$text, regex("^CHAPTER.*( | |$)"))))
harry3 = harry3 %>%
  subset(text != "") %>%
  distinct(text) %>%
  .[-c(1,2), ] %>%
  mutate(book = "3the Prisoner of Azkaban",chapter = cumsum(str_detect(.$text, regex("^CHAPTER.*( | |$)"))))
harry4 = harry4 %>%
  subset(text != "") %>%
  distinct(text) %>%
  .[-1, ] %>%
  mutate(book = "4the Goblet of Fire",chapter = cumsum(str_detect(.$text, regex("^CHAPTER.*( | |$)"))))
harry5 = harry5 %>%
  subset(text != "") %>%
  distinct(text) %>%
  .[-c(1,2), ] %>%
  mutate(book = "5the Order of the Phoenix",chapter = cumsum(str_detect(.$text, regex("^- CHAPTER.*( | |$)"))))
harry6 = harry6 %>%
  subset(text != "") %>%
  distinct(text) %>%
  .[-c(1,2), ] %>%
  mutate(book = "6the Half-Blood Prince",chapter = cumsum(str_detect(.$text, regex("^Chapter.*( | |$)"))))
harry7 = harry7 %>%
  subset(text != "") %>%
  distinct(text) %>%
  mutate(book = "7the Deathly Hallows",chapter = cumsum(str_detect(.$text, regex("^Chapter.*( | |$)"))))

將七集合併成一個data frame

harry = do.call("rbind", list(harry1, harry2, harry3, harry4, harry5, harry6, harry7))

# 把text移到最後一欄
harry = harry[c(2,3,1)] 
# 各詞彙在每本書中的出現次數
harry_words = harry %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = T)

# 每本書的字數
total_words = harry_words %>% 
  group_by(book) %>%
  summarise(total = sum(n))

# 在harry_words加上該詞彙所在書籍之總字數
harry_words = left_join(harry_words, total_words)
## Joining, by = "book"

tf在不同集的分布

ggplot(harry_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend =  F) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1117 rows containing non-finite values (stat_bin).
## Warning: Removed 7 rows containing missing values (geom_bar).

Zipf’s law

freq_by_rank = harry_words %>%
  group_by(book) %>%
  mutate(rank = row_number(), 'term frequency' = n/total)
# rank、term frequency取log後的關係
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = F) + 
  scale_x_log10() +
  scale_y_log10()

# rank在80~1000間的比較接近線性
rank_subset <- freq_by_rank %>% 
  filter(rank < 1000, rank > 80)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -0.6647      -1.0847
# 加上abline
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_abline(intercept = -0.6785, slope = -1.08, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

tf-idf

harry_words = harry_words %>%
  bind_tf_idf(word, book, n) %>%
  select(-total) %>%
  arrange(desc(tf_idf))

harry_words
## # A tibble: 65,990 x 6
##    book                      word            n       tf   idf   tf_idf
##    <chr>                     <chr>       <int>    <dbl> <dbl>    <dbl>
##  1 7the Deathly Hallows      k             958 0.00474  1.25  0.00594 
##  2 7the Deathly Hallows      x            1345 0.00666  0.560 0.00373 
##  3 6the Half-Blood Prince    slughorn      337 0.00197  1.25  0.00246 
##  4 5the Order of the Phoenix umbridge      497 0.00192  0.847 0.00163 
##  5 7the Deathly Hallows      ll            254 0.00126  1.25  0.00158 
##  6 4the Goblet of Fire       bagman        209 0.00109  1.25  0.00136 
##  7 2the Chamber of Secrets   lockhart      196 0.00229  0.560 0.00128 
##  8 3the Prisoner of Azkaban  lupin         368 0.00350  0.336 0.00118 
##  9 4the Goblet of Fire       winky         145 0.000755 1.25  0.000946
## 10 7the Deathly Hallows      xenophilius    92 0.000456 1.95  0.000887
## # … with 65,980 more rows

挑出各書本中tf-idf高的幾個

harry_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  top_n(10) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 3, scales = "free") +
  theme(axis.text = element_text(size = 8)) +
  coord_flip()
## Selecting by tf_idf

1.第一集(the Sorcerer)中,Quirrell這個詞高出其他許多,他是第一集的反派(被佛地魔附身);Flamel是600多歲的魔法師,為鄧不利多老友,並且是書中唯一能製造魔法石的人。 2.第二集(the Chamber of Secrets),Gilderoy Lockhart為學校新老師,曾在密室中想消除哈利與榮恩的記憶;Dobby是家庭小精靈,起初阻止哈利回霍格華茲,後來時常出現幫助哈利脫困;Riddle則是佛地魔名字,本集闡述許多佛地魔以前的故事。 3.第三集(the Prisoner of Azkaban),Lupin教會哈利使用護法咒來對抗催狂魔,並且他與哈利父親和本集重要人物-天狼星有密不可分的關係;Pettigrew為哈利父親、Lupin、天狼星為學生時期玩伴,但他出賣了哈利父母,使後者遭佛地魔殺害。 4.第四集(the Coblet of Fire),Bagman是此集才有的角色,作為魔法遊戲與運動部司長,在三巫鬥法大賽扮演貫穿劇情的人;Winky為另一個家庭小精靈,在本集首度登場並且也扮演貫穿劇情的角色。 5.第五集(the Order of the Phoenix),Umbridge是霍格華茲新老師兼總督察,與哈利等人有著對立的關係。 6.第六集(the Half-Blood Prince),Slughorn曾是霍格華茲魔藥學教授,本集鄧不利多說服他回學校重新任教黑魔法防禦老師,他相當重視學生才華,因此對哈利與妙麗相當友善,並招募他們進Slughorn俱樂部。

有趣的是,在第1、2、3、5、6集中,最高的都是黑魔法防禦學老師。

Tokenizing by n-gram

harry_bigrams = harry %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# count 2-grams
harry_bigrams %>%
  count(bigram, sort = T)
## # A tibble: 337,192 x 2
##    bigram         n
##    <chr>      <int>
##  1 of the      4864
##  2 in the      3558
##  3 said harry  2608
##  4 he was      2483
##  5 at the      2435
##  6 to the      2377
##  7 on the      2353
##  8 he had      2146
##  9 it was      2128
## 10 out of      1900
## # … with 337,182 more rows

separate 2-grams

bigrams_separated <- harry_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# 篩選出word1、word2都沒有在stopwords內的
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 87,365 x 3
##    word1        word2          n
##    <chr>        <chr>      <int>
##  1 professor    mcgonagall   584
##  2 uncle        vernon       391
##  3 harry        potter       358
##  4 death        eaters       341
##  5 harry        looked       323
##  6 harry        ron          307
##  7 aunt         petunia      215
##  8 invisibility cloak        192
##  9 dark         arts         178
## 10 professor    trelawney    176
## # … with 87,355 more rows

recombine word1 & word2

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

#bigrams_united

analyze “professor”

bigrams_filtered %>%
  filter(word1 == "professor") %>%
  count(book, word2, sort = TRUE)
## # A tibble: 172 x 3
##    book                      word2          n
##    <chr>                     <chr>      <int>
##  1 5the Order of the Phoenix umbridge     174
##  2 5the Order of the Phoenix mcgonagall   139
##  3 3the Prisoner of Azkaban  lupin        106
##  4 1the Sorcerer             mcgonagall    90
##  5 3the Prisoner of Azkaban  mcgonagall    83
##  6 2the Chamber of Secrets   mcgonagall    78
##  7 4the Goblet of Fire       mcgonagall    76
##  8 6the Half-Blood Prince    mcgonagall    67
##  9 3the Prisoner of Azkaban  trelawney     64
## 10 5the Order of the Phoenix trelawney     51
## # … with 162 more rows

Professor McGonagall是麥教授,霍格華茲副校長、葛萊芬多院長與變形學教授,因此在各集書中出現頻率特別高。 Professor Lupin為第三集中新任的教授,與天狼星關係匪淺,因此在第三集中有大量戲份。 Professor Umbridge 是第五集中新任的黑魔法防禦教授,並擔任霍格華茲的總督察,與哈利等人算是對立的勢力。 Professor Trelawney 是占卜學教授,他在第三集向哈利預言了他和天狼星有關係。

analyze 2-grams

bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
## # A tibble: 105,253 x 6
##    book                      bigram                 n      tf   idf  tf_idf
##    <chr>                     <chr>              <int>   <dbl> <dbl>   <dbl>
##  1 5the Order of the Phoenix professor umbridge   174 0.00536 1.25  0.00672
##  2 3the Prisoner of Azkaban  professor lupin      106 0.00731 0.847 0.00620
##  3 7the Deathly Hallows      elder wand            60 0.00270 1.95  0.00526
##  4 4the Goblet of Fire       ludo bagman           49 0.00201 1.95  0.00390
##  5 3the Prisoner of Azkaban  aunt marge            42 0.00290 1.25  0.00363
##  6 7the Deathly Hallows      death eaters         132 0.00594 0.560 0.00333
##  7 4the Goblet of Fire       madame maxime         89 0.00364 0.847 0.00309
##  8 7the Deathly Hallows      deathly hallows       32 0.00144 1.95  0.00280
##  9 2the Chamber of Secrets   gilderoy lockhart     26 0.00217 1.25  0.00271
## 10 6the Half-Blood Prince    advanced potion       27 0.00130 1.95  0.00252
## # … with 105,243 more rows

各書本中前9個擁有最高tf-idf的2-grams

bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(book) %>% 
  top_n(9) %>% 
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = book)) +
  geom_col(show.legend = F) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 3, scales = "free") +
  theme(axis.text = element_text(size = 8)) +
  coord_flip()
## Selecting by tf_idf

利用AFINN詞庫作分析,並考慮negation words

AFINN <- get_sentiments("afinn")
negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = T)

#前十名情緒字前面有加上negation words之影響
negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  mutate(word1 = factor(word1, levels = rev(unique(word1)))) %>%
  #mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>%
  group_by(word1) %>%
  top_n(10, wt = abs(contribution)) %>%
  ungroup() %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, contribution, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by negation term") +
  ylab("Sentiment score * number of occurrences") +
  facet_wrap(~word1, ncol = 2, scales = "free") +
  coord_flip()

##反轉情緒

#把內容切成2字word1,word2,順便把stop word清掉
harry_bigrams<-harry%>%
  unnest_tokens(bigram,text,token='ngrams',n=2)%>%
  separate(bigram,c('word1','word2'),sep=' ')%>%
  filter(!word1%in%stop_words$word)%>%
  filter(!word2%in%stop_words$word)
#和afinn情緒字典合併算出這七集的情緒起伏
afinn<-get_sentiments("afinn")
before_sentiment<-harry_bigrams%>%
  count(book,chapter,word1,word2,sort=TRUE)%>%
  inner_join(afinn,by=c(word2="word"))%>%
  mutate(sentiment=n*score)%>%
  group_by(book,chapter)%>%
  summarise(sentiment=sum(sentiment))%>%
  mutate(turn="before")
#畫出反轉前情緒起伏
before_sentiment%>%
  #filter(book=='1the Scorcerer')%>%
  ggplot(aes(chapter,sentiment))+
  geom_col(show.legend = TRUE)+
  facet_wrap(~book,scales='free',ncol=2)+
  labs(y='book sentiment',x='chapter')

#開始反轉情緒
after_sentiment<-harry_bigrams%>%
  count(book,chapter,word1,word2,sort=TRUE)%>%
  inner_join(afinn,by=c(word2="word"))%>%
  mutate(afinn=ifelse(word1=="didn"|word1=="don",-1*score,score))%>%
  group_by(book,chapter)%>%
  summarise(sentiment=sum(afinn))%>%
  mutate(turn="after")
#graph
after_sentiment%>%
  ggplot(aes(chapter,sentiment))+
  geom_col(show.lengend=TRUE)+
  facet_wrap(~book,scales='free',ncol=2)+
  labs(y='book sentiment',x='chapter')
## Warning: Ignoring unknown parameters: show.lengend

#列出反轉前後做比較,僅第六章較明顯(圖太多)
before_after<-rbind(before_sentiment,after_sentiment)
before_after%>%
  filter(book=='6the Half-Blood Prince')%>%
  ggplot(aes(chapter,sentiment))+
  geom_col(show.lengend=TRUE)+
  facet_wrap(book~turn,scales='free',ncol=2)+
  labs(y='before after',x='chapter')
## Warning: Ignoring unknown parameters: show.lengend

反轉結果並沒有非常顯著,但在一些情緒分數較低的章節確實有看到反轉的情況

關聯性

# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 50) %>%
  graph_from_data_frame()

bigram_graph
## IGRAPH 8d2a1f2 DN-- 107 79 -- 
## + attr: name (v/c), n (e/n)
## + edges from 8d2a1f2 (vertex names):
##  [1] professor   ->mcgonagall uncle       ->vernon    
##  [3] harry       ->potter     death       ->eaters    
##  [5] harry       ->looked     harry       ->ron       
##  [7] aunt        ->petunia    invisibility->cloak     
##  [9] dark        ->arts       professor   ->trelawney 
## [11] professor   ->umbridge   death       ->eater     
## [13] dark        ->lord       madam       ->pomfrey   
## [15] entrance    ->hall       lord        ->voldemort 
## + ... omitted several edges
set.seed(2017)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1, size = 3)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1, size = 3) +
  theme_void()

透過2-grams的網路圖可以發現哈利波特書中,兩兩字的關聯幾乎都圍繞在Professor與Harry居多,與佛地魔(外號黑魔王)、黑魔法相關的dark也有一些。