library(data.table)
## Warning: package 'data.table' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.2
library(tidytext)
library(janeaustenr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.2
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(wordcloud2)
library(ggplot2)
library(igraph)
## Warning: package 'igraph' was built under R version 3.5.2
##
## Attaching package: 'igraph'
## The following object is masked from 'package:plotly':
##
## groups
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
harry1 = fread("Harry Potter and the Sorcerer.txt", sep = "\n", header = F)
harry2 = fread("Harry Potter and the Chamber of Secrets.txt", sep = "\n" ,header = F)
harry3 = fread("Harry Potter and the Prisoner of Azkaban.txt", sep = "\n", header = F)
harry4 = fread("Harry Potter and the Goblet of Fire.txt", sep = "\n", header = F)
harry5 = fread("Harry Potter and the Order of the Phoenix.txt", sep = "\n", header = F)
harry6 = fread("Harry Potter and the Half-Blood Prince.txt", sep = "\n", header = F)
harry7 = fread("Harry Potter and the Deathly Hallows.txt", sep = "\n", header = F)
colnames(harry1) = "text"
colnames(harry2) = "text"
colnames(harry3) = "text"
colnames(harry4) = "text"
colnames(harry5) = "text"
colnames(harry6) = "text"
colnames(harry7) = "text"
harry1 = harry1 %>%
subset(text != "") %>%
distinct(text) %>%
.[-1, ] %>%
mutate(book = "1the Sorcerer",chapter = cumsum(str_detect(.$text, regex("^CHAPTER.*( | |$)"))))
harry2 = harry2 %>%
subset(text != "") %>%
distinct(text) %>%
.[-c(1,2), ] %>%
mutate(book = "2the Chamber of Secrets",chapter = cumsum(str_detect(.$text, regex("^CHAPTER.*( | |$)"))))
harry3 = harry3 %>%
subset(text != "") %>%
distinct(text) %>%
.[-c(1,2), ] %>%
mutate(book = "3the Prisoner of Azkaban",chapter = cumsum(str_detect(.$text, regex("^CHAPTER.*( | |$)"))))
harry4 = harry4 %>%
subset(text != "") %>%
distinct(text) %>%
.[-1, ] %>%
mutate(book = "4the Goblet of Fire",chapter = cumsum(str_detect(.$text, regex("^CHAPTER.*( | |$)"))))
harry5 = harry5 %>%
subset(text != "") %>%
distinct(text) %>%
.[-c(1,2), ] %>%
mutate(book = "5the Order of the Phoenix",chapter = cumsum(str_detect(.$text, regex("^- CHAPTER.*( | |$)"))))
harry6 = harry6 %>%
subset(text != "") %>%
distinct(text) %>%
.[-c(1,2), ] %>%
mutate(book = "6the Half-Blood Prince",chapter = cumsum(str_detect(.$text, regex("^Chapter.*( | |$)"))))
harry7 = harry7 %>%
subset(text != "") %>%
distinct(text) %>%
mutate(book = "7the Deathly Hallows",chapter = cumsum(str_detect(.$text, regex("^Chapter.*( | |$)"))))
harry = do.call("rbind", list(harry1, harry2, harry3, harry4, harry5, harry6, harry7))
# 把text移到最後一欄
harry = harry[c(2,3,1)]
# 各詞彙在每本書中的出現次數
harry_words = harry %>%
unnest_tokens(word, text) %>%
count(book, word, sort = T)
# 每本書的字數
total_words = harry_words %>%
group_by(book) %>%
summarise(total = sum(n))
# 在harry_words加上該詞彙所在書籍之總字數
harry_words = left_join(harry_words, total_words)
## Joining, by = "book"
ggplot(harry_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = F) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1117 rows containing non-finite values (stat_bin).
## Warning: Removed 7 rows containing missing values (geom_bar).
freq_by_rank = harry_words %>%
group_by(book) %>%
mutate(rank = row_number(), 'term frequency' = n/total)
# rank、term frequency取log後的關係
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = F) +
scale_x_log10() +
scale_y_log10()
# rank在80~1000間的比較接近線性
rank_subset <- freq_by_rank %>%
filter(rank < 1000, rank > 80)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.6647 -1.0847
# 加上abline
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_abline(intercept = -0.6785, slope = -1.08, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
harry_words = harry_words %>%
bind_tf_idf(word, book, n) %>%
select(-total) %>%
arrange(desc(tf_idf))
harry_words
## # A tibble: 65,990 x 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 7the Deathly Hallows k 958 0.00474 1.25 0.00594
## 2 7the Deathly Hallows x 1345 0.00666 0.560 0.00373
## 3 6the Half-Blood Prince slughorn 337 0.00197 1.25 0.00246
## 4 5the Order of the Phoenix umbridge 497 0.00192 0.847 0.00163
## 5 7the Deathly Hallows ll 254 0.00126 1.25 0.00158
## 6 4the Goblet of Fire bagman 209 0.00109 1.25 0.00136
## 7 2the Chamber of Secrets lockhart 196 0.00229 0.560 0.00128
## 8 3the Prisoner of Azkaban lupin 368 0.00350 0.336 0.00118
## 9 4the Goblet of Fire winky 145 0.000755 1.25 0.000946
## 10 7the Deathly Hallows xenophilius 92 0.000456 1.95 0.000887
## # … with 65,980 more rows
harry_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 3, scales = "free") +
theme(axis.text = element_text(size = 8)) +
coord_flip()
## Selecting by tf_idf
1.第一集(the Sorcerer)中,Quirrell這個詞高出其他許多,他是第一集的反派(被佛地魔附身);Flamel是600多歲的魔法師,為鄧不利多老友,並且是書中唯一能製造魔法石的人。 2.第二集(the Chamber of Secrets),Gilderoy Lockhart為學校新老師,曾在密室中想消除哈利與榮恩的記憶;Dobby是家庭小精靈,起初阻止哈利回霍格華茲,後來時常出現幫助哈利脫困;Riddle則是佛地魔名字,本集闡述許多佛地魔以前的故事。 3.第三集(the Prisoner of Azkaban),Lupin教會哈利使用護法咒來對抗催狂魔,並且他與哈利父親和本集重要人物-天狼星有密不可分的關係;Pettigrew為哈利父親、Lupin、天狼星為學生時期玩伴,但他出賣了哈利父母,使後者遭佛地魔殺害。 4.第四集(the Coblet of Fire),Bagman是此集才有的角色,作為魔法遊戲與運動部司長,在三巫鬥法大賽扮演貫穿劇情的人;Winky為另一個家庭小精靈,在本集首度登場並且也扮演貫穿劇情的角色。 5.第五集(the Order of the Phoenix),Umbridge是霍格華茲新老師兼總督察,與哈利等人有著對立的關係。 6.第六集(the Half-Blood Prince),Slughorn曾是霍格華茲魔藥學教授,本集鄧不利多說服他回學校重新任教黑魔法防禦老師,他相當重視學生才華,因此對哈利與妙麗相當友善,並招募他們進Slughorn俱樂部。
有趣的是,在第1、2、3、5、6集中,最高的都是黑魔法防禦學老師。
harry_bigrams = harry %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
# count 2-grams
harry_bigrams %>%
count(bigram, sort = T)
## # A tibble: 337,192 x 2
## bigram n
## <chr> <int>
## 1 of the 4864
## 2 in the 3558
## 3 said harry 2608
## 4 he was 2483
## 5 at the 2435
## 6 to the 2377
## 7 on the 2353
## 8 he had 2146
## 9 it was 2128
## 10 out of 1900
## # … with 337,182 more rows
bigrams_separated <- harry_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
# 篩選出word1、word2都沒有在stopwords內的
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 87,365 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 professor mcgonagall 584
## 2 uncle vernon 391
## 3 harry potter 358
## 4 death eaters 341
## 5 harry looked 323
## 6 harry ron 307
## 7 aunt petunia 215
## 8 invisibility cloak 192
## 9 dark arts 178
## 10 professor trelawney 176
## # … with 87,355 more rows
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
#bigrams_united
bigrams_filtered %>%
filter(word1 == "professor") %>%
count(book, word2, sort = TRUE)
## # A tibble: 172 x 3
## book word2 n
## <chr> <chr> <int>
## 1 5the Order of the Phoenix umbridge 174
## 2 5the Order of the Phoenix mcgonagall 139
## 3 3the Prisoner of Azkaban lupin 106
## 4 1the Sorcerer mcgonagall 90
## 5 3the Prisoner of Azkaban mcgonagall 83
## 6 2the Chamber of Secrets mcgonagall 78
## 7 4the Goblet of Fire mcgonagall 76
## 8 6the Half-Blood Prince mcgonagall 67
## 9 3the Prisoner of Azkaban trelawney 64
## 10 5the Order of the Phoenix trelawney 51
## # … with 162 more rows
Professor McGonagall是麥教授,霍格華茲副校長、葛萊芬多院長與變形學教授,因此在各集書中出現頻率特別高。 Professor Lupin為第三集中新任的教授,與天狼星關係匪淺,因此在第三集中有大量戲份。 Professor Umbridge 是第五集中新任的黑魔法防禦教授,並擔任霍格華茲的總督察,與哈利等人算是對立的勢力。 Professor Trelawney 是占卜學教授,他在第三集向哈利預言了他和天狼星有關係。
bigram_tf_idf <- bigrams_united %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
## # A tibble: 105,253 x 6
## book bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 5the Order of the Phoenix professor umbridge 174 0.00536 1.25 0.00672
## 2 3the Prisoner of Azkaban professor lupin 106 0.00731 0.847 0.00620
## 3 7the Deathly Hallows elder wand 60 0.00270 1.95 0.00526
## 4 4the Goblet of Fire ludo bagman 49 0.00201 1.95 0.00390
## 5 3the Prisoner of Azkaban aunt marge 42 0.00290 1.25 0.00363
## 6 7the Deathly Hallows death eaters 132 0.00594 0.560 0.00333
## 7 4the Goblet of Fire madame maxime 89 0.00364 0.847 0.00309
## 8 7the Deathly Hallows deathly hallows 32 0.00144 1.95 0.00280
## 9 2the Chamber of Secrets gilderoy lockhart 26 0.00217 1.25 0.00271
## 10 6the Half-Blood Prince advanced potion 27 0.00130 1.95 0.00252
## # … with 105,243 more rows
bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(book) %>%
top_n(9) %>%
ungroup() %>%
ggplot(aes(bigram, tf_idf, fill = book)) +
geom_col(show.legend = F) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 3, scales = "free") +
theme(axis.text = element_text(size = 8)) +
coord_flip()
## Selecting by tf_idf
AFINN <- get_sentiments("afinn")
negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, score, sort = T)
#前十名情緒字前面有加上negation words之影響
negated_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
mutate(word1 = factor(word1, levels = rev(unique(word1)))) %>%
#mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>%
group_by(word1) %>%
top_n(10, wt = abs(contribution)) %>%
ungroup() %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, contribution, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by negation term") +
ylab("Sentiment score * number of occurrences") +
facet_wrap(~word1, ncol = 2, scales = "free") +
coord_flip()
##反轉情緒
#把內容切成2字word1,word2,順便把stop word清掉
harry_bigrams<-harry%>%
unnest_tokens(bigram,text,token='ngrams',n=2)%>%
separate(bigram,c('word1','word2'),sep=' ')%>%
filter(!word1%in%stop_words$word)%>%
filter(!word2%in%stop_words$word)
#和afinn情緒字典合併算出這七集的情緒起伏
afinn<-get_sentiments("afinn")
before_sentiment<-harry_bigrams%>%
count(book,chapter,word1,word2,sort=TRUE)%>%
inner_join(afinn,by=c(word2="word"))%>%
mutate(sentiment=n*score)%>%
group_by(book,chapter)%>%
summarise(sentiment=sum(sentiment))%>%
mutate(turn="before")
#畫出反轉前情緒起伏
before_sentiment%>%
#filter(book=='1the Scorcerer')%>%
ggplot(aes(chapter,sentiment))+
geom_col(show.legend = TRUE)+
facet_wrap(~book,scales='free',ncol=2)+
labs(y='book sentiment',x='chapter')
#開始反轉情緒
after_sentiment<-harry_bigrams%>%
count(book,chapter,word1,word2,sort=TRUE)%>%
inner_join(afinn,by=c(word2="word"))%>%
mutate(afinn=ifelse(word1=="didn"|word1=="don",-1*score,score))%>%
group_by(book,chapter)%>%
summarise(sentiment=sum(afinn))%>%
mutate(turn="after")
#graph
after_sentiment%>%
ggplot(aes(chapter,sentiment))+
geom_col(show.lengend=TRUE)+
facet_wrap(~book,scales='free',ncol=2)+
labs(y='book sentiment',x='chapter')
## Warning: Ignoring unknown parameters: show.lengend
#列出反轉前後做比較,僅第六章較明顯(圖太多)
before_after<-rbind(before_sentiment,after_sentiment)
before_after%>%
filter(book=='6the Half-Blood Prince')%>%
ggplot(aes(chapter,sentiment))+
geom_col(show.lengend=TRUE)+
facet_wrap(book~turn,scales='free',ncol=2)+
labs(y='before after',x='chapter')
## Warning: Ignoring unknown parameters: show.lengend
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
filter(n > 50) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH 8d2a1f2 DN-- 107 79 --
## + attr: name (v/c), n (e/n)
## + edges from 8d2a1f2 (vertex names):
## [1] professor ->mcgonagall uncle ->vernon
## [3] harry ->potter death ->eaters
## [5] harry ->looked harry ->ron
## [7] aunt ->petunia invisibility->cloak
## [9] dark ->arts professor ->trelawney
## [11] professor ->umbridge death ->eater
## [13] dark ->lord madam ->pomfrey
## [15] entrance ->hall lord ->voldemort
## + ... omitted several edges
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1, size = 3)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1, size = 3) +
theme_void()
透過2-grams的網路圖可以發現哈利波特書中,兩兩字的關聯幾乎都圍繞在Professor與Harry居多,與佛地魔(外號黑魔王)、黑魔法相關的dark也有一些。