資料來源
# Ask for a Traditional Chinese UTF-8 locale so Chinese text is not garbled.
# On the machine that produced the recorded output the locale was unavailable:
# R only warned and returned "", and the script carried on.
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
#> Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS
#> reports request to set locale to "zh_TW.UTF-8" cannot be honored
#> [1] ""

# Packages used by the analysis. Lines starting with `#>` are the console
# output recorded when each package was attached; they were previously mixed
# into the code, which made the script unparseable.
library(data.table)
library(dplyr)
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>     between, first, last
#> The following objects are masked from 'package:stats':
#>     filter, lag
#> The following objects are masked from 'package:base':
#>     intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(janeaustenr)
library(tidyr)
library(plotly)
#> Warning: package 'plotly' was built under R version 3.5.3
#> Loading required package: ggplot2
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#>     last_plot
#> The following object is masked from 'package:stats':
#>     filter
#> The following object is masked from 'package:graphics':
#>     layout
library(wordcloud2)
library(ggplot2)

# Read a plain-text file into a one-column table named `text`, one row per
# line of the file.
#   path  location of the text file
# `header = FALSE` is explicit: when every field on the first line is
# character, fread() treats that line as the column names, and the rename to
# "text" would then silently discard the first line of the file.
read_text_lines <- function(path) {
  lines <- fread(path, sep = "\n", header = FALSE)
  colnames(lines) <- "text"
  lines
}

# Remove empty lines and keep only the first occurrence of each distinct line.
#   lines  table with a `text` column
drop_blank_and_repeated_lines <- function(lines) {
  lines %>%
    filter(text != "") %>%
    distinct(text)
}

# Raw lines of the first three books and of the role list.
harryI <- read_text_lines("Harry Potter and the Sorcerer.txt")
harryII <- read_text_lines("Harry Potter and the Chamber of Secrets.txt")
harryIII <- read_text_lines("Harry Potter and the Prisoner of Azkaban.txt")
harry_role <- read_text_lines("harry role list.txt")

# Only the books are cleaned; the role list is kept exactly as read.
harryI <- drop_blank_and_repeated_lines(harryI)
harryII <- drop_blank_and_repeated_lines(harryII)
harryIII <- drop_blank_and_repeated_lines(harryIII)

# Based on the rules summarised above, regular expressions can be used to
# assign each sentence to a chapter, and stop words can be filtered out.
# Labels for the three books, in reading order.
episode <- c("I", "II", "III")

# Turn the cleaned lines of one book into one row per word.
#   book           table with a `text` column, one row per line
#   episode_label  value stored in the `episode` column of every row
# Returns the columns linenumber, episode, chapter and word, with every word
# that appears in `stop_words$word` removed.
tokenize_book <- function(book, episode_label) {
  book %>%
    mutate(
      linenumber = row_number(),
      episode = episode_label,
      # Every line starting with "CHAPTER " opens a new chapter, so the
      # running count of such lines is the chapter number (0 for any lines
      # before the first chapter heading).
      chapter = cumsum(str_detect(text, regex("^CHAPTER ")))
    ) %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word)
}

harryI <- tokenize_book(harryI, episode[1]) # book I has 17 chapters
harryII <- tokenize_book(harryII, episode[2]) # book II has 18 chapters
harryIII <- tokenize_book(harryIII, episode[3]) # book III has 22 chapters

# Combine the three books into one table.
harryI_III <- rbind(harryI, harryII, harryIII)

# One row per word of the role list, for matching against the book tokens.
harry_role <- harry_role %>%
  unnest_tokens(word, text)

# In total we use the three sentiment lexicons introduced in class (AFINN,
# Bing and NRC) plus one more that we found on GitHub.
# The three tidytext lexicons are fetched on demand with:
# get_sentiments("afinn")
# get_sentiments("bing")
# get_sentiments("nrc")

# Read a word list that has no header row and tag every row with `label`.
#   path   location of the word list
#   label  value stored in the `sentiment` column
# Returns the table read by fread() with its first column renamed to `word`
# and a `sentiment` column appended.
read_opinion_words <- function(path, label) {
  words <- fread(path, sep = " ", header = FALSE) %>%
    mutate(sentiment = label)
  colnames(words)[1] <- "word"
  words
}

# Fourth lexicon: separate positive and negative word lists, stacked into one
# table with the same word/sentiment layout as the Bing lexicon.
dic_other_p <- read_opinion_words("./positive-words.txt", "positive")
dic_other_n <- read_opinion_words("./negative-words.txt", "negative")
dic_other <- rbind(dic_other_p, dic_other_n)
head(dic_other, 10)
#>           word sentiment
#> 1       abound  positive
#> 2      abounds  positive
#> 3    abundance  positive
#> 4     abundant  positive
#> 5   accessable  positive
#> 6   accessible  positive
#> 7      acclaim  positive
#> 8    acclaimed  positive
#> 9  acclamation  positive
#> 10    accolade  positive
# Net Bing sentiment of every chapter of every book: the number of positive
# words minus the number of negative words.
# The join key is spelled out, so the `Joining, by = "word"` message that the
# original run printed here is no longer emitted.
harryI_III_sentiment <- harryI_III %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(episode, chapter, sentiment) %>%
  # One column per sentiment; chapters lacking one of them get a count of 0.
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
# Plot the net sentiment of every chapter, one panel per book.
# The original call passed `chapter` a second time as an unnamed third
# aesthetic; it mapped to nothing and has been removed.
harryI_III_sentiment %>%
  ggplot(aes(chapter, sentiment)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Plot of Harry Potter sentiment") +
  xlab("Chapter") +
  ylab("sentiment") +
  facet_wrap(~ episode, ncol = 2, scales = "free_x")

# AFINN scores words numerically, so the chapter sentiment is the sum of the
# scores rather than a positive-minus-negative count.
# Current tidytext releases name the numeric AFINN column `value`, older ones
# `score`; normalise to `score` so the summary works with either.
afinn_lexicon <- get_sentiments("afinn")
names(afinn_lexicon)[names(afinn_lexicon) == "value"] <- "score"

# The join key is spelled out, so the `Joining, by = "word"` message that the
# original run printed here is no longer emitted.
afinn <- harryI_III %>%
  inner_join(afinn_lexicon, by = "word") %>%
  group_by(episode, chapter) %>%
  summarise(sentiment = sum(score)) %>%
  mutate(method = "AFINN") %>%
  ungroup()
# Net sentiment per chapter for the three categorical lexicons (the GitHub
# word lists, Bing and NRC), labelled by `method` so they can be compared.
# The join keys are spelled out, so the three `Joining, by = "word"` messages
# that the original run printed here are no longer emitted.
bing_and_nrc_and_other <- bind_rows(
  harryI_III %>%
    inner_join(dic_other, by = "word") %>%
    mutate(method = "OTHER"),
  harryI_III %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  harryI_III %>%
    inner_join(
      # Only the positive/negative entries of NRC are kept; its other
      # categories are dropped before joining.
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, episode, chapter, sentiment) %>%
  # One column per sentiment; missing combinations get a count of 0.
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
# Compare the four lexicons: one column of panels per method and one row of
# panels per book, all sharing the same axes.
all_plot <- bind_rows(afinn, bing_and_nrc_and_other) %>%
  ggplot(aes(chapter, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_grid(episode ~ method, scales = "fixed")
all_plot

# How often each Bing-lexicon word occurs in each book, most frequent first.
# The join key is spelled out, so the `Joining, by = "word"` message that the
# original run printed here is no longer emitted.
bing_word_counts <- harryI_III %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(episode, word, sentiment, sort = TRUE) %>%
  ungroup()
bing_word_counts
#> # A tibble: 3,324 x 4
#>    episode word   sentiment     n
#>    <chr>   <chr>  <chr>     <int>
#>  1 III     fudge  negative    100
#>  2 II      dark   negative     86
#>  3 III     dark   negative     79
#>  4 III     magic  positive     64
#>  5 I       dark   negative     62
#>  6 II      magic  positive     58
#>  7 I       magic  positive     48
#>  8 III     slowly negative     47
#>  9 I       fell   negative     46
#> 10 II      hard   negative     46
#> # ... with 3,314 more rows
# Words that contribute most to each sentiment, per book. Only words counted
# more than 30 times in a book are shown.
# The original grouped by sentiment for a disabled `top_n(10)` step; the fixed
# cut-off does not depend on groups, so the grouping has been removed.
bing_word_counts %>%
  filter(n > 30) %>%
  # Order the bars by count.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(episode ~ sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

# Number of distinct negative / positive Bing words found in each book.
bing_I <- bing_word_counts %>%
  filter(episode == "I")
table(bing_I$sentiment)
#> negative positive
#>      602      295

bing_II <- bing_word_counts %>%
  filter(episode == "II")
table(bing_II$sentiment)
#> negative positive
#>      815      379

bing_III <- bing_word_counts %>%
  filter(episode == "III")
table(bing_III$sentiment)
#> negative positive
#>      844      389

# Total occurrences of negative and positive words per book, side by side.
bing_I_III <- rbind(bing_I, bing_II, bing_III) %>%
  ggplot(aes(episode, n)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "fixed")
bing_I_III

# Word cloud of book I: words that occur more than 20 times.
# The join key is spelled out, so the `Joining, by = "word"` message that the
# original run printed here is no longer emitted.
count_word_harryI <- harryI %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  filter(n > 20) %>%
  arrange(desc(n)) %>%
  wordcloud2()
# Display the word cloud of book I.
count_word_harryI

# Word cloud of book II: words that occur more than 20 times.
# The join key is spelled out, so the `Joining, by = "word"` message that the
# original run printed here is no longer emitted.
count_word_harryII <- harryII %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  filter(n > 20) %>%
  arrange(desc(n)) %>%
  wordcloud2()
# Display the word cloud of book II.
count_word_harryII

# Word cloud of book III: words that occur more than 20 times.
# The join key is spelled out, so the `Joining, by = "word"` message that the
# original run printed here is no longer emitted.
count_word_harryIII <- harryIII %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  filter(n > 20) %>%
  arrange(desc(n)) %>%
  wordcloud2()
# Display the word cloud of book III.
count_word_harryIII

# Most frequent role-list words in selected chapters of one book.
#   book      tokenised book with `chapter` and `word` columns
#   chapters  chapter numbers to keep
#   roles     table with a `word` column; only these words are counted
#   n_top     how many words to keep per chapter (ties are all kept by top_n)
# Returns one row per chapter/word with its count `n`, ordered by chapter and
# then by descending count.
top_roles_by_chapter <- function(book, chapters, roles = harry_role,
                                 n_top = 7) {
  book %>%
    filter(chapter %in% chapters) %>%
    inner_join(roles, by = c("word" = "word")) %>%
    count(chapter, word) %>%
    arrange(chapter, desc(n)) %>%
    group_by(chapter) %>%
    top_n(n = n_top, wt = n) %>%
    ungroup()
}

# Draw the per-chapter role counts as an interactive stacked bar chart.
harryI_choose_chapter <- top_roles_by_chapter(harryI, c(4, 15))
ggplotly(
  ggplot(harryI_choose_chapter, aes(chapter, n, fill = word)) +
    geom_col(show.legend = FALSE)
)

harryII_choose_chapter <- top_roles_by_chapter(harryII, c(10, 11, 15, 16))
ggplotly(
  ggplot(harryII_choose_chapter, aes(chapter, n, fill = word)) +
    geom_col(show.legend = FALSE)
)

harryIII_choose_chapter <- top_roles_by_chapter(
  harryIII,
  c(3, 10, 17, 19, 21)
)
ggplotly(
  ggplot(harryIII_choose_chapter, aes(chapter, n, fill = word)) +
    geom_col(show.legend = FALSE)
)