Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales','gutenbergr','jiebaR')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
library(stringr)
require(tidytext)
require(jiebaR)
library(wordcloud2)
library(gutenbergr)
require(data.table)
require(ggplot2)
require(reshape2)
require(wordcloud)
require(tidyr)
require(readr)
require(scales)
#設定資料夾
#設定資料夾
setwd('C:/Users/XA/Documents/R')
getwd()
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")
# 魔戒索引
lortindex <-read.csv("lort_index.csv", header = TRUE, sep = ",")
lort1 <- read_delim("./the lord of rings 1.txt", col_names = "text", delim = "\n",quote = " ") %>%
filter(text!="")
lort2 <- read_delim("./the lord of rings 2.txt", col_names = "text", delim = "\n",quote = " ") %>%
filter(text!="")
lort3 <- read_delim("./the lord of rings 3.txt", col_names = "text", delim = "\n",quote = " ") %>%
filter(text!="")
#將文本以書本和章節區分
lort1_parts <- lort1 %>%
mutate(linenumber = row_number(),
book = cumsum(str_detect(text, regex("^<BOOK [A-Z]*>"))),
chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]",
ignore_case = TRUE))))
lort2_parts <- lort2 %>%
mutate(linenumber = row_number(),
book = cumsum(str_detect(text, regex("^<BOOK [A-Z]*>"))),
chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]",
ignore_case = TRUE))))
lort3_parts <- lort3 %>%
mutate(linenumber = row_number(),
book = cumsum(str_detect(text, regex("^<BOOK [A-Z]*>"))),
chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]",
ignore_case = TRUE))))
lort_alltext <- bind_rows(lort1, lort2, lort3 )%>%
mutate(book = cumsum(str_detect(text, regex("^<BOOK [A-Z]*>"))),
chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]",
ignore_case = TRUE))))
lort_alltext
## # A tibble: 35,789 x 3
## text book chapter
## <chr> <int> <int>
## 1 "THE FELLOWSHIP OF THE RING " 0 0
## 2 "J.R.R.ToIkien " 0 0
## 3 "<BOOK I> " 1 0
## 4 "Chapter 1 . A Long-expected Party " 1 1
## 5 "When Mr. Bilbo Baggins of Bag End announced that he woul~ 1 1
## 6 "celebrating his eleventy -first birthday with a party of~ 1 1
## 7 "magnificence, there was much talk and excitement in Hobb~ 1 1
## 8 "Bilbo was very rich and very peculiar, and had been the ~ 1 1
## 9 "Shire for sixty years, ever since his remarkable disappe~ 1 1
## 10 "unexpected return. The riches he had brought back from h~ 1 1
## # ... with 35,779 more rows
#匯入停止文字
data(stop_words)
add_stop <- bind_rows(stop_words,
data_frame(word = c("ring", "lord", "rings", "j.r.r.toikien"),
lexicon = c("custom")))
#匯入人名
#主角群
data_names_protagonists <- c("frodo", "baggins","samwise","sam", "meriadoc", "merry",
"peregrin","pippin", "pip", "gandalf","aragorn","strider",
"legolas", "gimli","boromir","denethor", "faramir",
"galadriel", "celeborn", "elrond","arwen",
"bilbo","theoden", "treebeard")
#邪惡方
data_name_antagonists <- c("sauron", "ringwraiths", "nazgul", "nazgyl", "saruman",
"grima", "orcses","orcs","angmar" ,"goblins","trolls",
"barrow-wights", "gollum", "shelob", "balrog", "morgoth")
#word 分析
word_lort1 <- lort1_parts %>%
unnest_tokens(word, text) %>%
anti_join(add_stop)%>%
group_by(word)
word_lort2 <- lort2_parts %>%
unnest_tokens(word, text) %>%
anti_join(add_stop)%>%
group_by(word)
word_lort3 <- lort3_parts %>%
unnest_tokens(word, text) %>%
anti_join(add_stop)%>%
group_by(word)
#三部合併 計算詞頻
lort_total <- bind_rows(lort1_parts %>%
mutate(book ="lort 1"),
lort2_parts %>%
mutate(book ="lort 2"),
lort3_parts %>%
mutate(book = "lort 3"))%>%
unnest_tokens(word, text) %>%
count(book, word, sort = TRUE) %>% #以書本區分, 並計算各文字出現次數
ungroup()
lort_total_words <- lort_total %>%
group_by(book) %>%
summarize(total = sum(n)) #計算每本書的字元數
lort_total <- left_join(lort_total, lort_total_words)#保留lort_total的資料,將lort_total_words合併進
lort_total
## # A tibble: 29,345 x 4
## book word n total
## <chr> <chr> <int> <int>
## 1 lort 1 the 10998 179384
## 2 lort 2 the 8892 154929
## 3 lort 3 the 7563 128837
## 4 lort 1 and 7177 179384
## 5 lort 3 and 5943 128837
## 6 lort 2 and 5910 154929
## 7 lort 1 of 4674 179384
## 8 lort 2 of 3968 154929
## 9 lort 1 to 3740 179384
## 10 lort 1 a 3596 179384
## # ... with 29,335 more rows
#三部曲詞頻繪圖, 篩選n>150的詞 ,並繪製長條圖
ggplot_word <- lort_total %>%
anti_join(add_stop)%>%
filter(!word %in% data_names_protagonists) %>% #篩掉人名
filter(!word %in% data_name_antagonists) %>% #篩掉人名
filter(n>150) %>%
mutate(word = reorder(word, n)) %>% #根據n重排word排序
ggplot(aes(word, n), fill = book) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, scales = "free_y") +
labs(y = "Word Frequency",
x = NULL) +
geom_text(aes(label=n))+
coord_flip()
ggplot_word
#先計算字的頻率
freq_by_rank <- lort_total %>%
group_by(book) %>%
mutate(rank = row_number(),
prop = n/total) %>%
ungroup()
freq_by_rank
## # A tibble: 29,345 x 6
## book word n total rank prop
## <chr> <chr> <int> <int> <int> <dbl>
## 1 lort 1 the 10998 179384 1 0.0613
## 2 lort 2 the 8892 154929 1 0.0574
## 3 lort 3 the 7563 128837 1 0.0587
## 4 lort 1 and 7177 179384 2 0.0400
## 5 lort 3 and 5943 128837 2 0.0461
## 6 lort 2 and 5910 154929 2 0.0381
## 7 lort 1 of 4674 179384 3 0.0261
## 8 lort 2 of 3968 154929 3 0.0256
## 9 lort 1 to 3740 179384 4 0.0208
## 10 lort 1 a 3596 179384 5 0.0200
## # ... with 29,335 more rows
#Zipf's law
ggplot_zipf <- freq_by_rank %>%
mutate(word = reorder(word, prop))%>%
filter(prop > 0.01)%>%
ggplot(aes(word, prop, fill = book)) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap(~book, ncol = 3, scales = "free_y")+
geom_text(aes(label=percent(prop)))+ #以百分比顯示各word的頻率
coord_flip()
ggplot_zipf
#繪製曲線圖
ggplot_log <- freq_by_rank %>%
ggplot(aes(rank, prop, color = book)) +
geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
ggplot_log
#bind_tf_idf
book_idf <- lort_total %>%
bind_tf_idf(word, book, n) %>%
select(-total) %>%
arrange(desc(tf_idf))
ggplot_idf <- book_idf %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 1, scales = "free") +
coord_flip()
ggplot_idf
afinn_all <- lort_alltext %>%
unnest_tokens(word, text) %>%
anti_join(add_stop)%>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = chapter) %>%
summarise(sentiment = sum(score)) %>%
mutate(method = "AFINN")
bing_nrc_all <- bind_rows(lort_alltext %>%
unnest_tokens(word, text) %>%
anti_join(add_stop)%>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
lort_alltext %>%
unnest_tokens(word, text) %>%
anti_join(add_stop)%>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC")) %>%
count(method, index = chapter, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
bind_rows(afinn_all,
bing_nrc_all) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
xlab("chapter")+
facet_wrap(~method, ncol = 1, scales = "free_y")+
geom_vline(xintercept = as.numeric(c(23,44)), color="black")
afinn_lort_all <- bind_rows( word_lort1 %>%
inner_join(get_sentiments("afinn")) %>%
mutate(method = "afinn to lort1"),
word_lort2 %>%
inner_join(get_sentiments("afinn")) %>%
mutate(method = "afinn to lort2"),
word_lort3 %>%
inner_join(get_sentiments("afinn")) %>%
mutate(method = "afinn to lort3"))
afinn_word_counts_lort <- afinn_lort_all %>%
filter(word!= "merry") %>% #把merry(人名)去除
count(word, score, method, sort = TRUE) %>%
ungroup()
afinn_word_counts_lort %>%
mutate(cont = n*score) %>%
arrange(desc(abs(cont))) %>%
head(50) %>%
mutate(word = reorder(word,cont))%>%
ggplot(aes(word, n*score , fill = n*score>0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, scales = "free_y") +
labs(y = "Lord of rings",
x = NULL) +
coord_flip()
afinn_word_counts_lort
## # A tibble: 2,190 x 4
## word score method n
## <chr> <int> <chr> <int>
## 1 fire -2 afinn to lort1 152
## 2 cried -2 afinn to lort1 147
## 3 hope 2 afinn to lort1 143
## 4 grey -1 afinn to lort1 134
## 5 fear -2 afinn to lort1 126
## 6 hope 2 afinn to lort3 120
## 7 cried -2 afinn to lort3 119
## 8 grey -1 afinn to lort2 119
## 9 fear -2 afinn to lort3 116
## 10 dead -3 afinn to lort3 113
## # ... with 2,180 more rows
txt_wordcloud <- lort_alltext %>%
unnest_tokens(word, text) %>%
anti_join(add_stop)%>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
filter(word!= "merry") %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words =200)
txt_wordcloud
lort_parts_double <- lort_alltext %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)%>%
count(bigram, book, sort = TRUE)
lort_parts_double_sept <- lort_parts_double %>%
separate(bigram, c("word1", "word2"), sep = " ")
#否定詞庫
avoid <- c("not", "no", "never", "without")
#AFINN分析否定詞
afinn_not_words <- lort_parts_double_sept %>%
filter(word1 %in% avoid) %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word1, word2, score, sort = TRUE)
afinn_not_words
## # A tibble: 255 x 4
## word1 word2 score n
## <chr> <chr> <int> <int>
## 1 no doubt -1 6
## 2 no good 3 6
## 3 no no -1 6
## 4 not like 2 6
## 5 not love 3 6
## 6 not wish 1 6
## 7 no comfort 2 5
## 8 no escape -1 5
## 9 no evil -3 5
## 10 no hope 2 5
## # ... with 245 more rows
#情緒長條圖
ggplot_afinn_not_words <- afinn_not_words %>%
group_by(word1) %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
top_n(10)%>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score < 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~word1, ncol = 2, scales = "free_y") +
xlab("Words preceded by \"avoid\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
ggplot_afinn_not_words
可以看到大部分還是以負面辭彙為主,像是“no welcome”、“not love”等。而“hope”在“no hope”、“not hope”、“without hope”都有看到,更顯現魔戒全文的負面情緒。
# 人名搜尋 good
name_search_prot <- lort_alltext %>%
unnest_tokens(word, text) %>%
filter(word %in% data_names_protagonists) %>%
mutate(camp = "GOOD") %>%
count(word, camp, sort = TRUE) #計算各文字出現次數
name_search_prot
## # A tibble: 23 x 3
## word camp n
## <chr> <chr> <int>
## 1 frodo GOOD 1830
## 2 sam GOOD 1210
## 3 gandalf GOOD 1085
## 4 aragorn GOOD 683
## 5 pippin GOOD 632
## 6 merry GOOD 558
## 7 gimli GOOD 373
## 8 legolas GOOD 328
## 9 bilbo GOOD 273
## 10 boromir GOOD 262
## # ... with 13 more rows
# 人名搜尋 bad
name_search_anta <- lort_alltext %>%
unnest_tokens(word, text) %>%
filter(word %in% data_name_antagonists) %>%
mutate(camp = "BAD") %>%
count(word, camp, sort = TRUE) #計算各文字出現次數
name_search_anta
## # A tibble: 12 x 3
## word camp n
## <chr> <chr> <int>
## 1 gollum BAD 379
## 2 saruman BAD 257
## 3 sauron BAD 123
## 4 nazgyl BAD 50
## 5 shelob BAD 29
## 6 trolls BAD 21
## 7 balrog BAD 13
## 8 ringwraiths BAD 10
## 9 angmar BAD 5
## 10 goblins BAD 5
## 11 orcses BAD 2
## 12 morgoth BAD 1
#人名長條圖
ggplot_name_good <- bind_rows(name_search_prot, name_search_anta) %>%
mutate(word = reorder(word, n))%>%
filter(n > 100) %>%
ggplot(aes(x = word, y = n, color = camp)) +
geom_bar(stat = "identity") +
xlab("Name") +
ylab("Number")+
coord_flip()
ggplot_name_good
#特搜 aragorn、sauron"和"ring"
three_vip <- lort_alltext %>%
unnest_tokens(word, text) %>%
filter(word == c("aragorn","sauron", "ring")) %>%
count(word, chapter, sort = TRUE) #以章節區分, 並計算三名詞出現次數
ggplot_vip <- three_vip %>%
ggplot(aes(x = chapter, y = n, color = word)) +
geom_line(size=1) +
geom_vline(xintercept = as.numeric(c(23,44)), color="black", size= 0.5)+
xlab("Word") +
ylab("Number")
ggplot_vip