Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "stringr", "wordcloud2","gutenbergr", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales',"data.tree", "NLP", "igraph")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(readr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(gutenbergr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(data.tree)
scarlet <- gutenberg_download(244)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
scarlet$text[1:32] = ""
scarlet <- scarlet %>% filter(text!="")
scarlet <- scarlet %>%
mutate(book = "A Study in Scarlet",
part = cumsum(str_detect(.$text, regex("^PART (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\."))),
chapter = cumsum(str_detect(.$text, regex("^CHAPTER (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\..*"))),
linenumber = row_number())
hound <- gutenberg_download(2852) %>% filter(text!="")
hound <- hound %>%
mutate(book = "The Hound of the Baskervilles",
chapter = cumsum(str_detect(.$text, regex("^Chapter [0-9][0-9]?\\..*"))),
linenumber = row_number())
sign <- gutenberg_download(2097) %>% filter(text!="")
sign <- sign %>%
mutate(book = "The Sign of the Four",
chapter = cumsum(str_detect(.$text, regex("^Chapter (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"))),
linenumber = row_number())
valley <- gutenberg_download(3289) %>% filter(text!="")
valley <- valley %>%
mutate(book = "The Valley of Fear",
part = cumsum(str_detect(.$text, regex("^Part [0-9].*"))),
chapter = cumsum(str_detect(.$text, regex("^Chapter [0-9].*"))),
linenumber = row_number())
tokens_scar <- scarlet %>% unnest_tokens(word, text)
tokens_hound <- hound %>% unnest_tokens(word, text)
tokens_sign <- sign %>% unnest_tokens(word, text)
tokens_valley <- valley %>% unnest_tokens(word, text)
tokens_all <- bind_rows(tokens_scar, tokens_hound, tokens_sign, tokens_valley)
scar_bing <- tokens_scar %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
hound_bing <- tokens_hound %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
sign_bing <- tokens_sign %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
valley_bing <- tokens_valley %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
book_bing <- bind_rows(scar_bing, hound_bing, sign_bing, valley_bing) %>%
mutate(method = "Bing")
Bing_compare <- ggplot(book_bing, aes(index, sentiment, fill= book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x") +
labs(title = "Bing", x = NULL, y =NULL)
Bing_compare
此四篇皆偏向負向分數,尤以血字的研究與巴斯克維爾的獵犬十分明顯。
scar_tokens_count <- tokens_scar %>%
inner_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>%
count(book, word, sort = TRUE) %>%
head(., 10)
## Joining, by = "word"
hound_tokens_count <- tokens_hound %>%
inner_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>%
count(book, word, sort = TRUE) %>%
head(., 10)
## Joining, by = "word"
sign_tokens_count <- tokens_sign %>%
inner_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>%
count(book, word, sort = TRUE) %>%
head(., 10)
## Joining, by = "word"
valley_tokens_count <- tokens_valley %>%
inner_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>%
count(book, word, sort = TRUE) %>%
head(., 10)
## Joining, by = "word"
all_tokens_count <- bind_rows(scar_tokens_count, hound_tokens_count, sign_tokens_count, valley_tokens_count)
all_tokens_count %>%
ggplot(aes(reorder(word, n), n)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, scales = "free_y", ncol = 2) +
labs(x = "字詞", y = NULL) +
theme(text=element_text(size=12)) +
theme(text = element_text(family = "Heiti TC Light")) +
coord_flip()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
結果中可看出death(死), dead(死), murder(謀殺), fear(恐懼), dark(黑暗), danger(危險)等字出現多次,符合偵探小說的特性,因此導致通篇小說的分數偏向負向。
scar_afinn <- tokens_scar %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(score)) %>%
mutate(method = "AFINN", book = "A Study in Scarlet")
## Joining, by = "word"
hound_afinn <- tokens_hound %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(score)) %>%
mutate(method = "AFINN", book = "The Hound of the Baskervilles")
## Joining, by = "word"
sign_afinn <- tokens_sign %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(score)) %>%
mutate(method = "AFINN", book = "The Sign of the Four")
## Joining, by = "word"
valley_afinn <- tokens_valley %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(score)) %>%
mutate(method = "AFINN", book = "The Valley of Fear")
## Joining, by = "word"
book_afinn <- bind_rows(scar_afinn, hound_afinn, sign_afinn, valley_afinn)
ggplot(book_afinn, aes(index, sentiment, fill= book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x") +
labs(title = "AFINN", x = NULL, y = NULL)
結果血字的研究負向十分明顯,巴斯克維爾的獵犬、四個簽名、恐怖谷傾向負向。
scar_nrc <- tokens_scar %>%
inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
hound_nrc <- tokens_hound %>%
inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
sign_nrc <- tokens_sign %>%
inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
valley_nrc <- tokens_valley %>%
inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
book_nrc <- bind_rows(scar_nrc, hound_nrc, sign_nrc, valley_nrc) %>%
mutate(method = "NRC")
ggplot(book_nrc, aes(index, sentiment, fill= book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x") +
labs("NRC", x = NULL, y = NULL)
血字的研究、四個簽名、恐怖谷分數皆轉正,巴斯克維爾的獵犬末段負向明顯。
book_all <- bind_rows(book_bing, book_afinn, book_nrc)
book_all %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_grid(method~book, scales = "free_x") +
labs(x = NULL, y = NULL)
可明顯看出 NRC詞庫 的正向明顯較其他兩個詞庫高。
get_sentiments("afinn") %>%
summarise(n = sum(score))
get_sentiments("bing") %>%
count(sentiment)
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
tokens_scar %>%
inner_join(get_sentiments("bing")) %>%
count(book, word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
wordcounts_chapter <- tokens_all %>%
group_by(book, chapter) %>%
summarize(words = n())
tokens_all %>%
semi_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts_chapter, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
top_n(1) %>%
ungroup()
## Joining, by = "word"
## Selecting by ratio
wordcounts_all <- tokens_all %>%
group_by(book) %>%
summarize(words = n())
tokens_all %>%
semi_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>%
group_by(book) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts_all, by = c("book")) %>%
mutate(negative_ratio = negativewords/words) %>%
ungroup()
## Joining, by = "word"
tokens_all %>%
semi_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>%
group_by(book) %>%
summarize(positivewords = n()) %>%
left_join(wordcounts_all, by = c("book")) %>%
mutate(positive_ratio = positivewords/words) %>%
ungroup()
## Joining, by = "word"
Bing_compare
# book_all %>% filter(.$book == "A Study in Scarlet") %>%
# ggplot(aes(index, sentiment, fill= method)) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~method, ncol = 1, scales = "free") +
# labs(x = NULL, y = NULL) +
# ggtitle("A Study in Scarlet")
#
# book_all %>% filter(.$book == "The Hound of the Baskervilles") %>%
# ggplot(aes(index, sentiment, fill= method)) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~method, ncol = 1, scales = "free") +
# labs(x = NULL, y = NULL) +
# ggtitle("The Hound of the Baskervilles")
#
# book_all %>% filter(.$book == "The Sign of the Four") %>%
# ggplot(aes(index, sentiment, fill= method)) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~method, ncol = 1, scales = "free") +
# labs(x = NULL, y = NULL) +
# ggtitle("The Sign of the Four")
#
# book_all %>% filter(.$book == "The Valley of Fear") %>%
# ggplot(aes(index, sentiment, fill= method)) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~method, ncol = 1, scales = "free") +
# labs(x = NULL, y = NULL) +
# ggtitle("The Valley of Fear")
book_words <- tokens_all %>%
count(book, word, sort = TRUE)
total_words <- book_words %>%
group_by(book) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 574 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
freq_by_rank <- book_words %>%
group_by(book) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
+前半段rank曲線較為平滑,term frequency幾乎一樣
filter(rank < 500, rank > 10)
lm(log10(term frequency) ~ log10(rank), data = rank_subset)
term frequency) ~ log10(rank), data = rank_subset)```
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_abline(intercept = -0.66, slope = -1.11, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
book_words <- book_words %>%
bind_tf_idf(word, book, n)
book_words
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
ngram_scar <- scarlet %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngram_hound <- hound %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngram_sign <- sign %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngram_valley <- valley %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngram_all <- bind_rows(ngram_scar, ngram_hound, ngram_sign, ngram_valley)
ngram_all %>%
count(bigram, sort = TRUE)
library(tidyr)
bigrams_separated <- ngram_all %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts<-bigram_counts[-1,] #刪除第一列是na
bigram_counts
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigrams_united
trigram_scar <- scarlet %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
trigram_hound <- hound %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
trigram_sign <- sign %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
trigram_valley <- valley %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
trigram_all <- bind_rows(trigram_scar, trigram_hound, trigram_sign, trigram_valley)
trigram<-bind_rows(trigram_scar, trigram_hound, trigram_sign, trigram_valley)%>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
trigram<-trigram[-1,]
trigram
bigrams_filtered %>%
filter(word1 == "death") %>%
count(book, word2, sort = TRUE)
bigram_tf_idf <- bigrams_united %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(book) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(bigram, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
AFINN <- get_sentiments("afinn")
AFINN
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE)
not_words
library(ggplot2)
not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, score, sort = TRUE)
negated_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(10) %>%
mutate(word2 = reorder(word2, contribution)) %>%
group_by(word1) %>%
ungroup() %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~word1, ncol = 2, scales = "free") +
coord_flip()
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# original counts
bigram_counts
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
filter(n > 5) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH e56e303 DN-- 146 91 --
## + attr: name (v/c), n (e/n)
## + edges from e56e303 (vertex names):
## [1] sir ->henry sherlock ->holmes
## [3] dr ->mortimer sir ->charles
## [5] dr ->watson jefferson ->hope
## [7] miss ->morstan baker ->street
## [9] baskerville->hall john ->ferrier
## [11] sir ->charles's white ->mason
## [13] wedding ->ring henry ->baskerville
## [15] manor ->house athelney ->jones
## + ... omitted several edges
library(ggraph)
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()