packages = c("dplyr", "tidytext", "gutenbergr", "ggplot2", "tidyr", "igraph", "ggraph")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(gutenbergr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(ggplot2)
library(tidyr)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
scarlet <- gutenberg_download(244) %>%
filter(text!="") %>%
distinct(gutenberg_id, text) %>%
mutate(book = "A Study in Scarlet", linenumber = row_number())
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
hound <- gutenberg_download(2852) %>%
filter(text!="") %>%
distinct(gutenberg_id, text) %>%
mutate(book = "The Hound of the Baskervilles", linenumber = row_number())
sign <- gutenberg_download(2097) %>%
filter(text!="") %>%
distinct(gutenberg_id, text) %>%
mutate(book = "The Sign of the Four", linenumber = row_number())
valley <- gutenberg_download(3289) %>%
filter(text!="") %>%
distinct(gutenberg_id, text) %>%
mutate(book = "The Valley of Fear", linenumber = row_number())
tokens_scar <- scarlet %>% unnest_tokens(word, text)
tokens_hound <- hound %>% unnest_tokens(word, text)
tokens_sign <- sign %>% unnest_tokens(word, text)
tokens_valley <- valley %>% unnest_tokens(word, text)
tokens_all <- bind_rows(tokens_scar, tokens_hound, tokens_sign, tokens_valley)
book_words <- tokens_all %>%
count(book, word, sort = TRUE)
total_words <- book_words %>%
group_by(book) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
## # A tibble: 22,631 x 4
## book word n total
## <chr> <chr> <int> <int>
## 1 The Hound of the Baskervilles the 3331 59481
## 2 The Valley of Fear the 3266 57955
## 3 A Study in Scarlet the 2567 44202
## 4 The Sign of the Four the 2341 43535
## 5 The Hound of the Baskervilles and 1628 59481
## 6 The Hound of the Baskervilles of 1594 59481
## 7 The Hound of the Baskervilles i 1468 59481
## 8 The Valley of Fear and 1444 57955
## 9 The Valley of Fear of 1441 57955
## 10 The Hound of the Baskervilles to 1408 59481
## # … with 22,621 more rows
四篇都是“the”出現最多次,另外可能是因為巴斯克維爾的獵犬字數最多,導致他其他字“and”, “of”, “i”, “to”也出現很多次。
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE, bins = 30) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## Warning: Removed 573 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
由這張圖可以看出,血字研究、巴斯克維爾的獵犬、恐怖谷這三本,用字的情況較不集中,出現頻率最低與次低的差距甚大。
freq_by_rank <- book_words %>%
group_by(book) %>%
mutate(rank = row_number(), `term frequency` = n/total)
freq_by_rank %>% arrange(rank)
## # A tibble: 22,631 x 6
## # Groups: book [4]
## book word n total rank `term frequency`
## <chr> <chr> <int> <int> <int> <dbl>
## 1 The Hound of the Baskervilles the 3331 59481 1 0.0560
## 2 The Valley of Fear the 3266 57955 1 0.0564
## 3 A Study in Scarlet the 2567 44202 1 0.0581
## 4 The Sign of the Four the 2341 43535 1 0.0538
## 5 The Hound of the Baskervilles and 1628 59481 2 0.0274
## 6 The Valley of Fear and 1444 57955 2 0.0249
## 7 A Study in Scarlet and 1369 44202 2 0.0310
## 8 The Sign of the Four i 1216 43535 2 0.0279
## 9 The Hound of the Baskervilles of 1594 59481 3 0.0268
## 10 The Valley of Fear of 1441 57955 3 0.0249
## # … with 22,621 more rows
四篇統計的結果可看出,每篇前幾名的用字不外乎“the”, “and”, “of”, “i”, “a”, “to”等,其中“the”的出現頻率皆超過5%。
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
可看出每篇用字的頻率都有一致的走向。
rank_subset <- freq_by_rank %>%
filter(rank < 500,
rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.6597 -1.1090
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_abline(intercept = -0.6597, slope = -1.109, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
book_words <- book_words %>%
bind_tf_idf(word, book, n)
book_words
## # A tibble: 22,631 x 7
## book word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 The Hound of the Baskervilles the 3331 59481 0.0560 0 0
## 2 The Valley of Fear the 3266 57955 0.0564 0 0
## 3 A Study in Scarlet the 2567 44202 0.0581 0 0
## 4 The Sign of the Four the 2341 43535 0.0538 0 0
## 5 The Hound of the Baskervilles and 1628 59481 0.0274 0 0
## 6 The Hound of the Baskervilles of 1594 59481 0.0268 0 0
## 7 The Hound of the Baskervilles i 1468 59481 0.0247 0 0
## 8 The Valley of Fear and 1444 57955 0.0249 0 0
## 9 The Valley of Fear of 1441 57955 0.0249 0 0
## 10 The Hound of the Baskervilles to 1408 59481 0.0237 0 0
## # … with 22,621 more rows
與剛剛計算的結果相同。
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 22,631 x 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 The Hound of the Baskervilles moor 163 0.00274 1.39 0.00380
## 2 The Hound of the Baskervilles henry 146 0.00245 1.39 0.00340
## 3 The Hound of the Baskervilles baskerville 111 0.00187 1.39 0.00259
## 4 The Valley of Fear douglas 105 0.00181 1.39 0.00251
## 5 The Valley of Fear mcmurdo 193 0.00333 0.693 0.00231
## 6 The Valley of Fear mcginty 90 0.00155 1.39 0.00215
## 7 The Sign of the Four sholto 66 0.00152 1.39 0.00210
## 8 The Hound of the Baskervilles mortimer 87 0.00146 1.39 0.00203
## 9 The Hound of the Baskervilles stapleton 85 0.00143 1.39 0.00198
## 10 The Sign of the Four morstan 60 0.00138 1.39 0.00191
## # … with 22,621 more rows
可看到出現了許多人名,包括“henry”, “baskerville”, “douglas”, “mcmurdo”, “mcginty”…。
這裡我們還可以看到,“mcmurdo”的 idf 與其他人不同,因此可以知道他在不只一本書中出現,除了恐怖谷之外,還出現在四個簽名之中。
book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
基本上每本前幾名都是人名,除了巴斯克維爾的獵犬中第一名:moor(沼澤),因為其他本小說沒有用到這個字,而自己使用率很高,導致tf-idf分數十分高。
holmes <- bind_rows(scarlet, hound, sign, valley)
holmes_bigrams <- holmes %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
select(-gutenberg_id) %>%
filter(!is.na(bigram))
holmes_bigrams
## # A tibble: 187,275 x 3
## book linenumber bigram
## <chr> <int> <chr>
## 1 A Study in Scarlet 1 a study
## 2 A Study in Scarlet 1 study in
## 3 A Study in Scarlet 1 in scarlet
## 4 A Study in Scarlet 2 by a
## 5 A Study in Scarlet 2 a conan
## 6 A Study in Scarlet 2 conan doyle
## 7 A Study in Scarlet 4 original transcriber's
## 8 A Study in Scarlet 4 transcriber's note
## 9 A Study in Scarlet 4 note this
## 10 A Study in Scarlet 4 this etext
## # … with 187,265 more rows
第三行內容為“[1]”,斷詞後結果為NA,在這裡我們選擇將NA過濾掉,因此最終結果並們有第三行。
holmes_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 82,408 x 2
## bigram n
## <chr> <int>
## 1 of the 1364
## 2 in the 874
## 3 it was 537
## 4 to the 500
## 5 it is 439
## 6 at the 405
## 7 i have 395
## 8 and the 369
## 9 that i 350
## 10 upon the 342
## # … with 82,398 more rows
結果中多是與停用字有關,導致很難分析。
bigrams_separated <- holmes_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 11,501 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 sir henry 129
## 2 sherlock holmes 116
## 3 dr mortimer 69
## 4 sir charles 64
## 5 dr watson 43
## 6 jefferson hope 33
## 7 miss morstan 31
## 8 baker street 28
## 9 baskerville hall 27
## 10 john ferrier 23
## # … with 11,491 more rows
可看到很多稱謂或地名。
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigrams_united
## # A tibble: 13,945 x 3
## book linenumber bigram
## <chr> <int> <chr>
## 1 A Study in Scarlet 2 conan doyle
## 2 A Study in Scarlet 4 original transcriber's
## 3 A Study in Scarlet 4 transcriber's note
## 4 A Study in Scarlet 4 prepared directly
## 5 A Study in Scarlet 5 1887 edition
## 6 A Study in Scarlet 6 including typographical
## 7 A Study in Scarlet 8 text include
## 8 A Study in Scarlet 8 include adding
## 9 A Study in Scarlet 8 underscore character
## 10 A Study in Scarlet 9 square braces
## # … with 13,935 more rows
holmes %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
filter(!is.na(trigram)) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
## # A tibble: 2,804 x 4
## word1 word2 word3 n
## <chr> <chr> <chr> <int>
## 1 sir henry baskerville 17
## 2 sir charles baskerville 11
## 3 salt lake city 9
## 4 sir charles's death 8
## 5 halliday's private hotel 5
## 6 detective police force 4
## 7 dr james mortimer 4
## 8 lodge 341 vermissa 4
## 9 3 lauriston gardens 3
## 10 dr mortimer looked 3
## # … with 2,794 more rows
可看出也是有很多人名。
bigrams_filtered %>%
filter(word2 == "street") %>%
count(book, word1, sort = TRUE)
## # A tibble: 25 x 3
## book word1 n
## <chr> <chr> <int>
## 1 The Hound of the Baskervilles baker 11
## 2 The Sign of the Four baker 10
## 3 The Hound of the Baskervilles regent 8
## 4 A Study in Scarlet baker 6
## 5 The Hound of the Baskervilles oxford 3
## 6 A Study in Scarlet duncan 2
## 7 The Valley of Fear sheridan 2
## 8 A Study in Scarlet busy 1
## 9 A Study in Scarlet henrietta 1
## 10 A Study in Scarlet ragged 1
## # … with 15 more rows
福爾摩斯與華生的住所:貝克街,出現很多次。
bigram_tf_idf <- bigrams_united %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
## # A tibble: 11,922 x 6
## book bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 The Hound of the Baskervill… sir henry 129 0.0317 1.39 0.0439
## 2 The Hound of the Baskervill… dr mortimer 69 0.0169 1.39 0.0235
## 3 The Sign of the Four miss morstan 31 0.0101 1.39 0.0140
## 4 The Hound of the Baskervill… sir charles 63 0.0155 0.693 0.0107
## 5 A Study in Scarlet john ferrier 23 0.00735 1.39 0.0102
## 6 The Hound of the Baskervill… baskerville ha… 27 0.00663 1.39 0.00919
## 7 The Valley of Fear white mason 23 0.00627 1.39 0.00869
## 8 The Sign of the Four athelney jones 18 0.00585 1.39 0.00811
## 9 The Sign of the Four thaddeus sholto 18 0.00585 1.39 0.00811
## 10 The Hound of the Baskervill… sir charles's 23 0.00565 1.39 0.00783
## # … with 11,912 more rows
bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(book) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(bigram, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 368 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not a 58
## 2 not to 41
## 3 not be 40
## 4 not have 37
## 5 not know 29
## 6 not the 29
## 7 not been 25
## 8 not for 19
## 9 not so 17
## 10 not only 14
## # … with 358 more rows
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(get_sentiments("afinn") , by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE)
not_words
## # A tibble: 70 x 3
## word2 score n
## <chr> <int> <int>
## 1 help 2 9
## 2 wish 1 4
## 3 fear -2 3
## 4 leave -1 3
## 5 trust 1 3
## 6 ashamed -2 2
## 7 doubt -1 2
## 8 easy 1 2
## 9 escape -1 2
## 10 hurt -2 2
## # … with 60 more rows
not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word1, word2, score, sort = TRUE)
negated_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
group_by(word1) %>%
top_n(10) %>%
ungroup() %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
facet_wrap(~word1, ncol = 2, scales = "free") +
coord_flip()
## Selecting by contribution
透過以上結果可看出接在“not”和“no”後面多為正面字,“never”後多接否定字,但我們無法看出否定字對我們情緒分析的影響,因此我們將上次做的情緒分析,與加上“not”等否定字調整情緒分數的結果做比較。
all_afinn <- tokens_all %>%
inner_join(get_sentiments("afinn")) %>%
group_by(book, index = linenumber %/% 80) %>%
summarise(sentiment = sum(score)) %>%
mutate(status = "Original")
## Joining, by = "word"
not_words_afinn <- bigrams_separated %>%
filter(word1 == "not" | word1 == "no" | word1 == "never" | word1 == "without") %>%
inner_join(get_sentiments("afinn") , by = c(word2 = "word")) %>%
group_by(book, index = linenumber %/% 80) %>%
summarise(sentiment = (sum(score) * -2)) %>%
mutate(status = "Adjusted")
adjust_afinn <- bind_rows(all_afinn, not_words_afinn) %>%
group_by(book, index) %>%
summarise(sentiment = sum(sentiment)) %>%
mutate(status = "Adjustment")
ggplot(all_afinn, aes(index, sentiment, fill= book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x") +
labs(title = "AFINN", x = NULL, y = NULL)
ggplot(adjust_afinn, aes(index, sentiment, fill= book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x") +
labs(title = "AFINN(Adjusted)", x = NULL, y = NULL)
ggplot(not_words_afinn, aes(index, sentiment, fill= book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x") +
labs(title = "Adjustment", x = NULL, y = NULL)
compare_afinn <- bind_rows(all_afinn, adjust_afinn, not_words_afinn)
ggplot(compare_afinn, aes(index, sentiment, fill= book)) +
geom_col(show.legend = FALSE) +
facet_grid(status~book, scales = "free_x") +
labs(title = "Compare", x = NULL, y = NULL)
bigram_graph <- bigram_counts %>%
filter(n > 10) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH f5182d0 DN-- 63 39 --
## + attr: name (v/c), n (e/n)
## + edges from f5182d0 (vertex names):
## [1] sir ->henry sherlock ->holmes
## [3] dr ->mortimer sir ->charles
## [5] dr ->watson jefferson ->hope
## [7] miss ->morstan baker ->street
## [9] baskerville->hall john ->ferrier
## [11] sir ->charles's white ->mason
## [13] wedding ->ring henry ->baskerville
## [15] manor ->house athelney ->jones
## + ... omitted several edges
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
labs(x = NULL, y = NULL)
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.1, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.05, 'inches')) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()