Load the required packages
library(gutenbergr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
Download the texts
# The Happy Prince (lines 53-352 of the Gutenberg text)
book_prince <- gutenberg_download(30120) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
prince <- book_prince[53:352,]
# Download Cinderella
cinderalla <- gutenberg_download(10830) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)
Reprocess the text so that the tokenizer later keeps "happy prince" as a single token
text <- paste0(prince$text, collapse = ' ') # join all lines into one string
# gsub replaces every regex match with the single token "HappyPrince"
text <- gsub('(H|h)appy (P|p)rince','HappyPrince',text)
df <- tibble(text = text) # store as a tibble
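An equivalent sketch using stringr (already loaded above): regex() with ignore_case = TRUE covers the capitalization variants in one pattern. This is an alternative, not what the analysis below uses.
# same merge via stringr; ignore_case = TRUE also matches e.g. "HAPPY Prince"
text_alt <- paste(prince$text, collapse = " ")
text_alt <- str_replace_all(text_alt,
                            regex("happy prince", ignore_case = TRUE),
                            "HappyPrince")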
# Combine the two books and find total word counts and per-word frequencies for HappyPrince & Cinderalla
Cinderalla_words <- cinderalla %>%
unnest_tokens(word, text) %>%
mutate(book='Cinderalla')
Prince_words <- df %>%
unnest_tokens(word, text) %>%
mutate(book='Prince')
# the book labels never match across the two frames, so full_join() simply stacks the rows
book_words <- full_join(Cinderalla_words, Prince_words) %>%
  count(book, word, sort = TRUE)
## Joining, by = c("word", "book")
total_words <-book_words %>%
group_by(book) %>%
summarize(total=sum(n))
book_words <- left_join(book_words,total_words)
## Joining, by = "book"
book_words
## # A tibble: 2,031 x 4
## book word n total
## <chr> <chr> <int> <int>
## 1 Prince the 299 3475
## 2 Cinderalla the 282 4149
## 3 Cinderalla and 169 4149
## 4 Cinderalla to 154 4149
## 5 Prince and 139 3475
## 6 Cinderalla her 134 4149
## 7 Prince he 110 3475
## 8 Cinderalla of 93 4149
## 9 Prince of 87 3475
## 10 Cinderalla she 77 4149
## # ... with 2,021 more rows
# Plot the term-frequency (n/total) distribution
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend =FALSE) +
xlim(NA, 0.01) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 24 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Result: both books show a long-tailed distribution.
# Compute rank and term frequency
freq_by_rank <- book_words %>%
group_by(book) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank
## # A tibble: 2,031 x 6
## # Groups: book [2]
## book word n total rank `term frequency`
## <chr> <chr> <int> <int> <int> <dbl>
## 1 Prince the 299 3475 1 0.0860
## 2 Cinderalla the 282 4149 1 0.0680
## 3 Cinderalla and 169 4149 2 0.0407
## 4 Cinderalla to 154 4149 3 0.0371
## 5 Prince and 139 3475 2 0.04
## 6 Cinderalla her 134 4149 4 0.0323
## 7 Prince he 110 3475 3 0.0317
## 8 Cinderalla of 93 4149 5 0.0224
## 9 Prince of 87 3475 4 0.0250
## 10 Cinderalla she 77 4149 6 0.0186
## # ... with 2,021 more rows
# Plot term frequency against rank on log-log axes to check whether they are inversely related
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.5, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
# The tail of the rank range is less regular, so fit a reference line on the most linear middle section (ranks 10-100)
rank_subset <- freq_by_rank %>%
filter(rank < 100,
rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.9988 -0.8893
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_abline(intercept = -0.9988, slope = -0.8893, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
scale_x_log10() +
scale_y_log10()
Result: the data roughly follow Zipf's law (the fitted slope of -0.89 is close to the classic -1).
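As a quick sanity check (a sketch, not part of the original analysis), the same regression can be fit per book to compare the fitted exponents:
# fit the log-log regression separately for each book over ranks 10-100
rank_subset %>%
  split(.$book) %>%
  lapply(function(d) coef(lm(log10(`term frequency`) ~ log10(rank), data = d)))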
# Compute TF-IDF across the two books
book_words <- book_words %>%
bind_tf_idf(word, book, n)
book_words
## # A tibble: 2,031 x 7
## book word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Prince the 299 3475 0.0860 0 0
## 2 Cinderalla the 282 4149 0.0680 0 0
## 3 Cinderalla and 169 4149 0.0407 0 0
## 4 Cinderalla to 154 4149 0.0371 0 0
## 5 Prince and 139 3475 0.04 0 0
## 6 Cinderalla her 134 4149 0.0323 0 0
## 7 Prince he 110 3475 0.0317 0 0
## 8 Cinderalla of 93 4149 0.0224 0 0
## 9 Prince of 87 3475 0.0250 0 0
## 10 Cinderalla she 77 4149 0.0186 0 0
## # ... with 2,021 more rows
# Sort by TF-IDF to surface the distinctive words
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 2,031 x 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Cinderalla cinderella 61 0.0147 0.693 0.0102
## 2 Prince swallow 45 0.0129 0.693 0.00898
## 3 Prince happyprince 17 0.00489 0.693 0.00339
## 4 Cinderalla sisters 18 0.00434 0.693 0.00301
## 5 Prince statue 12 0.00345 0.693 0.00239
## 6 Cinderalla godmother 12 0.00289 0.693 0.00200
## 7 Prince egypt 10 0.00288 0.693 0.00199
## 8 Cinderalla slipper 11 0.00265 0.693 0.00184
## 9 Prince city 9 0.00259 0.693 0.00180
## 10 Cinderalla clothes 10 0.00241 0.693 0.00167
## # ... with 2,021 more rows
# Plot each book's highest TF-IDF words
book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
Result: each story's main characters rank as the most distinctive terms.
Results after tokenizing and removing stopwords
# The Happy Prince: one word per row, stopwords removed
tidy_prince <- df %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
mutate(linenumber = row_number()) %>%
mutate(book="happy prince")
## Joining, by = "word"
# Cinderella: one word per row, stopwords removed
tidy_cinderalla <- cinderalla %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
mutate(linenumber = row_number()) %>%
mutate(book="cinderalla")
## Joining, by = "word"
# Compare with last week's results
# Count and rank each word's occurrences in the two books
bind_rows(tidy_prince, tidy_cinderalla[,2:4]) %>% # drop gutenberg_id so the columns match tidy_prince
group_by(word,book) %>%
summarise(sum = n()) %>%
arrange(desc(sum)) %>%
ungroup() %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, sum, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "count") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
## Selecting by sum
The main characters again stand out, but a high raw frequency does not by itself imply that a word is important.
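A quick way to see this (a sketch, using the raw book_words counts rather than the stopword-removed ones): compare each book's top-15 words by raw count against its top-15 by tf-idf.
# words that rank high by raw frequency vs. by tf-idf
top_by_count <- book_words %>% group_by(book) %>% top_n(15, n)
top_by_tfidf <- book_words %>% group_by(book) %>% top_n(15, tf_idf)
intersect(top_by_count$word, top_by_tfidf$word) # likely few or no shared words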
# Tokenize each book into bigrams, then merge the two results
C_bigrams <- cinderalla %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  mutate(book = 'cinderella')
P_bigrams <- df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  mutate(book = 'happy prince')
# the book labels never match, so full_join() simply stacks the two frames
tale_bigrams <- full_join(C_bigrams, P_bigrams)
## Joining, by = c("bigram", "book")
tale_bigrams
## # A tibble: 7,622 x 3
## gutenberg_id bigram book
## <int> <chr> <chr>
## 1 10830 illustration hewet's cinderella
## 2 10830 hewet's household cinderella
## 3 10830 household stories cinderella
## 4 10830 stories for cinderella
## 5 10830 for little cinderella
## 6 10830 little folks cinderella
## 7 10830 folks illustrated cinderella
## 8 10830 illustrated w cinderella
## 9 10830 w h cinderella
## 10 10830 h thwaite cinderella
## # ... with 7,612 more rows
# Count bigram occurrences to find the most frequent ones
tale_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 5,565 x 2
## bigram n
## <chr> <int>
## 1 of the 63
## 2 to the 61
## 3 in the 41
## 4 said the 34
## 5 and the 23
## 6 the prince 21
## 7 the swallow 21
## 8 at the 20
## 9 the happyprince 17
## 10 i am 16
## # ... with 5,555 more rows
Result: without removing stopwords, many meaningless pairs dominate the counts.
Clean out bigrams containing stopwords
# Split each bigram into its two words, then drop any bigram in which either word is a stopword
bigrams_separated <- tale_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# Count the bigrams after stopword removal
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 598 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 king's son 9
## 2 town councillors 5
## 3 fine gold 4
## 4 glass slipper 4
## 5 illustration cinderella 4
## 6 swallow swallow 4
## 7 cinder wench 3
## 8 court ball 3
## 9 dear prince 3
## 10 glass slippers 3
## # ... with 588 more rows
All stopwords have now been removed.
# Re-unite the filtered words back into bigrams
bigrams_united <- bigrams_filtered[,-c(1)] %>% # drop the gutenberg_id column
unite(bigram, word1, word2, sep = " ")
bigrams_united
## # A tibble: 663 x 2
## bigram book
## <chr> <chr>
## 1 illustration hewet's cinderella
## 2 hewet's household cinderella
## 3 household stories cinderella
## 4 folks illustrated cinderella
## 5 thwaite engraved cinderella
## 6 artists vol cinderella
## 7 cinderella 1855 cinderella
## 8 1855 illustration cinderella
## 9 illustration frontispiece cinderella
## 10 frontispiece cinderella cinderella
## # ... with 653 more rows
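The top rows here are Project Gutenberg front matter (illustration and engraver credits), not story text. A small custom filter, sketched below with my guess at the boilerplate words, screens them out before counting; adjust the list after inspecting the raw text.
# hypothetical boilerplate list; extend as needed
boilerplate_words <- c("illustration", "hewet's", "thwaite", "engraved", "frontispiece")
bigrams_filtered %>%
  filter(!word1 %in% boilerplate_words,
         !word2 %in% boilerplate_words) %>%
  count(word1, word2, sort = TRUE)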
The same procedure applied to trigrams
# Tokenize into three-word units and drop trigrams in which any word is a stopword
bind_rows(cinderalla, df) %>% # combine the raw text of both books
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
## # A tibble: 37 x 4
## word1 word2 word3 n
## <chr> <chr> <chr> <int>
## 1 beautiful sapphire lying 2
## 2 boys wore scarlet 2
## 3 bright scarlet cloaks 2
## 4 broken lead heart 2
## 5 bye dear prince 2
## 6 carry amber beads 2
## 7 cataract swallow swallow 2
## 8 catch gold fish 2
## 9 clean white pinafores 2
## 10 coarse red hands 2
## # ... with 27 more rows
Recap from last week: both books use "poor" and "beautiful" frequently, but they modify different things and carry different meanings.
# What does each book describe as "beautiful"?
bigrams_filtered %>%
filter(word1 == "beautiful") %>%
count(book,word2, sort = TRUE) %>%
arrange(book)
## # A tibble: 11 x 3
## book word2 n
## <chr> <chr> <int>
## 1 cinderella pair 2
## 2 cinderella allowed 1
## 3 cinderella creature 1
## 4 cinderella lady 1
## 5 cinderella princess 1
## 6 cinderella stranger 1
## 7 happy prince houses 2
## 8 happy prince girl 1
## 9 happy prince jewels 1
## 10 happy prince reed 1
## 11 happy prince sapphire 1
In Cinderella, "beautiful" almost always describes people, while in The Happy Prince it mostly describes objects.
# What does each book describe as "poor"?
bigrams_filtered %>%
filter(word1 == "poor") %>%
count(book,word2, sort = TRUE) %>%
arrange(book)
## # A tibble: 5 x 3
## book word2 n
## <chr> <chr> <int>
## 1 cinderella girl 2
## 2 cinderella cinderella 1
## 3 cinderella lady 1
## 4 happy prince house 2
## 5 happy prince prince 1
In both books, "poor" is almost always used to describe people.
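The two checks above differ only in the modifier word; a small helper (hypothetical, not part of the original analysis) makes the same inspection repeatable for any word:
# count what each book pairs with a given modifier word
count_modified <- function(bigrams, modifier) {
  bigrams %>%
    filter(word1 == modifier) %>%
    count(book, word2, sort = TRUE) %>%
    arrange(book)
}
count_modified(bigrams_filtered, "little") # e.g. another frequent adjective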
# TF-IDF on each book's bigrams (stored as a ggplot object and drawn below)
bigram_tf_idf <- bigrams_united %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(book) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(bigram, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
bigram_tf_idf
# Look at every word that follows a negation word
negation_words <- c("not", "no", "never", "without")
n_bigrams <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
count(word1, word2, sort = TRUE)
n_bigrams
## # A tibble: 57 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 no longer 3
## 2 not only 3
## 3 never seen 2
## 4 no better 2
## 5 no one 2
## 6 no sooner 2
## 7 not allowed 2
## 8 not bring 2
## 9 not know 2
## 10 not to 2
## # ... with 47 more rows
# Keep only the pairs where a negation word precedes a sentiment word
# using the "bing" lexicon
BING <- get_sentiments("bing")
n_s_bigrams <- n_bigrams %>%
  inner_join(BING, by = c(word2 = "word")) %>%
  count(word1, word2, sort = TRUE) # re-counting an already-counted frame yields the nn column below
n_s_bigrams
## # A tibble: 9 x 3
## word1 word2 nn
## <chr> <chr> <int>
## 1 no better 1
## 2 no mystery 1
## 3 no uneasiness 1
## 4 not approve 1
## 5 not fail 1
## 6 not lie 1
## 7 not lose 1
## 8 not solid 1
## 9 not sorry 1
The list shrinks from 57 pairs to 9.
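To gauge how much these negated words distort the sentiment totals, a common follow-up (a sketch; it assumes the AFINN lexicon is available via get_sentiments("afinn"), which may prompt a download, and that its numeric column is named value, as in current tidytext; older versions used score) weights each negated word by its score times its count:
AFINN <- get_sentiments("afinn")
negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE) %>%
  mutate(contribution = n * value) # how far each pair pushes the total the wrong way
negated_words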
# If a sentiment word is preceded by a negation word, flip its sentiment
# first find all bigrams whose word2 is a sentiment word
sentiment_bigrams <- bigrams_separated %>%
inner_join(BING, by = c(word2 = "word")) %>%
count(word1, word2, sort = TRUE)
# pull out the positive half of the bing lexicon
total_positve_sentiment <- BING %>% filter(sentiment == "positive")
# Tag the sentiment numerically: positive = 1, negative = -1
sentiment_bigrams <- sentiment_bigrams %>%
#inner_join(total_positve_sentiment, by = c(word2 = "word"))
mutate(sentiment=ifelse(word2 %in% total_positve_sentiment$word,1,-1))
# Flip the sentiment of bigrams whose first word is a negation word
inverse_bigrams <- sentiment_bigrams %>%
mutate(sentiment=ifelse(word1 %in% negation_words, -1*sentiment, sentiment)) %>%
mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))
inverse_bigrams
## # A tibble: 464 x 5
## word1 word2 n sentiment sentiment_tag
## <chr> <chr> <int> <dbl> <chr>
## 1 the poor 9 -1 negative
## 2 the great 8 1 positive
## 3 the beautiful 6 1 positive
## 4 a good 5 1 positive
## 5 a great 5 1 positive
## 6 fine gold 4 1 positive
## 7 a beautiful 3 1 positive
## 8 a lovely 3 1 positive
## 9 i love 3 1 positive
## 10 so beautiful 3 1 positive
## # ... with 454 more rows
Compare the sentiment results before and after the flip
sentiment_bigrams <- sentiment_bigrams %>%
mutate(tag = "origin")
inverse_bigrams <- inverse_bigrams %>%
mutate(tag = "inverse")
compare_inverse <- bind_rows(sentiment_bigrams, inverse_bigrams) %>%
  group_by(tag, sentiment) %>%
  summarise(sum(sentiment)) # each distinct bigram counts once here; occurrences (n) are ignored
compare_inverse
## # A tibble: 4 x 3
## # Groups: tag [?]
## tag sentiment `sum(sentiment)`
## <chr> <dbl> <dbl>
## 1 inverse -1 -174
## 2 inverse 1 290
## 3 origin -1 -177
## 4 origin 1 287
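Since sum(sentiment) above counts each distinct bigram once and ignores how often it occurs, a variant (a sketch) that weights by the occurrence column n may give a fairer comparison:
# weight each bigram's sentiment by its occurrence count n
bind_rows(sentiment_bigrams, inverse_bigrams) %>%
  group_by(tag) %>%
  summarise(net_sentiment = sum(sentiment * n))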
Word associations across both books
# keep only bigrams that occur more than once
bigram_graph <- bigram_counts %>%
  filter(n > 1) %>%
  graph_from_data_frame()
set.seed(2019)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
set.seed(2019)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
Build each book's bigrams separately
# The Happy Prince: bigrams with stopwords removed
P_bigrams <- df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
P_bigrams_separated <- P_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
P_bigrams_filtered <- P_bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# Count the bigrams after stopword removal
P_bigram_counts <- P_bigrams_filtered %>%
count(word1, word2, sort = TRUE)
# Cinderella: bigrams with stopwords removed
C_bigrams <- cinderalla %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
C_bigrams_separated <- C_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
C_bigrams_filtered <- C_bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# Count the bigrams after stopword removal
C_bigram_counts <- C_bigrams_filtered %>%
count(word1, word2, sort = TRUE)
Word-association graph for The Happy Prince
# keep only bigrams that occur more than once
P_bigram_graph <- P_bigram_counts %>%
filter(n > 1) %>%
graph_from_data_frame()
set.seed(2019)
ggraph(P_bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
Word-association graph for Cinderella
# keep only bigrams that occur more than once
C_bigram_graph <- C_bigram_counts %>%
filter(n > 1) %>%
graph_from_data_frame()
set.seed(2019)
ggraph(C_bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
Summary: The Happy Prince shows weak word-to-word associations overall and, unlike Cinderella, has no protagonist at the core of its graph; this may be because it is a short story, so it cannot develop the clearer structure that a longer story can.