格林童話文字分析
讀取資料
pacman::p_load(tidytext,readr, tidyverse, data.table, tibble, stringr, textdata,
tidyr, reshape2, plotly, topicmodels, dplyr, text2vec, udpipe,ggplot2,wordcloud,wordcloud2,gutenbergr,igraph,ggraph,xml2,httr,jsonlite,magrittr,data.tree,webshot,htmlwidgets,imager)##
## The downloaded binary packages are in
## /var/folders/dt/brfqfqpx2nx6x96txdg151hh0000gn/T//RtmpbBqtYv/downloaded_packages
查看斷詞結果
text_tidy = tibble(text) %>%
unnest_tokens(word, text) %>%
mutate(linenumber = row_number(),story = cumsum(str_detect(word, regex("^story([1-9]|[1-6][0-9])$", ignore_case = T))))
# 標註集數
data(stop_words)
text_tidy = text_tidy %>%
anti_join(stop_words)## Joining, by = "word"
統計字頻
text_tidy %>%
count(word, sort = T) %>%
filter(n > 100) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col(fill = "#00bfc4") +
coord_flip()總體文字雲
情緒分析
NRC字典
準備資料
nrc <- get_sentiments("nrc")
grimmlist <- text_tidy %>%
filter(nchar(.$word)>1) %>%
mutate(rown = row_number())
grimmnrc<-grimmlist %>%
inner_join(nrc, by = "word")
tt_neg<-filter(grimmnrc,sentiment == 'negative'| sentiment== 'anger'| sentiment== 'disgust'| sentiment == 'fear'| sentiment == 'sadness')
tt_neg_dis<-distinct(tt_neg,word,rown,story)
neg_re<-tt_neg_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
neg_re %>%
top_n(10,wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label=n))+
theme(text=element_text(size=14))+
coord_flip()NRC最常出現的正面情緒字
tt_pos<-filter(grimmnrc,sentiment == 'anticipation'| sentiment== 'joy'| sentiment== 'surprise'| sentiment == 'trust'| sentiment == 'positive')
tt_pos_dis<-distinct(tt_pos,word,rown,story)
pos_re<-tt_pos_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
pos_re %>%
top_n(10,wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label=n))+
theme(text=element_text(size=14))+
coord_flip()Afinn字典
準備資料
afinn = get_sentiments("afinn")
storyafinn <- text_tidy %>%
inner_join(afinn) %>%
group_by(story) %>%
summarise(sentiment = sum(value)) # 以每集為單位將情緒值加總## Joining, by = "word"
storyafinn %>%
ggplot(aes(story, sentiment, fill = "#F7766D")) +
geom_col(show.legend = FALSE) +
scale_x_continuous(breaks=seq(1,62,5)) Afinn最常出現的負面情緒字
## Joining, by = "word"
## Warning: Trying to compute distinct() for variables not found in the data:
## - `rown`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - word
## - story
neg_re<-tt_neg_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
neg_re %>%
top_n(10,wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label=n))+
theme(text=element_text(size=14))+
coord_flip()Afinn最常出現的正面情緒字
## Joining, by = "word"
## Warning: Trying to compute distinct() for variables not found in the data:
## - `rown`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - word
## - story
pos_re <- tt_pos_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
pos_re %>%
top_n(10,wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label=n))+
theme(text=element_text(size=14))+
coord_flip()Bing字典
準備資料
bing = get_sentiments("bing")
grimm_bing <- text_tidy %>%
filter(nchar(.$word)>1) %>%
mutate(nrow = row_number())%>%
inner_join(bing, by = "word")
grimm_bing_all <- grimm_bing%>%
group_by(sentiment)%>%
count(word, sort = TRUE)%>%
ungroup()
grimm_bing_all%>%
group_by(sentiment) %>%
top_n(10,wt = n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()Loughran字典
Loughran最常出現的正/負面情緒字
loughran = get_sentiments("loughran")
loughran_wc = get_sentiments("loughran") %>%
filter(sentiment == "negative" | sentiment == "positive") %>%
inner_join(text_tidy) %>%
count(word, sentiment, sort = T)## Joining, by = "word"
loughran_wc %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()## Selecting by n
四字典各集比較分析
正/負面情緒詞總數差異
每集情緒詞總數比較
grimm_cnt %>%
inner_join(afinn) %>%
group_by(story, value) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_boxplot(aes(x=story, y=value, colour=as.factor(story)),show.legend = FALSE) +
ggtitle("Afinn-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,62,5)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5))## Joining, by = "word"
grimm_cnt %>%
inner_join(nrc) %>%
group_by(story, sentiment) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_line(aes(x=story, y=cnt, colour=sentiment)) +
ggtitle("NRC-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,62,5)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5)) ## Joining, by = "word"
grimm_cnt %>%
inner_join(bing) %>%
group_by(story, sentiment) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_line(aes(x=story, y=cnt, colour=sentiment)) +
ggtitle("Bing-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,62,5)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5)) ## Joining, by = "word"
grimm_cnt %>%
inner_join(loughran) %>%
group_by(story, sentiment) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_line(aes(x=story, y=cnt, colour=sentiment)) +
ggtitle("Loughran-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,62,5)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5)) ## Joining, by = "word"
正面-負面情緒差異
#整理字典"bing"正負面情緒各級出現次數與差值
bing_neg_pos <- text_tidy %>%
dplyr::select(word,story) %>%
inner_join(get_sentiments("bing")) %>%
group_by(story,sentiment) %>%
summarise(count=sum(story))## Joining, by = "word"
bing_trend<-bing_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "bing")
#整理字典"nrc"正負面情緒各級出現次數與差值
nrc_count_pos<-tt_pos_dis%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "positive")
nrc_count_neg<-tt_neg_dis%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "negative")
nrc_neg_pos<-bind_rows(nrc_count_pos,nrc_count_neg)%>%arrange(story)%>%mutate(method='nrc')
nrc_trend<-nrc_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative)
#整理字典"afinn"正負面情緒各級出現次數與差值
afinn_pos <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value>0)%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "positive")## Joining, by = "word"
afinn_neg <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value<0)%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "negative")## Joining, by = "word"
afinn_neg_pos<-bind_rows(afinn_pos,afinn_neg)%>%arrange(story)%>%mutate(method='nrc')
afinn_trend<-afinn_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "AFINN")
#整理字典"loughran"正負面情緒各級出現次數與差值
loughran_data <- text_tidy %>%
dplyr::select(word,story) %>%
inner_join(get_sentiments("loughran")) %>%
group_by(story,sentiment) %>%
summarise(count=sum(story))## Joining, by = "word"
loughran_neg <- subset( loughran_data, sentiment == "negative")
loughran_pos <- subset( loughran_data, sentiment == "positive")
loughran_neg_pos<-bind_rows(loughran_neg,loughran_pos)%>%arrange(story)%>%mutate(method='nrc')
loughran_trend<-loughran_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "loughran")合併四個字典並劃出折線圖
bind_rows(bing_trend,nrc_trend,afinn_trend,loughran_trend) %>%
ggplot(aes(x= story,y=Deviation,col=method)) +
geom_line() +facet_wrap(~method, ncol = 1, scales = "fixed")計算term frequency
book_words = text_tidy %>% count( word,story, sort = T)
total_words <- book_words %>%
group_by(story) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)## Joining, by = "story"
freq_by_rank <- book_words %>%
group_by(story) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank## # A tibble: 15,718 x 6
## # Groups: story [62]
## word story n total rank `term frequency`
## <chr> <int> <int> <int> <int> <dbl>
## 1 hans 42 71 343 1 0.207
## 2 tailor 18 57 991 1 0.0575
## 3 hansel 19 45 905 1 0.0497
## 4 gretel 42 44 343 2 0.128
## 5 peasant 28 42 589 1 0.0713
## 6 bird 40 39 987 1 0.0395
## 7 wife 10 37 592 1 0.0625
## 8 gretel 19 36 905 2 0.0398
## 9 shudder 58 34 1024 1 0.0332
## 10 snowdrop 31 33 701 1 0.0471
## # … with 15,708 more rows
計算TF-IDF
book_words <- book_words %>%
bind_tf_idf(word, story, n)
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))## # A tibble: 15,718 x 6
## word story n tf idf tf_idf
## <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 hans 42 71 0.207 2.74 0.567
## 2 gretel 42 44 0.128 3.03 0.388
## 3 doctor 54 18 0.0807 4.13 0.333
## 4 elsie 33 27 0.0714 4.13 0.295
## 5 sparrow 8 28 0.0650 4.13 0.268
## 6 fox 56 28 0.115 2.18 0.251
## 7 sultan 5 15 0.0584 4.13 0.241
## 8 rapunzel 16 24 0.0562 4.13 0.232
## 9 fundevogel 17 13 0.0544 4.13 0.224
## 10 peasant 28 42 0.0713 3.03 0.216
## # … with 15,708 more rows
同一行裡最常一起出現的詞彙
library(widyr)
text_tidy_cors <- text_tidy %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, linenumber, sort = TRUE)
set.seed(2016)
text_tidy_cors %>%
filter(abs(correlation) > .01) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()LDA分析
主題-文字分析
word_counts <- text_tidy %>%
count(story, word, sort = TRUE) %>%
ungroup()
episode_dtm <- word_counts %>%
cast_dtm(story, word, n)
episode_dtm## <<DocumentTermMatrix (documents: 62, terms: 4397)>>
## Non-/sparse entries: 15718/256896
## Sparsity : 94%
## Maximal term length: 15
## Weighting : term frequency (tf)
4個主題 Beta值
episode_lda <- LDA(episode_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(episode_lda, matrix = "beta")
text_topics %>% head(10)## # A tibble: 10 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 hans 3.27e- 3
## 2 2 hans 9.92e- 3
## 3 3 hans 3.60e- 3
## 4 4 hans 2.76e-164
## 5 1 tailor 1.31e- 4
## 6 2 tailor 2.25e-118
## 7 3 tailor 8.17e- 3
## 8 4 tailor 8.22e- 4
## 9 1 hansel 1.74e-171
## 10 2 hansel 6.28e- 3
選出各主題前5高的文字
top_terms <- text_topics %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>% head(10)## # A tibble: 10 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 king 0.0144
## 2 1 fox 0.00982
## 3 1 father 0.00865
## 4 1 princess 0.00852
## 5 1 till 0.00691
## 6 2 gretel 0.0133
## 7 2 wife 0.0111
## 8 2 hans 0.00992
## 9 2 day 0.00868
## 10 2 mother 0.00862
remove_word = c("day","time","boy","hans","son","father","mother","women","wife","door","set","gretel","hans","hansel","king","elsie","beautiful","woman")
episode_lda <- LDA(episode_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(episode_lda, matrix = "beta")
top_terms <- text_topics %>%
filter(!term %in% remove_word)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()- 第一個主題由於是鄉野傳說常以野獸精怪作為主體去敘述故事
- 第二個則是因為故事的搜集並非都通過沒有受過教育的農民,其中的大部分倒是經由受過良好教育的中產階級女性的講述,所以故事多會跟家庭相關
- 第三個則是跟當時的產業結構有所相關
- 第四個是因為當時的歷史背景仍是19世紀,所以當時蒐集的故事難免都會與貴族王室有所關聯