格林童話文字分析
資料介紹
- Data Source: 古騰堡電子書–格林童話
- 共 62 篇故事
文本探索
載入套件
library(data.table)
library(dplyr)
library(ggraph)
library(ggplot2)
library(gutenbergr)
library(hablar)
library(htmlwidgets)
library(janeaustenr)
library(igraph)
library(jiebaR)
library(LDAvis)
library(ramify)
library(purrr)
library(RColorBrewer)
library(readr)
library(reshape2)
library(servr)
library(stringr)
library(text2vec)
library(textdata)
library(textshape)
library(tidyr)
library(tidytext)
library(tidyverse)
library(tm)
library(topicmodels)
library(udpipe)
library(webshot)
library(widyr)
library(wordcloud)
library(wordcloud2)讀取資料
文字前處理
查看斷詞結果
text_tidy = tibble(text) %>%
unnest_tokens(word, text) %>%
# 標註第幾篇故事
mutate(linenumber = row_number(), story = cumsum(str_detect(word, regex("^story([1-9]|[1-6][0-9])$", ignore_case = T))))
text_tidy %>%
head(10)## # A tibble: 10 x 3
## word linenumber story
## <chr> <int> <int>
## 1 story1 1 1
## 2 the 2 1
## 3 golden 3 1
## 4 bird 4 1
## 5 a 5 1
## 6 certain 6 1
## 7 king 7 1
## 8 had 8 1
## 9 a 9 1
## 10 beautiful 10 1
排除停用字
## Joining, by = "word"
文字雲分析
總體文字雲
區分正、負面情緒的文字雲 (採用 Bing 字典)
text_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)## Joining, by = "word"
- 正面詞彙字頻較大
- 形容詞較多
情緒分析
四個情緒字典介紹
- NRC (10 Category)
34% (+): anticipation, joy, positive, surprise, trust
66% (-): anger, disgust, fear, negative, sadness
- Afinn (Numeric: -5~+5, mean: -0.6)
- Bing (2 Category: 30% Positive, 70% Negative)
- Loughran (6 Category)
(+): positive
(-): constraining(被迫/限制), litigious(好爭論的), negative, superfluous(多餘的), uncertainty
NRC字典
準備資料
NRC 最常出現的負面情緒字
tt_neg <- filter(grimmnrc, sentiment == 'negative'| sentiment== 'anger'| sentiment== 'disgust'| sentiment == 'fear'| sentiment == 'sadness')
tt_neg_dis <- distinct(tt_neg, word, rown, story)
neg_re <- tt_neg_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
neg_re %>%
top_n(10, wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label = n))+
theme(text=element_text(size = 14))+
coord_flip()NRC 最常出現的正面情緒字
tt_pos <- filter(grimmnrc,sentiment == 'anticipation'| sentiment== 'joy'| sentiment== 'surprise'| sentiment == 'trust'| sentiment == 'positive')
tt_pos_dis <- distinct(tt_pos, word, rown, story)
pos_re <- tt_pos_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
pos_re %>%
top_n(10, wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label = n))+
theme(text=element_text(size = 14))+
coord_flip()找出 “king” 最常出現的前三大故事
pos_king <- tt_pos_dis %>%
filter(word == 'king') %>%
group_by(story) %>%
count(word, sort = TRUE) %>%
ungroup()
pos_king %>% top_n(3)## Selecting by n
## # A tibble: 3 x 3
## story word n
## <int> <chr> <int>
## 1 61 king 31
## 2 60 king 24
## 3 59 king 19
分別為以下三個故事:
* 故事 61 “CAT-SKIN”
* 故事 60 “IRON HANS”
* 故事 59 “KING GRISLY-BEARD”
Afinn 字典
準備資料
afinn = get_sentiments("afinn")
storyafinn <- text_tidy %>%
inner_join(afinn) %>%
group_by(story) %>%
# 以每個故事為單位將情緒值加總
summarise(sentiment = sum(value)) ## Joining, by = "word"
每個故事情緒加總變化趨勢
storyafinn %>%
ggplot(aes(story, sentiment, fill = "#F7766D")) +
geom_col(show.legend = FALSE) +
scale_x_continuous(breaks = seq(1, 62, 5)) - 故事 8 “THE DOG AND THE SPARROW” 最負面
- 故事 2 “HANS IN LUCK” 最正面
Afinn 最常出現的負面情緒字
## Joining, by = "word"
## Warning: Trying to compute distinct() for variables not found in the data:
## - `rown`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - word
## - story
neg_re <- tt_neg_dis %>%
group_by(word) %>%
count(word, sort = TRUE) %>%
ungroup()
neg_re %>%
top_n(10, wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label = n))+
theme(text=element_text(size = 14))+
coord_flip()Afinn 最常出現的正面情緒字
## Joining, by = "word"
## Warning: Trying to compute distinct() for variables not found in the data:
## - `rown`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - word
## - story
pos_re <- tt_pos_dis %>%
group_by(word) %>%
count(word, sort = TRUE) %>%
ungroup()
pos_re %>%
top_n(10, wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label = n))+
theme(text=element_text(size = 14))+
coord_flip()Bing 字典
準備資料
Bing 最常出現的正/負面情緒字
grimm_bing_all <- grimm_bing %>%
group_by(sentiment) %>%
count(word, sort = TRUE) %>%
ungroup()
grimm_bing_all %>%
group_by(sentiment) %>%
top_n(10, wt = n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()Loughran 字典
Loughran 中各情緒類型包含多少字
Loughran 最常出現的正/負面情緒字
loughran_wc <- get_sentiments("loughran") %>%
filter(sentiment == "negative" | sentiment == "positive") %>%
inner_join(text_tidy) %>%
count(word, sentiment, sort = T)
loughran_wc %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()四字典各故事比較分析
正/負面情緒詞總數差異
準備資料
每個故事情緒詞總數比較
grimm_cnt %>%
inner_join(afinn) %>%
group_by(story, value) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_boxplot(aes(x=story, y=value, colour=as.factor(story)),show.legend = FALSE) +
ggtitle("Afinn-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,62,5)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5))grimm_cnt %>%
inner_join(nrc) %>%
group_by(story, sentiment) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_line(aes(x=story, y=cnt, colour=sentiment)) +
ggtitle("NRC-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,62,5)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5)) grimm_cnt %>%
inner_join(bing) %>%
group_by(story, sentiment) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_line(aes(x=story, y=cnt, colour=sentiment)) +
ggtitle("Bing-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,62,5)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5)) #grimm_cnt %>%
# inner_join(loughran) %>%
# group_by(story, sentiment) %>%
# summarise(cnt = sum(count)) %>% ggplot() +
# geom_line(aes(x=story, y=cnt, colour=sentiment)) +
# ggtitle("Loughran-每集情緒詞數量差異") +
# scale_x_continuous(breaks=seq(1,62,5)) +
# theme(text=element_text(family="蘋方-繁 中黑體", size=12),
# plot.title=element_text(hjust = 0.5)) 正面-負面情緒差異
準備資料
# 整理字典 "bing" 正負面情緒各級出現次數與差值
bing_neg_pos <- text_tidy %>%
dplyr::select(word,story) %>%
inner_join(get_sentiments("bing")) %>%
group_by(story,sentiment) %>%
summarise(count=sum(story))## Joining, by = "word"
bing_trend<-bing_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "bing")
# 整理字典 "nrc" 正負面情緒各級出現次數與差值
nrc_count_pos<-tt_pos_dis%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "positive")
nrc_count_neg<-tt_neg_dis%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "negative")
nrc_neg_pos<-bind_rows(nrc_count_pos,nrc_count_neg)%>%arrange(story)%>%mutate(method='nrc')
nrc_trend<-nrc_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative)
# 整理字典 "afinn" 正負面情緒各級出現次數與差值
afinn_pos <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value>0)%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "positive")## Joining, by = "word"
## Joining, by = "word"
afinn_neg_pos<-bind_rows(afinn_pos,afinn_neg)%>%arrange(story)%>%mutate(method='nrc')
afinn_trend<-afinn_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "AFINN")
# 整理字典 "loughran" 正負面情緒各級出現次數與差值
loughran_data <- text_tidy %>%
dplyr::select(word,story) %>%
inner_join(get_sentiments("loughran")) %>%
group_by(story,sentiment) %>%
summarise(count=sum(story))## Joining, by = "word"
loughran_neg <- subset( loughran_data, sentiment == "negative")
loughran_pos <- subset( loughran_data, sentiment == "positive")
#loughran_neg_pos<-bind_rows(loughran_neg,loughran_pos)%>%arrange(story)%>%mutate(method='nrc')
#loughran_trend<-loughran_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "loughran")情緒分析:主要針對NRC字典,Afinn字典,Bing字典,loughran字典
tidy_fairyT <- fairyText %>%
unnest_tokens(word, text) %>%
anti_join(stop_words,by = "word")
#sum(fairyT_g$n)
fairyT_g<-tidy_fairyT %>%
group_by(title)%>%
count(word, sort = TRUE)%>%
ungroup()
fairyT_bing<-fairyT_g %>%
inner_join(get_sentiments("bing"),by="word")%>%
group_by(title)%>%
count(sentiment, sort = TRUE)%>%
ungroup()
fiaryBing<-left_join(fairyT_bing%>%filter(sentiment=='negative'),fairyT_bing%>%filter(sentiment=='positive'), by = c("title"="title"))%>%
mutate(colname = n.x-(ifelse(is.na(n.y),0,n.y)))%>%
mutate(n=ifelse(colname>0,'bing_negative','bing_positive'))%>%
mutate(t='bing')%>%
select(title,n,t)
fiaryAfinn<-fairyT_g %>%
inner_join(get_sentiments("afinn"),by="word")%>%
group_by(title)%>%
summarise(value = sum(value))%>%
#count(value, sort = TRUE)%>%
ungroup()%>%
mutate(n=ifelse(value>0,'afinn_positive','afinn_negative'))%>%
mutate(t='afinn')%>%
select(title,n,t)
fiaryT_nrc<-fairyT_g %>%
inner_join(get_sentiments("nrc"),by="word")%>%
#filter(sentiment %in% c('negative','positive'))
mutate(li=ifelse(sentiment %in% c('negative','anger','disgust','fear','sadness','anticipation'), "nrc_negative", "nrc_positive"))%>%
group_by(title,li)%>%
summarise(sn = sum(n))
fiaryNrc<-fiaryT_nrc%>%
filter(li=='nrc_positive')%>%
inner_join(fiaryT_nrc%>% filter(li=='nrc_negative'),by =c("title"="title"))%>%
mutate(n=ifelse((sn.y-sn.x)>0,'nrc_positive','nrc_negative'))%>%
mutate(t='nrc')%>%
select(title,n,t)
fiaryNrc = as.data.frame(fiaryNrc)
fairyT_loughran<-fairyT_g %>%
inner_join(get_sentiments("loughran"),by="word")%>%
filter(sentiment %in% c('negative','positive'))%>%
group_by(title)%>%
count(sentiment, sort = TRUE)%>%
ungroup()
fiaryloughran<-left_join(fairyT_loughran%>%filter(sentiment=='negative'),fairyT_loughran%>%filter(sentiment=='positive'), by = c("title"="title"))%>%
mutate(colname = n.x-(ifelse(is.na(n.y),0,n.y)))%>%
mutate(n=ifelse(colname>0,'loughran_negative','loughran_positive'))%>%
mutate(t='loughran')%>%
select(title,n,t)
fiaryAll<-rbind(rbind(rbind(fiaryAfinn,fiaryBing),fiaryNrc),fiaryloughran)
a1<-fiaryAll%>%
group_by(n)%>%
count()%>%
separate(n,c("f","t"))
ggplot(data=a1, aes(x=f, y=nn, fill=t)) +
geom_bar(stat="identity", width=0.2)+
scale_fill_brewer(palette="Paired")+
theme_minimal()+
labs(x = "Contribution to sentiment",
y = "story number")+
theme(axis.text.x = element_text(angle = 0, hjust = 1))- 利用Afinn字典,loughran字典,Bing字典分析,大部份的童話故事都偏負面。NRC字典分析,正負面比較平均。可以看的出格林童話未必適合小朋友去閱讀。
TF-IDF 分析
計算 term frequency
book_words <- text_tidy %>%
count(word, story, sort = T)
total_words <- book_words %>%
group_by(story) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)## Joining, by = "story"
freq_by_rank <- book_words %>%
group_by(story) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank## # A tibble: 15,718 x 6
## # Groups: story [62]
## word story n total rank `term frequency`
## <chr> <int> <int> <int> <int> <dbl>
## 1 hans 42 71 343 1 0.207
## 2 tailor 18 57 991 1 0.0575
## 3 hansel 19 45 905 1 0.0497
## 4 gretel 42 44 343 2 0.128
## 5 peasant 28 42 589 1 0.0713
## 6 bird 40 39 987 1 0.0395
## 7 wife 10 37 592 1 0.0625
## 8 gretel 19 36 905 2 0.0398
## 9 shudder 58 34 1024 1 0.0332
## 10 snowdrop 31 33 701 1 0.0471
## # … with 15,708 more rows
計算 TF-IDF
book_words <- book_words %>%
bind_tf_idf(word, story, n)
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))## # A tibble: 15,718 x 6
## word story n tf idf tf_idf
## <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 hans 42 71 0.207 2.74 0.567
## 2 gretel 42 44 0.128 3.03 0.388
## 3 doctor 54 18 0.0807 4.13 0.333
## 4 elsie 33 27 0.0714 4.13 0.295
## 5 sparrow 8 28 0.0650 4.13 0.268
## 6 fox 56 28 0.115 2.18 0.251
## 7 sultan 5 15 0.0584 4.13 0.241
## 8 rapunzel 16 24 0.0562 4.13 0.232
## 9 fundevogel 17 13 0.0544 4.13 0.224
## 10 peasant 28 42 0.0713 3.03 0.216
## # … with 15,708 more rows
同一行裡最常一起出現的詞彙
text_tidy_cors <- text_tidy %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, linenumber, sort = TRUE)
set.seed(2016)
text_tidy_cors %>%
filter(abs(correlation) > .01) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()故事背景圍繞在皇室貴族、平民家庭、等不同接階級的故事
LDA 分析
將資料轉成 DocumentTermMatrix (DTM) 格式
word_counts <- text_tidy %>%
count(story, word, sort = TRUE) %>%
ungroup()
story_dtm <- word_counts %>%
cast_dtm(story, word, n)
story_dtm## <<DocumentTermMatrix (documents: 62, terms: 4397)>>
## Non-/sparse entries: 15718/256896
## Sparsity : 94%
## Maximal term length: 15
## Weighting : term frequency (tf)
主題-文字分析
4 個主題 Beta 值
# 建立一個 k 個 topic 的 LDA model
story_lda <- LDA(story_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(story_lda, matrix = "beta")
text_topics %>% head(10)## # A tibble: 10 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 hans 3.27e- 3
## 2 2 hans 9.92e- 3
## 3 3 hans 3.60e- 3
## 4 4 hans 2.76e-164
## 5 1 tailor 1.31e- 4
## 6 2 tailor 2.25e-118
## 7 3 tailor 8.17e- 3
## 8 4 tailor 8.22e- 4
## 9 1 hansel 1.74e-171
## 10 2 hansel 6.28e- 3
選出各主題前 5 高的文字
top_terms <- text_topics %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>% head(10)## # A tibble: 10 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 king 0.0144
## 2 1 fox 0.00982
## 3 1 father 0.00865
## 4 1 princess 0.00852
## 5 1 till 0.00691
## 6 2 gretel 0.0133
## 7 2 wife 0.0111
## 8 2 hans 0.00992
## 9 2 day 0.00868
## 10 2 mother 0.00862
story_lda <- LDA(story_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(story_lda, matrix = "beta")
remove_word = c("day","time","boy","hans","son","father","mother","women","wife","door","set","gretel","hans","hansel","king","elsie","beautiful","woman")
top_terms <- text_topics %>%
filter(!term %in% remove_word)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()- 第一個主題由於是鄉野傳說常以野獸精怪作為主體去敘述故事
- 第二個則是因為故事的搜集並非都通過沒有受過教育的農民,其中的大部分倒是經由受過良好教育的中產階級女性的講述,所以故事多會跟家庭相關
- 第三個則是跟當時的產業結構有所相關
- 第四個是因為當時的歷史背景仍是19世紀,所以當時蒐集的故事難免都會與貴族王室有所關聯
主題-文字分析-Beta值(seed=2000)
- 分四個主題去分析
#sum(fairyT_g$n)
title_dtm_beta <- fairyT_g %>%
cast_dtm(title, word, n)
#title_dtm
title_lda_beta <- LDA(title_dtm_beta, k = 4, control = list(seed = 2000))
text_topics_beta <- tidy(title_lda_beta, matrix = "beta")
#text_topics %>% head(10)
top_terms_beta <- text_topics_beta %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
#arrange(topic, desc(beta) ) %>%
#group_by(topic) %>%
#mutate(id = row_number():n())
top_terms_beta %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
# scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip() - 從文字上,我們還看不出每個一主題要表達的含意,我們再針對每個文字跟格林童話故事之間的分佈來探討
- 先分析四個主題文字的相依性會不會太重疊
top_terms_beta <- text_topics_beta %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
aa<-top_terms_beta%>%
select(topic,term)
gb <- graph_from_data_frame(d=aa, directed=F)
ceb <- cluster_fast_greedy(gb)
V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "square", "circle")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 15, 10)
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name),V(gb)$name, "" )
plot(ceb,gb)我們發現只有主題一比較獨立,其他主題比較接近。所以我們再針對“主題一”作分析
針對“主題一”的文字再作細部分析
link_b<-top_terms_beta%>%
inner_join(fairyT_g, by=c("term"="word"))%>%
inner_join(story_title, by=c("title"="text"))%>%
filter(topic=='1')%>%
select(term,line)
gb <- graph_from_data_frame(d=link_b, directed=F)
V(gb)$color <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "lightsteelblue", "white")
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "", V(gb)$name)
V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "circle", "square")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 6, 10)
V(gb)$label.color<-"darkred"
V(gb)$frame.color <- "white"
V(gb)$type <- bipartite.mapping(gb)$type
V(gb)$index<-c(1:70)## Warning in vattrs[[name]][index] <- value: 被替換的項目不是替換值長度的倍數
V(gb)$label.dist=ifelse(V(gb)$index%%3==1 ,-1.5,ifelse(V(gb)$index%%3==2,0.5,1))
plot(gb, layout = layout_with_gem) + 從上圖分佈,某些字聚在一起,有father/mother/children/door/red/bird等字,表示會出現在固定某幾個童話故事,且會偏向跟家庭成員故事有關, 我們從father/mother/children/door/red/bird等字去找出幾個故事。
- 像是:
- 故事40,THE JUNIPER-TREE(杜松樹),講述繼母想殺死丈夫和前妻的小孩的故事
- 故事19,HANSEL AND GRETEL(糖果屋),繼母把小孩騙出去,然後小孩又回家的故事
LDAvis分析(分10主題)
https://jhnny009.github.io/studyG7/
dtf <- document_term_frequencies(tidy_fairyT, document = "title", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 15)
dim(dtm_clean)## [1] 62 449
set.seed(2000)
topic_n = 10
lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr =lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5,check_convergence_every_n = 100)## INFO [08:49:10.234] early stopping at 130 iteration
## INFO [08:49:10.322] early stopping at 30 iteration
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "princess" "red" "door" "king" "prince" "hans"
## [2,] "horse" "cap" "ran" "king's" "dwarf" "gretel"
## [3,] "bride" "youth" "cut" "son" "life" "hansel"
## [4,] "happened" "cow" "fire" "soldier" "till" "goodbye"
## [5,] "girl" "fish" "eyes" "shudder" "thou" "cellar"
## [6,] "huntsman" "fisherman" "left" "wine" "rest" "cart"
## [7,] "met" "wife" "child" "daughter" "twelve" "cock"
## [8,] "father" "grandmother" "heard" "learn" "brother" "crept"
## [9,] "court" "shepherd" "answered" "rapunzel" "thee" "doctor"
## [10,] "true" "flower" "table" "castle" "brothers" "miser"
## [,7] [,8] [,9] [,10]
## [1,] "children" "cat" "queen" "boy"
## [2,] "night" "fox" "home" "tailor"
## [3,] "rose" "wolf" "cook" "mother"
## [4,] "leave" "mouse" "peasant" "tree"
## [5,] "forest" "master" "found" "father"
## [6,] "carried" "wood" "water" "snow"
## [7,] "bear" "walk" "fine" "bird"
## [8,] "dead" "bird" "lady" "wild"
## [9,] "black" "tail" "kitchen" "snowdrop"
## [10,] "chanticleer" "hair" "heart" "blood"
- 針對LDAvis的10個主題,再篩選,只留動物類跟角色名詞(像是動詞/形容詞很常在很多故事出現,反而難分析)
## Warning: The `x` argument of `as_tibble.matrix()` must have column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#tidy(lda_text_t)
LDAvis10<-lda_text_t%>%
gather(m,n,V1:V10)%>%
inner_join(fairyT_g, by=c("n"="word"))%>%
inner_join(story_title, by=c("title"="text"))%>%
select(m,n,line)
remove_word=c('happened','red','cap','youth','door','ran','cut','fire','eyes','life','shudder','till','thou','goodbye','cellar','night','leave','forest',
'home','cook','found','tailor','king\'s','met','true','shepherd','left','heard','answered','table','wine','learn','castle','rest','twelve',
'thee','cart','crept','miser','carried','dead','black','wood',"walk","tail","hair","water","fine","kitchen","heart","snow","wild","snowdrop",
"blood")
colnames(LDAvis10)[colnames(LDAvis10) == 'm'] <- 'topic'
colnames(LDAvis10)[colnames(LDAvis10) == 'n'] <- 'term'
LDAvis10<-LDAvis10%>%
filter(!term %in% remove_word)
all10<-LDAvis10%>%
# filter(topic=='V4')%>%
select(topic,line)
gb <- graph_from_data_frame(d=all10, directed=F)
V(gb)$color <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "lightsteelblue", "white")
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "", V(gb)$name)
V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "circle", "square")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 6, 10)
V(gb)$label.color<-"darkred"
V(gb)$frame.color <- "white"
V(gb)$type <- bipartite.mapping(gb)$type
plot(gb, layout = layout_components)## # A tibble: 32 x 3
## topic line text
## <chr> <int> <chr>
## 1 V6 42 CLEVER HANS
## 2 V6 2 HANS IN LUCK
## 3 V6 33 CLEVER ELSIE
## 4 V6 60 IRON HANS
## 5 V6 19 HANSEL AND GRETEL
## 6 V6 26 CLEVER GRETEL
## 7 V6 4 THE TRAVELLING MUSICIANS
## 8 V6 21 MOTHER HOLLE
## 9 V6 54 DOCTOR KNOWALL
## 10 V6 48 THE BLUE LIGHT
## # … with 22 more rows
- 從上圖可以發現,大部份主題都蠻接近的。可能大部份故事用的角色名稱跟動物都是一樣的,像是有些故事都有鳥/魚/國王/女王。
- 唯獨“主題6”,“主題9”有比較聚焦的故事。
- 主題6-主要的因為文字 HANS/Gretel/Hansel都是某些故事角色姓名,所以只會連結到相應的故事。
- 主題9-找到都是跟女王/皇后有關係的故事。像是故事31.SNOWDROP(白雪公主)有壞皇后。
文件-主題分析
- 分成 4 個 topic 下,找出各篇故事適合分類到哪個 topic
- gamma 為 per-document-per-topic probabilities
計算每個主題的故事數量
story_gamma <- tidy(story_lda, matrix = "gamma")
story_gamma2 <- story_gamma %>%
group_by(document) %>%
filter(gamma == (max(gamma)))
table(story_gamma2$topic)##
## 1 2 3 4
## 16 16 14 16
各主題的故事數量較平均
畫出各篇故事屬於各個 topic 的機率
for(i in c(1:4)){
picture <- story_gamma %>%
filter(document %in%
c(story_title %>%
filter(line > 16*(i-1) & line <= 16*i) %>%
select(line))$line) %>%
mutate(title = reorder(document, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ title)
print(picture)
}- 大部分的故事都有一個明確的主題,例如故事 1, 2 等
- 少數主題並無明確主題,例如故事 7, 5
網路分析
建立網路關係
link <- story_gamma %>%
# 只篩選 gamma 值大於 0.25
filter(gamma > 0.25) %>%
# 把 gamma 值當成 link 的權重
rename(weight = gamma)
# 把 topic 欄位的 1 取代成 topic 1,依此類推,避免與 document 的 1 混淆
link$topic <- link$topic %>%
gsub(1, "topic 1", .) %>%
gsub(2, "topic 2", .) %>%
gsub(3, "topic 3", .) %>%
gsub(4, "topic 4", .)
# 建立無向圖
TopicNetwork <- graph_from_data_frame(d = link, directed = F)
TopicNetwork## IGRAPH 7e35733 UNW- 66 64 --
## + attr: name (v/c), weight (e/n)
## + edges from 7e35733 (vertex names):
## [1] 61--topic 1 56--topic 1 24--topic 1 1 --topic 1 2 --topic 1 51--topic 1
## [7] 12--topic 1 37--topic 1 52--topic 1 53--topic 1 5 --topic 1 11--topic 1
## [13] 46--topic 1 57--topic 1 7 --topic 1 47--topic 1 44--topic 1 42--topic 2
## [19] 19--topic 2 40--topic 2 10--topic 2 15--topic 2 8 --topic 2 29--topic 2
## [25] 16--topic 2 13--topic 2 4 --topic 2 20--topic 2 21--topic 2 26--topic 2
## [31] 3 --topic 2 7 --topic 2 43--topic 2 18--topic 3 28--topic 3 58--topic 3
## [37] 60--topic 3 33--topic 3 50--topic 3 35--topic 3 49--topic 3 54--topic 3
## [43] 17--topic 3 23--topic 3 6 --topic 3 41--topic 3 27--topic 3 31--topic 4
## + ... omitted several edges
- U : undirected graph 無向圖
- N : 節點包含 name 屬性
- W : 連結包含 weight 屬性
畫出網路圖
# 顯示有超過 5 個關聯的節點名稱
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5,
vertex.label = ifelse(degree(TopicNetwork) > 5, V(TopicNetwork)$name, NA), vertex.label.font = 15)可看出主題 3 較獨立於其他三個主題
網路屬性
節點屬性
## $name
## [1] "61" "56" "24" "1" "2" "51" "12"
## [8] "37" "52" "53" "5" "11" "46" "57"
## [15] "7" "47" "44" "42" "19" "40" "10"
## [22] "15" "8" "29" "16" "13" "4" "20"
## [29] "21" "26" "3" "43" "18" "28" "58"
## [36] "60" "33" "50" "35" "49" "54" "17"
## [43] "23" "6" "41" "27" "31" "22" "62"
## [50] "48" "9" "59" "14" "34" "32" "30"
## [57] "36" "25" "38" "45" "55" "39" "topic 1"
## [64] "topic 2" "topic 3" "topic 4"
節點包含各篇故事 (1 ~ 62) 和各主題 (topic1, topic2, topic3, topic4)
設定連結屬性
# 設定連結的 type 為主題分類
E(TopicNetwork)$type <- link$topic
# 設定 weight
E(TopicNetwork)$weight <- link$weight
edge_attr(TopicNetwork)## $weight
## [1] 0.9998963 0.9997504 0.9999133 0.9999143 0.9999096 0.9999217 0.9998272
## [8] 0.9998145 0.9998049 0.9999051 0.5826817 0.9998011 0.9764535 0.9998945
## [15] 0.4776732 0.9995984 0.9993411 0.9998231 0.9999330 0.9999385 0.9632137
## [22] 0.9998963 0.9998592 0.9998806 0.9998579 0.9997841 0.9998456 0.9997408
## [29] 0.9998391 0.9997728 0.9998343 0.5222429 0.8036270 0.9999388 0.9998970
## [36] 0.9999408 0.9999313 0.9998395 0.9998596 0.9999192 0.9999071 0.9997280
## [43] 0.9997462 0.9998621 0.9996210 0.9778185 0.9991582 0.9999135 0.9998624
## [50] 0.9999282 0.9998794 0.9998667 0.9998583 0.8975101 0.9998247 0.9998838
## [57] 0.4171610 0.9998602 0.9559805 0.9998184 0.9997419 0.9998535 0.8095904
## [64] 0.9996808
##
## $type
## [1] "topic 1" "topic 1" "topic 1" "topic 1" "topic 1" "topic 1" "topic 1"
## [8] "topic 1" "topic 1" "topic 1" "topic 1" "topic 1" "topic 1" "topic 1"
## [15] "topic 1" "topic 1" "topic 1" "topic 2" "topic 2" "topic 2" "topic 2"
## [22] "topic 2" "topic 2" "topic 2" "topic 2" "topic 2" "topic 2" "topic 2"
## [29] "topic 2" "topic 2" "topic 2" "topic 2" "topic 2" "topic 3" "topic 3"
## [36] "topic 3" "topic 3" "topic 3" "topic 3" "topic 3" "topic 3" "topic 3"
## [43] "topic 3" "topic 3" "topic 3" "topic 3" "topic 3" "topic 4" "topic 4"
## [50] "topic 4" "topic 4" "topic 4" "topic 4" "topic 4" "topic 4" "topic 4"
## [57] "topic 4" "topic 4" "topic 4" "topic 4" "topic 4" "topic 4" "topic 4"
## [64] "topic 4"
Transitivity 遞移性
## [1] 0
該網路沒有 Transitivity,沒有 triangle loop
degree
deg <- degree(TopicNetwork, mode = "all")
plot(TopicNetwork,
# 依照 degree 大小設定節點大小
vertex.size = deg * 3,
# 只顯示 degree 大於 5 的節點名稱
vertex.label = ifelse(deg > 5, V(TopicNetwork)$name, NA))# Histogram of node degree
hist(deg, breaks = 1:vcount(TopicNetwork)-1, main = "Histogram of node degree")# Degree distribution
deg.dist <- degree_distribution(TopicNetwork, cumulative=T, mode="all")
plot( x=0:max(deg), y=1-deg.dist, pch=19, cex=1.2, col="orange", xlab="Degree", ylab="Cumulative Frequency")由圖可知只有 topic 1, topic 2, topic 3, topic 4 這四個節點會有較多的連結,其他節點大多只有一兩個連結,因為通常一個故事只會歸屬一種主題。
Centrality
Degree Centrality
number of ties
## 61 56 24 1 2 51 12 37 52 53
## 1 1 1 1 1 1 1 1 1 1
## 5 11 46 57 7 47 44 42 19 40
## 2 1 1 1 2 1 1 1 1 1
## 10 15 8 29 16 13 4 20 21 26
## 1 1 1 1 1 1 1 1 1 1
## 3 43 18 28 58 60 33 50 35 49
## 1 1 1 1 1 1 1 1 1 1
## 54 17 23 6 41 27 31 22 62 48
## 1 1 1 1 1 1 1 1 1 1
## 9 59 14 34 32 30 36 25 38 45
## 1 1 1 1 1 1 1 1 1 1
## 55 39 topic 1 topic 2 topic 3 topic 4
## 1 1 17 16 14 17
## $res
## [1] 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1
## [26] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [51] 1 1 1 1 1 1 1 1 1 1 1 1 17 16 14 17
##
## $centralization
## [1] 0.2317016
##
## $theoretical_max
## [1] 4290
- 各主題 (topic1, topic2, topic3, topic4) 的節點的 Degree Centrality 較高
- res : 節點中心度
- centralization
- theoretical_max : 最大中心化分數
Eigenvector Centrality
centrality proportional to the sum of connection centralities
Betweenness Centrality
centrality based on a broker position connecting others
## 61 56 24 1 2 51 12 37 52 53
## 0 0 0 0 0 0 0 0 0 0
## 5 11 46 57 7 47 44 42 19 40
## 561 0 0 0 544 0 0 0 0 0
## 10 15 8 29 16 13 4 20 21 26
## 0 0 0 0 0 0 0 0 0 0
## 3 43 18 28 58 60 33 50 35 49
## 0 0 0 0 0 0 0 0 0 0
## 54 17 23 6 41 27 31 22 62 48
## 0 0 0 0 0 0 0 0 0 0
## 9 59 14 34 32 30 36 25 38 45
## 0 0 0 0 0 0 0 0 0 0
## 55 39 topic 1 topic 2 topic 3 topic 4
## 0 0 936 630 91 664
## [1] 50 50 50 50 50 50 50 50 50 50 594 50 50 50 578 50 50 50 50
## [20] 50 50 50 50 50 50 50 50 50 50 50 50 560 50 14 14 14 14 14
## [39] 14 14 14 14 14 14 14 14 14 50 50 50 50 50 50 50 50 50 578
## [58] 50 50 50 50 50 50 50
## $res
## [1] 0 0 0 0 0 0 0 0 0 0 561 0 0 0 544 0 0 0 0
## [20] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [58] 0 0 0 0 0 936 630 91 664
##
## $centralization
## [1] 0.4315828
##
## $theoretical_max
## [1] 135200
- 各主題 (topic1, topic2, topic3, topic4) 節點的 Betweenness Centrality 最高
- 故事 5, 7 節點也具一定程度的 Betweenness Centrality
- 其他故事節點的 Betweenness Centrality 則為 0
Find cliques
net.sym <- as.undirected(TopicNetwork, mode = "collapse", edge.attr.comb = list(weight = "sum", "ignore"))
#cliques(net.sym) # list of cliques
sapply(cliques(net.sym), length) # clique sizes## [1] 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
## [38] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [75] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 2 1 2 1 2 1 2 2 1
## [112] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
#largest_cliques(net.sym) # cliques with max number of nodes
vcol <- rep("grey80", vcount(net.sym))
vcol[unlist(largest_cliques(net.sym))] <- "gold"
plot(as.undirected(net.sym), vertex.label=V(net.sym)$name, vertex.color=vcol)Community detection
## Warning in cluster_edge_betweenness(TopicNetwork): At community.c:
## 460 :Membership vector will be selected based on the lowest modularity score.
## Warning in cluster_edge_betweenness(TopicNetwork): At community.c:
## 467 :Modularity calculation with weighted edge betweenness community detection
## might not make sense -- modularity treats edge weights as similarities while
## edge betwenness treats them as distances
結論
62 篇的格林童話,做文字與主題分析,會因為很多故事的用字,都使用相同的名詞或名稱,導致分析上,需要再作細部折解才有辨法看出主題的方向。