格林童話文字分析 part2
載入套件
讀取資料
text <- c("THE GOLDEN BIRD",
"HANS IN LUCK",
"JORINDA AND JORINDEL",
"THE TRAVELLING MUSICIANS",
"OLD SULTAN",
"THE STRAW, THE COAL, AND THE BEAN",
"BRIAR ROSE",
"THE DOG AND THE SPARROW",
"THE TWELVE DANCING PRINCESSES",
"THE FISHERMAN AND HIS WIFE",
"THE WILLOW-WREN AND THE BEAR",
"THE FROG-PRINCE",
"CAT AND MOUSE IN PARTNERSHIP",
"THE GOOSE-GIRL",
"THE ADVENTURES OF CHANTICLEER AND PARTLET",
"RAPUNZEL",
"FUNDEVOGEL",
"THE VALIANT LITTLE TAILOR",
"HANSEL AND GRETEL",
"THE MOUSE, THE BIRD, AND THE SAUSAGE",
"MOTHER HOLLE",
"LITTLE RED-CAP [LITTLE RED RIDING HOOD]",
"THE ROBBER BRIDEGROOM",
"TOM THUMB",
"RUMPELSTILTSKIN",
"CLEVER GRETEL",
"THE OLD MAN AND HIS GRANDSON",
"THE LITTLE PEASANT",
"FREDERICK AND CATHERINE",
"SWEETHEART ROLAND",
"SNOWDROP",
"THE PINK",
"CLEVER ELSIE",
"THE MISER IN THE BUSH",
"ASHPUTTEL",
"THE WHITE SNAKE",
"THE WOLF AND THE SEVEN LITTLE KIDS",
"THE QUEEN BEE",
"THE ELVES AND THE SHOEMAKER",
"THE JUNIPER-TREE",
"THE TURNIP",
"CLEVER HANS",
"THE THREE LANGUAGES",
"THE FOX AND THE CAT",
"THE FOUR CLEVER BROTHERS",
"LILY AND THE LION",
"THE FOX AND THE HORSE",
"THE BLUE LIGHT",
"THE RAVEN",
"THE GOLDEN GOOSE",
"THE WATER OF LIFE",
"THE TWELVE HUNTSMEN",
"THE KING OF THE GOLDEN MOUNTAIN",
"DOCTOR KNOWALL",
"THE SEVEN RAVENS",
"THE WEDDING OF MRS FOX",
"THE SALAD",
"THE STORY OF THE YOUTH WHO WENT FORTH TO LEARN WHAT FEAR WAS",
"KING GRISLY-BEARD",
"IRON HANS",
"CAT-SKIN",
"SNOW-WHITE AND ROSE-RED")
text_df <- tibble(line = 1:62, text = text)
fairyT <- gutenberg_download(c(2591))## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
資料前處理
fairyT_1<-fairyT%>%
mutate(linum=row_number())%>%
filter(linum>93)
til<-fairyT_1%>%
inner_join(text_df, by=c("text"="text"))
til2<-fairyT_1%>%
inner_join(text_df, by=c("text"="text"))%>%
select(text,linum,line)%>%
rbind(c("End",9172,63))
til2<-til2 %>%
convert(int(line))%>%
convert(int(linum))
til2$line=til2$line-1
til_re<-til%>%
inner_join(til2, by=c("line"="line"))%>%
select(gutenberg_id,text.x,linum.x,linum.y)
fairyText<-fairyT_1%>%
left_join(til_re,by='gutenberg_id')%>%
filter(linum.x <= linum,linum.y > linum)%>%
rename(title=text.x)%>%
select(title,text,linum)
fairyText## # A tibble: 9,078 x 3
## title text linum
## <chr> <chr> <int>
## 1 THE GOLDEN B… "THE GOLDEN BIRD" 94
## 2 THE GOLDEN B… "" 95
## 3 THE GOLDEN B… "A certain king had a beautiful garden, and in the garde… 96
## 4 THE GOLDEN B… "which bore golden apples. These apples were always coun… 97
## 5 THE GOLDEN B… "the time when they began to grow ripe it was found that… 98
## 6 THE GOLDEN B… "of them was gone. The king became very angry at this, a… 99
## 7 THE GOLDEN B… "gardener to keep watch all night under the tree. The ga… 100
## 8 THE GOLDEN B… "eldest son to watch; but about twelve o'clock he fell a… 101
## 9 THE GOLDEN B… "the morning another of the apples was missing. Then the… 102
## 10 THE GOLDEN B… "ordered to watch; and at midnight he too fell asleep, a… 103
## # … with 9,068 more rows
情緒分析:NRC字典,Afinn字典,Bing字典
tidy_fairyT <- fairyText %>%
unnest_tokens(word, text) %>%
anti_join(stop_words,by = "word")
fairyT_g<-tidy_fairyT %>%
group_by(title)%>%
count(word, sort = TRUE)%>%
ungroup()
fairyT_bing<-fairyT_g %>%
inner_join(get_sentiments("bing"),by="word")%>%
group_by(title)%>%
count(sentiment, sort = TRUE)%>%
ungroup()
fiaryBing<-left_join(fairyT_bing%>%filter(sentiment=='negative'),fairyT_bing%>%filter(sentiment=='positive'), by = c("title"="title"))%>%
mutate(colname = n.x-(ifelse(is.na(n.y),0,n.y)))%>%
mutate(n=ifelse(colname>0,'bing_negative','bing_positive'))%>%
mutate(t='bing')%>%
select(title,n,t)
fiaryAfinn<-fairyT_g %>%
inner_join(get_sentiments("afinn"),by="word")%>%
group_by(title)%>%
summarise(value = sum(value))%>%
#count(value, sort = TRUE)%>%
ungroup()%>%
mutate(n=ifelse(value>0,'afinn_positive','afinn_negative'))%>%
mutate(t='afinn')%>%
select(title,n,t)
fiaryT_nrc<-fairyT_g %>%
inner_join(get_sentiments("nrc"),by="word")%>%
#filter(sentiment %in% c('negative','positive'))
mutate(li=ifelse(sentiment %in% c('negative','anger','disgust','fear','sadness','anticipation'), "nrc_negative", "nrc_positive"))%>%
group_by(title,li)%>%
summarise(sn = sum(n))
fiaryNrc<-fiaryT_nrc%>%
filter(li=='nrc_positive')%>%
inner_join(fiaryT_nrc%>% filter(li=='nrc_negative'),by =c("title"="title"))%>%
mutate(n=ifelse((sn.y-sn.x)>0,'nrc_positive','nrc_negative'))%>%
mutate(t='nrc')%>%
select(title,n,t)
fiaryNrc = as.data.frame(fiaryNrc)
fiaryAll<-rbind(rbind(fiaryAfinn,fiaryBing),fiaryNrc)
g_fiaryAll <- graph_from_data_frame(d=fiaryAll, directed=F)
V(g_fiaryAll)$color <- ifelse(V(g_fiaryAll)$name %in% text_df$text, "orange", "#DFFFDF")
V(g_fiaryAll)$color <- ifelse(V(g_fiaryAll)$name %in% c('afinn_negative','afinn_positive'), "#FFF3EE", V(g_fiaryAll)$color)
V(g_fiaryAll)$color <- ifelse(V(g_fiaryAll)$name %in% c('nrc_negative','nrc_positive'), "#D1E9E9", V(g_fiaryAll)$color)
V(g_fiaryAll)$color <- ifelse(V(g_fiaryAll)$name %in% c('bing_negative','bing_positive'), "#D2E9FF", V(g_fiaryAll)$color)
V(g_fiaryAll)$size <- ifelse( V(g_fiaryAll)$name %in% text_df$text , 8, 10)
V(g_fiaryAll)$shape <- ifelse(V(g_fiaryAll)$name %in% text_df$text , "circle", "square")
V(g_fiaryAll)$label <- ifelse(V(g_fiaryAll)$name %in% text_df$text, "", V(g_fiaryAll)$name)
V(g_fiaryAll)$label.color<-"darkred"
#E(g_fiaryAll)$color<-V(g_fiaryAll)$color
E(g_fiaryAll)$width <- 1.5
V(g_fiaryAll)$frame.color <- "white"
E(g_fiaryAll)$arrow.mode <- 0
plot(g_fiaryAll,edge.curved=.3, layout=layout_components) - 利用Afinn字典,Bing字典分析,大部份的童話故事都偏負面。NRC字典分析,正負面比較平均。可以看的出格林童話未必適合小朋友去閱讀。
主題-文字分析-Beta值
- 分四個主題去分析
title_dtm_beta <- fairyT_g %>%
cast_dtm(title, word, n)
#title_dtm
title_lda_beta <- LDA(title_dtm_beta, k = 4, control = list(seed = 1234))
text_topics_beta <- tidy(title_lda_beta, matrix = "beta")
#text_topics %>% head(10)
top_terms_beta <- text_topics_beta %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
#arrange(topic, desc(beta) ) %>%
#group_by(topic) %>%
#mutate(id = row_number():n())
top_terms_beta %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
# scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()- 從文字上,我們還看不出每個一主題要表達的含意,我們再針對每個文字跟格林童話故事之間的分佈來探討
- 先分析四個主題文字的相依性會不會太重疊
top_terms_beta <- text_topics_beta %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
aa<-top_terms_beta%>%
select(topic,term)
gb <- graph_from_data_frame(d=aa, directed=F)
ceb <- cluster_fast_greedy(gb)
V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "square", "circle")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 15, 10)
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name),V(gb)$name, "" )
plot(ceb,gb)- 我們發現只有主題一比較獨立,其他主題比較接近。所以我們再針對“主題一”作分析
- 針對“主題一”的文字再作細部分析
link_b<-top_terms_beta%>%
inner_join(fairyT_g, by=c("term"="word"))%>%
inner_join(text_df, by=c("title"="text"))%>%
filter(topic=='1')%>%
select(term,line)
gb <- graph_from_data_frame(d=link_b, directed=F)
V(gb)$color <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "lightsteelblue", "white")
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "", V(gb)$name)
V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "circle", "square")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 6, 10)
V(gb)$label.color<-"darkred"
V(gb)$frame.color <- "white"
V(gb)$type <- bipartite.mapping(gb)$type
V(gb)$index<-c(1:70)## Warning in vattrs[[name]][index] <- value: number of items to replace is not a
## multiple of replacement length
V(gb)$label.dist=ifelse(V(gb)$index%%3==1 ,-1.5,ifelse(V(gb)$index%%3==2,0.5,1))
plot(gb, layout = layout_with_gem)- 從上圖分佈,某些字聚在一起,有father/mother/children/door/red/bird等字,表示會出現在固定某幾個童話故事,且會偏向跟家庭成員故事有關, 我們從father/mother/children/door/red/bird等字去找出幾個故事。
- 像是:
- 故事40,THE JUNIPER-TREE(杜松樹),講述繼母想殺死丈夫和前妻的小孩的故事
- 故事19,HANSEL AND GRETEL(糖果屋),繼母把小孩騙出去,然後小孩又回家的故事
LDAvis分析(分10主題)
https://jhnny009.github.io/studyG7/
dtf <- document_term_frequencies(tidy_fairyT, document = "title", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 15)
dim(dtm_clean)## [1] 62 449
set.seed(2000)
topic_n = 10
lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr =lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5,check_convergence_every_n = 100)## INFO [2020-05-26 17:55:49] iter 10 loglikelihood = -78760.166
## INFO [2020-05-26 17:55:49] iter 20 loglikelihood = -75580.781
## INFO [2020-05-26 17:55:49] iter 30 loglikelihood = -74196.359
## INFO [2020-05-26 17:55:50] iter 40 loglikelihood = -73580.261
## INFO [2020-05-26 17:55:50] iter 50 loglikelihood = -72910.127
## INFO [2020-05-26 17:55:50] iter 60 loglikelihood = -72714.888
## INFO [2020-05-26 17:55:50] iter 70 loglikelihood = -72408.381
## INFO [2020-05-26 17:55:50] iter 80 loglikelihood = -72001.572
## INFO [2020-05-26 17:55:50] iter 90 loglikelihood = -71833.535
## INFO [2020-05-26 17:55:50] iter 100 loglikelihood = -71658.989
## INFO [2020-05-26 17:55:50] iter 110 loglikelihood = -71505.562
## INFO [2020-05-26 17:55:50] iter 120 loglikelihood = -71210.481
## INFO [2020-05-26 17:55:50] iter 130 loglikelihood = -71036.011
## INFO [2020-05-26 17:55:50] iter 140 loglikelihood = -70898.534
## INFO [2020-05-26 17:55:50] iter 150 loglikelihood = -70583.725
## INFO [2020-05-26 17:55:50] iter 160 loglikelihood = -70674.695
## INFO [2020-05-26 17:55:50] early stopping at 160 iteration
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "bird" "prince" "queen" "peasant" "princess" "red" "woman"
## [2,] "tree" "dwarf" "beautiful" "cow" "tailor" "girl" "fire"
## [3,] "son" "told" "kingdom" "alas" "master" "white" "hansel"
## [4,] "brother" "home" "glass" "blow" "met" "rose" "children"
## [5,] "forest" "till" "found" "thy" "brothers" "answered" "elsie"
## [6,] "mother" "twelve" "lady" "word" "rode" "soldier" "voice"
## [7,] "beard" "live" "snowdrop" "horse" "eldest" "child" "morning"
## [8,] "length" "fish" "blood" "goose" "giant" "ran" "draw"
## [9,] "cook" "wife" "tom" "maiden" "drink" "bear" "rapunzel"
## [10,] "sing" "thee" "goodbye" "country" "dog" "bed" "fetch"
## [,8] [,9] [,10]
## [1,] "king" "hans" "cat"
## [2,] "king's" "gretel" "fox"
## [3,] "golden" "wolf" "mouse"
## [4,] "castle" "cap" "servant"
## [5,] "father" "mother" "eat"
## [6,] "boy" "grandmother" "chanticleer"
## [7,] "daughter" "sadly" "skin"
## [8,] "youth" "apples" "cried"
## [9,] "hair" "frog" "sparrow"
## [10,] "heard" "love" "house"
針對LDAvis的10個主題,再篩選,只留動物類跟角色名詞(像是動詞/形容詞很常在很多故事出現,反而難分析)
## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.
#tidy(lda_text_t)
LDAvis10<-lda_text_t%>%
gather(m,n,V1:V10)%>%
inner_join(fairyT_g, by=c("n"="word"))%>%
inner_join(text_df, by=c("title"="text"))%>%
select(m,n,line)
remove_word=c('happened','red','cap','youth','door','ran','cut','fire','eyes','life','shudder','till','thou','goodbye','cellar','night','leave','forest',
'home','cook','found','tailor','king\'s','met','true','shepherd','left','heard','answered','table','wine','learn','castle','rest','twelve',
'thee','cart','crept','miser','carried','dead','black','wood',"walk","tail","hair","water","fine","kitchen","heart","snow","wild","snowdrop",
"blood")
colnames(LDAvis10)[colnames(LDAvis10) == 'm'] <- 'topic'
colnames(LDAvis10)[colnames(LDAvis10) == 'n'] <- 'term'
LDAvis10<-LDAvis10%>%
filter(!term %in% remove_word)
all10<-LDAvis10%>%
# filter(topic=='V4')%>%
select(topic,line)
gb <- graph_from_data_frame(d=all10, directed=F)
V(gb)$color <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "lightsteelblue", "white")
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "", V(gb)$name)
V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "circle", "square")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 6, 10)
V(gb)$label.color<-"darkred"
V(gb)$frame.color <- "white"
V(gb)$type <- bipartite.mapping(gb)$type
plot(gb, layout = layout_components)## # A tibble: 90 x 3
## topic line text
## <chr> <int> <chr>
## 1 V6 23 THE ROBBER BRIDEGROOM
## 2 V6 21 MOTHER HOLLE
## 3 V6 30 SWEETHEART ROLAND
## 4 V6 14 THE GOOSE-GIRL
## 5 V6 35 ASHPUTTEL
## 6 V6 55 THE SEVEN RAVENS
## 7 V6 31 SNOWDROP
## 8 V6 33 CLEVER ELSIE
## 9 V6 19 HANSEL AND GRETEL
## 10 V6 22 LITTLE RED-CAP [LITTLE RED RIDING HOOD]
## # … with 80 more rows
- 從上圖可以發現,大部份主題都蠻接近的。可能大部份故事用的角色名稱跟動物都是一樣的,像是有些故事都有鳥/魚/國王/女王。
- 唯獨“主題6”,“主題9”有比較聚焦的故事。
- 主題6-主要的因為文字 HANS/Gretel/Hansel都是某些故事角色姓名,所以只會連結到相應的故事。
- 主題9-找到都是跟女王/皇后有關係的故事。像是故事31.SNOWDROP(白雪公主)有壞皇后。