資料介紹

Data Source: 古騰堡電子書–格林童話
共 62 篇故事

文本探索

載入套件

library(data.table)
library(dplyr)
library(ggraph)
library(ggplot2)
library(gutenbergr)
library(hablar)
library(htmlwidgets)
library(janeaustenr)
library(igraph)
library(jiebaR)
library(LDAvis)
library(ramify)
library(purrr)
library(RColorBrewer)
library(readr)
library(reshape2)
library(servr)
library(stringr)
library(text2vec)
library(textdata)
library(textshape)
library(tidyr)
library(tidytext)
library(tidyverse)
library(tm)
library(topicmodels)
library(udpipe)
library(webshot)
library(widyr)
library(wordcloud)
library(wordcloud2)

讀取資料

文字前處理

查看斷詞結果

text_tidy = tibble(text) %>%
  unnest_tokens(word, text) %>%
  # 標註第幾篇故事
  mutate(linenumber = row_number(), story = cumsum(str_detect(word, regex("^story([1-9]|[1-6][0-9])$", ignore_case = T))))  

text_tidy %>% 
  head(10)

## # A tibble: 10 x 3
##    word      linenumber story
##    <chr>          <int> <int>
##  1 story1             1     1
##  2 the                2     1
##  3 golden             3     1
##  4 bird               4     1
##  5 a                  5     1
##  6 certain            6     1
##  7 king               7     1
##  8 had                8     1
##  9 a                  9     1
## 10 beautiful         10     1

排除停用字

# 使用內建字典
data(stop_words)
text_tidy = text_tidy %>% 
  anti_join(stop_words)

## Joining, by = "word"

統計字頻

text_tidy %>%
  count(word, sort = T) %>%
  filter(n > 100) %>%
  # 使 x 軸依 y 軸數值大小排序
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n)) +
  geom_col(fill = "#00bfc4") +
  coord_flip()

文字雲分析

總體文字雲

text_tidy %>% 
  group_by(word) %>% 
  summarise(count = n()) %>% 
  filter(count > 50) %>% 
  arrange(desc(count)) %>% 
  wordcloud2( size = 0.7, shape = 'circle')

區分正、負面情緒的文字雲 (採用 Bing 字典)

text_tidy  %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

## Joining, by = "word"

正面詞彙字頻較大
形容詞較多

情緒分析

四個情緒字典介紹

NRC (10 Category)
34% (+): anticipation, joy, positive, surprise, trust
66% (-): anger, disgust, fear, negative, sadness
Afinn (Numeric: -5~+5, mean: -0.6)
Bing (2 Category: 30% Positive, 70% Negative)
Loughran (6 Category)
(+): positive
(-): constraining(被迫/限制), litigious(好爭論的), negative, superfluous(多餘的), uncertainty

NRC字典

準備資料

nrc <- get_sentiments("nrc")

grimmlist <- text_tidy %>%
  filter(nchar(.$word)>1) %>%
  mutate(rown = row_number())

grimmnrc <- grimmlist %>%
  inner_join(nrc, by = "word")

NRC 最常出現的負面情緒字

tt_neg <- filter(grimmnrc, sentiment == 'negative'| sentiment== 'anger'| sentiment== 'disgust'| sentiment == 'fear'| sentiment == 'sadness')

tt_neg_dis <- distinct(tt_neg, word, rown, story)

neg_re <- tt_neg_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()

neg_re %>% 
  top_n(10, wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label = n))+
  theme(text=element_text(size = 14))+
  coord_flip()

NRC 最常出現的正面情緒字

tt_pos <- filter(grimmnrc,sentiment == 'anticipation'| sentiment== 'joy'| sentiment== 'surprise'| sentiment == 'trust'| sentiment == 'positive')

tt_pos_dis <- distinct(tt_pos, word, rown, story)

pos_re <- tt_pos_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()

pos_re %>% 
  top_n(10, wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label = n))+
  theme(text=element_text(size = 14))+
  coord_flip()

找出 “king” 最常出現的前三大故事

pos_king <- tt_pos_dis %>%
  filter(word == 'king') %>%
  group_by(story) %>%
  count(word, sort = TRUE) %>%
  ungroup()

pos_king %>% top_n(3)

## Selecting by n

## # A tibble: 3 x 3
##   story word      n
##   <int> <chr> <int>
## 1    61 king     31
## 2    60 king     24
## 3    59 king     19

分別為以下三個故事：
* 故事 61 “CAT-SKIN”
* 故事 60 “IRON HANS”
* 故事 59 “KING GRISLY-BEARD”

Afinn 字典

準備資料

afinn = get_sentiments("afinn")

storyafinn <- text_tidy %>% 
  inner_join(afinn) %>%
  group_by(story) %>% 
  # 以每個故事為單位將情緒值加總
  summarise(sentiment = sum(value))

## Joining, by = "word"

每個故事情緒加總變化趨勢

storyafinn %>%
  ggplot(aes(story, sentiment, fill = "#F7766D")) +
  geom_col(show.legend = FALSE) +
  scale_x_continuous(breaks = seq(1, 62, 5))

故事 8 “THE DOG AND THE SPARROW” 最負面
故事 2 “HANS IN LUCK” 最正面

Afinn 最常出現的負面情緒字

tt_neg <- text_tidy %>% 
  inner_join(afinn) %>% 
  filter(value < 0)

## Joining, by = "word"

tt_neg_dis <- distinct(tt_neg, word, rown, story)

## Warning: Trying to compute distinct() for variables not found in the data:
## - `rown`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - word
## - story

neg_re <- tt_neg_dis %>%
  group_by(word) %>%
  count(word, sort = TRUE) %>%
  ungroup()

neg_re %>% 
  top_n(10, wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label = n))+
  theme(text=element_text(size = 14))+
  coord_flip()

Afinn 最常出現的正面情緒字

tt_pos <- text_tidy %>% 
  inner_join(afinn) %>%
  filter(value > 0)

## Joining, by = "word"

tt_pos_dis <- distinct(tt_pos, word, rown, story)

## Warning: Trying to compute distinct() for variables not found in the data:
## - `rown`
## This is an error, but only a warning is raised for compatibility reasons.
## The following variables will be used:
## - word
## - story

pos_re <- tt_pos_dis %>%
  group_by(word) %>%
  count(word, sort = TRUE) %>%
  ungroup()

pos_re %>% 
  top_n(10, wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label = n))+
  theme(text=element_text(size = 14))+
  coord_flip()

Bing 字典

準備資料

bing = get_sentiments("bing")

grimm_bing <- text_tidy %>%
  filter(nchar(.$word) > 1) %>%
  mutate(nrow = row_number())%>%
  inner_join(bing, by = "word")

Bing 最常出現的正/負面情緒字

grimm_bing_all <- grimm_bing %>%
  group_by(sentiment) %>%
  count(word, sort = TRUE) %>%
  ungroup()

grimm_bing_all %>%
  group_by(sentiment) %>%
  top_n(10, wt = n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

Loughran 字典

Loughran 中各情緒類型包含多少字

library(dplyr)
loughran = get_sentiments("loughran")
loughran %>%
  count(sentiment, sort = T)

Loughran 最常出現的正/負面情緒字

loughran_wc <- get_sentiments("loughran") %>%
  filter(sentiment == "negative" | sentiment == "positive") %>%
  inner_join(text_tidy) %>%
  count(word, sentiment, sort = T)

loughran_wc %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

四字典各故事比較分析

正/負面情緒詞總數差異

準備資料

grimm_cnt = text_tidy %>% 
  group_by(story, word) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count))

每個故事情緒詞總數比較

grimm_cnt %>% 
  inner_join(afinn) %>% 
  group_by(story, value) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_boxplot(aes(x=story, y=value, colour=as.factor(story)),show.legend = FALSE) +
  ggtitle("Afinn-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,62,5)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5))

grimm_cnt %>% 
  inner_join(nrc) %>% 
  group_by(story, sentiment) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_line(aes(x=story, y=cnt, colour=sentiment)) +
  ggtitle("NRC-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,62,5)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5))

grimm_cnt %>% 
  inner_join(bing) %>% 
  group_by(story, sentiment) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_line(aes(x=story, y=cnt, colour=sentiment)) +
  ggtitle("Bing-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,62,5)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5))

#grimm_cnt %>% 
#  inner_join(loughran) %>% 
#  group_by(story, sentiment) %>% 
#  summarise(cnt = sum(count)) %>% ggplot() +
#  geom_line(aes(x=story, y=cnt, colour=sentiment)) +
#  ggtitle("Loughran-每集情緒詞數量差異") + 
#  scale_x_continuous(breaks=seq(1,62,5)) +
#  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
#      plot.title=element_text(hjust = 0.5))

正面-負面情緒差異

準備資料

# 整理字典 "bing" 正負面情緒各級出現次數與差值
bing_neg_pos <-  text_tidy  %>%
  dplyr::select(word,story) %>%
  inner_join(get_sentiments("bing"))  %>% 
  group_by(story,sentiment) %>%
  summarise(count=sum(story))

## Joining, by = "word"

bing_trend<-bing_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "bing")

# 整理字典 "nrc" 正負面情緒各級出現次數與差值
nrc_count_pos<-tt_pos_dis%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "positive")
nrc_count_neg<-tt_neg_dis%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "negative")
nrc_neg_pos<-bind_rows(nrc_count_pos,nrc_count_neg)%>%arrange(story)%>%mutate(method='nrc') 

nrc_trend<-nrc_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative)

# 整理字典 "afinn" 正負面情緒各級出現次數與差值
afinn_pos <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value>0)%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "positive")

## Joining, by = "word"

afinn_neg <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value<0)%>%group_by(story)%>%summarise(count=n())%>%mutate(sentiment = "negative")

## Joining, by = "word"

afinn_neg_pos<-bind_rows(afinn_pos,afinn_neg)%>%arrange(story)%>%mutate(method='nrc') 

afinn_trend<-afinn_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "AFINN")

# 整理字典 "loughran" 正負面情緒各級出現次數與差值
loughran_data <-  text_tidy  %>%
  dplyr::select(word,story) %>%
  inner_join(get_sentiments("loughran"))  %>% 
  group_by(story,sentiment) %>%
  summarise(count=sum(story))

## Joining, by = "word"

loughran_neg <- subset( loughran_data, sentiment == "negative")
loughran_pos <- subset( loughran_data, sentiment == "positive")

#loughran_neg_pos<-bind_rows(loughran_neg,loughran_pos)%>%arrange(story)%>%mutate(method='nrc') 

#loughran_trend<-loughran_neg_pos%>%group_by(story)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "loughran")

合併四個字典並劃出折線圖

#bind_rows(bing_trend, nrc_trend, afinn_trend, loughran_trend) %>%
bind_rows(bing_trend, nrc_trend, afinn_trend) %>%
  ggplot(aes(x = story, y = Deviation, col = method)) +
  geom_line() + facet_wrap(~ method, ncol = 1, scales = "fixed")

情緒分析:主要針對NRC字典,Afinn字典,Bing字典,loughran字典

tidy_fairyT <- fairyText %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words,by = "word")

#sum(fairyT_g$n)

fairyT_g<-tidy_fairyT %>%
  group_by(title)%>%
  count(word, sort = TRUE)%>%
  ungroup()



fairyT_bing<-fairyT_g %>%
  inner_join(get_sentiments("bing"),by="word")%>%
  group_by(title)%>%
  count(sentiment, sort = TRUE)%>%
  ungroup()



fiaryBing<-left_join(fairyT_bing%>%filter(sentiment=='negative'),fairyT_bing%>%filter(sentiment=='positive'), by = c("title"="title"))%>%
  mutate(colname = n.x-(ifelse(is.na(n.y),0,n.y)))%>%
  mutate(n=ifelse(colname>0,'bing_negative','bing_positive'))%>%
  mutate(t='bing')%>%
  select(title,n,t)


fiaryAfinn<-fairyT_g %>%
  inner_join(get_sentiments("afinn"),by="word")%>%
  group_by(title)%>%
  summarise(value = sum(value))%>%
  #count(value, sort = TRUE)%>%
  ungroup()%>%
  mutate(n=ifelse(value>0,'afinn_positive','afinn_negative'))%>%
  mutate(t='afinn')%>%
  select(title,n,t)


fiaryT_nrc<-fairyT_g %>%
  inner_join(get_sentiments("nrc"),by="word")%>%
  #filter(sentiment %in% c('negative','positive'))
  mutate(li=ifelse(sentiment %in% c('negative','anger','disgust','fear','sadness','anticipation'), "nrc_negative", "nrc_positive"))%>%
  group_by(title,li)%>%
  summarise(sn = sum(n))
  

fiaryNrc<-fiaryT_nrc%>%
  filter(li=='nrc_positive')%>%
  inner_join(fiaryT_nrc%>% filter(li=='nrc_negative'),by =c("title"="title"))%>%
  mutate(n=ifelse((sn.y-sn.x)>0,'nrc_positive','nrc_negative'))%>%
  mutate(t='nrc')%>%
  select(title,n,t)

fiaryNrc = as.data.frame(fiaryNrc)
 


fairyT_loughran<-fairyT_g %>%
  inner_join(get_sentiments("loughran"),by="word")%>%
  filter(sentiment %in% c('negative','positive'))%>%
  group_by(title)%>%
  count(sentiment, sort = TRUE)%>%
  ungroup()

fiaryloughran<-left_join(fairyT_loughran%>%filter(sentiment=='negative'),fairyT_loughran%>%filter(sentiment=='positive'), by = c("title"="title"))%>%
  mutate(colname = n.x-(ifelse(is.na(n.y),0,n.y)))%>%
  mutate(n=ifelse(colname>0,'loughran_negative','loughran_positive'))%>%
  mutate(t='loughran')%>%
  select(title,n,t)



fiaryAll<-rbind(rbind(rbind(fiaryAfinn,fiaryBing),fiaryNrc),fiaryloughran)


a1<-fiaryAll%>%
  group_by(n)%>%
  count()%>%
  separate(n,c("f","t"))



ggplot(data=a1, aes(x=f, y=nn, fill=t)) +
  geom_bar(stat="identity", width=0.2)+
  scale_fill_brewer(palette="Paired")+
  theme_minimal()+ 
  labs(x = "Contribution to sentiment",
       y = "story number")+
theme(axis.text.x = element_text(angle = 0, hjust = 1))

利用Afinn字典,loughran字典,Bing字典分析，大部份的童話故事都偏負面。NRC字典分析，正負面比較平均。可以看的出格林童話未必適合小朋友去閱讀。

TF-IDF 分析

計算 term frequency

book_words <- text_tidy %>%
  count(word, story, sort = T)

total_words <- book_words %>% 
  group_by(story) %>% 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)

## Joining, by = "story"

freq_by_rank <- book_words %>% 
  group_by(story) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank

## # A tibble: 15,718 x 6
## # Groups:   story [62]
##    word     story     n total  rank `term frequency`
##    <chr>    <int> <int> <int> <int>            <dbl>
##  1 hans        42    71   343     1           0.207 
##  2 tailor      18    57   991     1           0.0575
##  3 hansel      19    45   905     1           0.0497
##  4 gretel      42    44   343     2           0.128 
##  5 peasant     28    42   589     1           0.0713
##  6 bird        40    39   987     1           0.0395
##  7 wife        10    37   592     1           0.0625
##  8 gretel      19    36   905     2           0.0398
##  9 shudder     58    34  1024     1           0.0332
## 10 snowdrop    31    33   701     1           0.0471
## # … with 15,708 more rows

計算 TF-IDF

book_words <- book_words %>%
  bind_tf_idf(word, story, n)

book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))

## # A tibble: 15,718 x 6
##    word       story     n     tf   idf tf_idf
##    <chr>      <int> <int>  <dbl> <dbl>  <dbl>
##  1 hans          42    71 0.207   2.74  0.567
##  2 gretel        42    44 0.128   3.03  0.388
##  3 doctor        54    18 0.0807  4.13  0.333
##  4 elsie         33    27 0.0714  4.13  0.295
##  5 sparrow        8    28 0.0650  4.13  0.268
##  6 fox           56    28 0.115   2.18  0.251
##  7 sultan         5    15 0.0584  4.13  0.241
##  8 rapunzel      16    24 0.0562  4.13  0.232
##  9 fundevogel    17    13 0.0544  4.13  0.224
## 10 peasant       28    42 0.0713  3.03  0.216
## # … with 15,708 more rows

book_words = anti_join(book_words, stop_words, by = "word")

同一行裡最常一起出現的詞彙

text_tidy_cors <- text_tidy %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, linenumber, sort = TRUE)

set.seed(2016)
text_tidy_cors %>%
  filter(abs(correlation) > .01) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

故事背景圍繞在皇室貴族、平民家庭、等不同接階級的故事

LDA 分析

將資料轉成 DocumentTermMatrix (DTM) 格式

word_counts <- text_tidy %>%
  count(story, word, sort = TRUE) %>%
  ungroup()

story_dtm <- word_counts %>%
  cast_dtm(story, word, n)

story_dtm

## <<DocumentTermMatrix (documents: 62, terms: 4397)>>
## Non-/sparse entries: 15718/256896
## Sparsity           : 94%
## Maximal term length: 15
## Weighting          : term frequency (tf)

主題-文字分析

4 個主題 Beta 值

# 建立一個 k 個 topic 的 LDA model
story_lda <- LDA(story_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(story_lda, matrix = "beta")
text_topics %>% head(10)

## # A tibble: 10 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 hans   3.27e-  3
##  2     2 hans   9.92e-  3
##  3     3 hans   3.60e-  3
##  4     4 hans   2.76e-164
##  5     1 tailor 1.31e-  4
##  6     2 tailor 2.25e-118
##  7     3 tailor 8.17e-  3
##  8     4 tailor 8.22e-  4
##  9     1 hansel 1.74e-171
## 10     2 hansel 6.28e-  3

選出各主題前 5 高的文字

top_terms <- text_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>% head(10)

## # A tibble: 10 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 king     0.0144 
##  2     1 fox      0.00982
##  3     1 father   0.00865
##  4     1 princess 0.00852
##  5     1 till     0.00691
##  6     2 gretel   0.0133 
##  7     2 wife     0.0111 
##  8     2 hans     0.00992
##  9     2 day      0.00868
## 10     2 mother   0.00862

story_lda <- LDA(story_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(story_lda, matrix = "beta")

remove_word = c("day","time","boy","hans","son","father","mother","women","wife","door","set","gretel","hans","hansel","king","elsie","beautiful","woman")

top_terms <- text_topics %>%
  filter(!term  %in% remove_word)%>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
  
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

第一個主題由於是鄉野傳說常以野獸精怪作為主體去敘述故事
第二個則是因為故事的搜集並非都通過沒有受過教育的農民，其中的大部分倒是經由受過良好教育的中產階級女性的講述,所以故事多會跟家庭相關
第三個則是跟當時的產業結構有所相關
第四個是因為當時的歷史背景仍是19世紀,所以當時蒐集的故事難免都會與貴族王室有所關聯

主題-文字分析-Beta值(seed=2000)

分四個主題去分析

#sum(fairyT_g$n)


title_dtm_beta <- fairyT_g %>%
  cast_dtm(title, word, n)

#title_dtm
title_lda_beta <- LDA(title_dtm_beta, k = 4, control = list(seed = 2000))

text_topics_beta <- tidy(title_lda_beta, matrix = "beta")
#text_topics %>% head(10)

top_terms_beta <- text_topics_beta  %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
  #arrange(topic, desc(beta) ) %>% 
  #group_by(topic) %>% 
  #mutate(id = row_number():n())



top_terms_beta %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
#  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

- 從文字上，我們還看不出每個一主題要表達的含意，我們再針對每個文字跟格林童話故事之間的分佈來探討

先分析四個主題文字的相依性會不會太重疊

top_terms_beta <- text_topics_beta  %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

aa<-top_terms_beta%>%
  select(topic,term)



gb <- graph_from_data_frame(d=aa, directed=F)
ceb <- cluster_fast_greedy(gb) 

V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "square", "circle")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 15, 10)
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name),V(gb)$name, ""  )

plot(ceb,gb)

我們發現只有主題一比較獨立，其他主題比較接近。所以我們再針對“主題一”作分析
針對“主題一”的文字再作細部分析

link_b<-top_terms_beta%>%
  inner_join(fairyT_g, by=c("term"="word"))%>%
  inner_join(story_title, by=c("title"="text"))%>%
  filter(topic=='1')%>%
  select(term,line)


gb <- graph_from_data_frame(d=link_b, directed=F)


V(gb)$color <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "lightsteelblue", "white")
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "",  V(gb)$name)
V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "circle", "square")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 6, 10)
V(gb)$label.color<-"darkred"
V(gb)$frame.color <- "white"

V(gb)$type <- bipartite.mapping(gb)$type
V(gb)$index<-c(1:70)

## Warning in vattrs[[name]][index] <- value: 被替換的項目不是替換值長度的倍數

V(gb)$label.dist=ifelse(V(gb)$index%%3==1 ,-1.5,ifelse(V(gb)$index%%3==2,0.5,1)) 


plot(gb, layout = layout_with_gem)

+ 從上圖分佈，某些字聚在一起，有father/mother/children/door/red/bird等字，表示會出現在固定某幾個童話故事，且會偏向跟家庭成員故事有關，我們從father/mother/children/door/red/bird等字去找出幾個故事。

像是:
故事40，THE JUNIPER-TREE(杜松樹)，講述繼母想殺死丈夫和前妻的小孩的故事
故事19，HANSEL AND GRETEL(糖果屋)，繼母把小孩騙出去，然後小孩又回家的故事

LDAvis分析(分10主題)

https://jhnny009.github.io/studyG7/

dtf <- document_term_frequencies(tidy_fairyT, document = "title", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 15)
dim(dtm_clean)

## [1]  62 449

set.seed(2000)
topic_n = 10

lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr =lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5,check_convergence_every_n = 100)

## INFO  [08:49:10.234] early stopping at 130 iteration 
## INFO  [08:49:10.322] early stopping at 30 iteration

lda_text<-lda_model$get_top_words(n = 10, lambda = 0.5)
lda_text

##       [,1]       [,2]          [,3]       [,4]       [,5]       [,6]     
##  [1,] "princess" "red"         "door"     "king"     "prince"   "hans"   
##  [2,] "horse"    "cap"         "ran"      "king's"   "dwarf"    "gretel" 
##  [3,] "bride"    "youth"       "cut"      "son"      "life"     "hansel" 
##  [4,] "happened" "cow"         "fire"     "soldier"  "till"     "goodbye"
##  [5,] "girl"     "fish"        "eyes"     "shudder"  "thou"     "cellar" 
##  [6,] "huntsman" "fisherman"   "left"     "wine"     "rest"     "cart"   
##  [7,] "met"      "wife"        "child"    "daughter" "twelve"   "cock"   
##  [8,] "father"   "grandmother" "heard"    "learn"    "brother"  "crept"  
##  [9,] "court"    "shepherd"    "answered" "rapunzel" "thee"     "doctor" 
## [10,] "true"     "flower"      "table"    "castle"   "brothers" "miser"  
##       [,7]          [,8]     [,9]      [,10]     
##  [1,] "children"    "cat"    "queen"   "boy"     
##  [2,] "night"       "fox"    "home"    "tailor"  
##  [3,] "rose"        "wolf"   "cook"    "mother"  
##  [4,] "leave"       "mouse"  "peasant" "tree"    
##  [5,] "forest"      "master" "found"   "father"  
##  [6,] "carried"     "wood"   "water"   "snow"    
##  [7,] "bear"        "walk"   "fine"    "bird"    
##  [8,] "dead"        "bird"   "lady"    "wild"    
##  [9,] "black"       "tail"   "kitchen" "snowdrop"
## [10,] "chanticleer" "hair"   "heart"   "blood"

#lda_model$plot() # 畫圖
#lda_model$plot(out.dir ="test2", open.browser = TRUE) # 存起來

針對LDAvis的10個主題，再篩選，只留動物類跟角色名詞(像是動詞/形容詞很常在很多故事出現，反而難分析)

lda_text_t<-lda_text%>%as_tibble

## Warning: The `x` argument of `as_tibble.matrix()` must have column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

#tidy(lda_text_t)

LDAvis10<-lda_text_t%>%
  gather(m,n,V1:V10)%>%
  inner_join(fairyT_g, by=c("n"="word"))%>%
  inner_join(story_title, by=c("title"="text"))%>%
  select(m,n,line)


remove_word=c('happened','red','cap','youth','door','ran','cut','fire','eyes','life','shudder','till','thou','goodbye','cellar','night','leave','forest',
              'home','cook','found','tailor','king\'s','met','true','shepherd','left','heard','answered','table','wine','learn','castle','rest','twelve',
              'thee','cart','crept','miser','carried','dead','black','wood',"walk","tail","hair","water","fine","kitchen","heart","snow","wild","snowdrop",
              "blood")


colnames(LDAvis10)[colnames(LDAvis10) == 'm'] <- 'topic'
colnames(LDAvis10)[colnames(LDAvis10) == 'n'] <- 'term'

LDAvis10<-LDAvis10%>%
  filter(!term  %in% remove_word)


all10<-LDAvis10%>%
 # filter(topic=='V4')%>%   
  select(topic,line)


gb <- graph_from_data_frame(d=all10, directed=F)

V(gb)$color <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "lightsteelblue", "white")
V(gb)$label <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name), "",  V(gb)$name)
V(gb)$shape <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , "circle", "square")
V(gb)$size <- ifelse(grepl("^[0-9]{1,}$", V(gb)$name) , 6, 10)
V(gb)$label.color<-"darkred"
V(gb)$frame.color <- "white"

V(gb)$type <- bipartite.mapping(gb)$type

plot(gb, layout = layout_components)

all10%>%
  filter(topic=='V6'|topic=='V9')%>%
  unique()%>%
  inner_join(story_title, by="line")

## # A tibble: 32 x 3
##    topic  line text                    
##    <chr> <int> <chr>                   
##  1 V6       42 CLEVER HANS             
##  2 V6        2 HANS IN LUCK            
##  3 V6       33 CLEVER ELSIE            
##  4 V6       60 IRON HANS               
##  5 V6       19 HANSEL AND GRETEL       
##  6 V6       26 CLEVER GRETEL           
##  7 V6        4 THE TRAVELLING MUSICIANS
##  8 V6       21 MOTHER HOLLE            
##  9 V6       54 DOCTOR KNOWALL          
## 10 V6       48 THE BLUE LIGHT          
## # … with 22 more rows

從上圖可以發現，大部份主題都蠻接近的。可能大部份故事用的角色名稱跟動物都是一樣的，像是有些故事都有鳥/魚/國王/女王。
唯獨“主題6”,“主題9”有比較聚焦的故事。
主題6-主要的因為文字 HANS/Gretel/Hansel都是某些故事角色姓名，所以只會連結到相應的故事。
主題9-找到都是跟女王/皇后有關係的故事。像是故事31.SNOWDROP(白雪公主)有壞皇后。

文件-主題分析

分成 4 個 topic 下，找出各篇故事適合分類到哪個 topic
gamma 為 per-document-per-topic probabilities

計算每個主題的故事數量

story_gamma <- tidy(story_lda, matrix = "gamma")
story_gamma2 <- story_gamma %>%
  group_by(document) %>% 
  filter(gamma == (max(gamma)))

table(story_gamma2$topic)

## 
##  1  2  3  4 
## 16 16 14 16

各主題的故事數量較平均

畫出各篇故事屬於各個 topic 的機率

for(i in c(1:4)){
  picture <- story_gamma %>%
    filter(document %in% 
             c(story_title %>% 
                 filter(line > 16*(i-1) & line <= 16*i) %>%
                 select(line))$line) %>%
    mutate(title = reorder(document, gamma * topic)) %>%
    ggplot(aes(factor(topic), gamma)) +
    geom_boxplot() +
    facet_wrap(~ title)
  
  print(picture)
}

大部分的故事都有一個明確的主題，例如故事 1, 2 等
少數主題並無明確主題，例如故事 7, 5

網路分析

建立網路關係

link <- story_gamma %>%
  # 只篩選 gamma 值大於 0.25
  filter(gamma > 0.25) %>%
  # 把 gamma 值當成 link 的權重
  rename(weight = gamma)

# 把 topic 欄位的 1 取代成 topic 1，依此類推，避免與 document 的 1 混淆 
link$topic <- link$topic %>%
  gsub(1, "topic 1", .) %>%
  gsub(2, "topic 2", .) %>%
  gsub(3, "topic 3", .) %>%
  gsub(4, "topic 4", .)

# 建立無向圖
TopicNetwork <- graph_from_data_frame(d = link, directed = F)
TopicNetwork

## IGRAPH 7e35733 UNW- 66 64 -- 
## + attr: name (v/c), weight (e/n)
## + edges from 7e35733 (vertex names):
##  [1] 61--topic 1 56--topic 1 24--topic 1 1 --topic 1 2 --topic 1 51--topic 1
##  [7] 12--topic 1 37--topic 1 52--topic 1 53--topic 1 5 --topic 1 11--topic 1
## [13] 46--topic 1 57--topic 1 7 --topic 1 47--topic 1 44--topic 1 42--topic 2
## [19] 19--topic 2 40--topic 2 10--topic 2 15--topic 2 8 --topic 2 29--topic 2
## [25] 16--topic 2 13--topic 2 4 --topic 2 20--topic 2 21--topic 2 26--topic 2
## [31] 3 --topic 2 7 --topic 2 43--topic 2 18--topic 3 28--topic 3 58--topic 3
## [37] 60--topic 3 33--topic 3 50--topic 3 35--topic 3 49--topic 3 54--topic 3
## [43] 17--topic 3 23--topic 3 6 --topic 3 41--topic 3 27--topic 3 31--topic 4
## + ... omitted several edges

U : undirected graph 無向圖
N : 節點包含 name 屬性
W : 連結包含 weight 屬性

畫出網路圖

set.seed(1)
plot(TopicNetwork)

# 調整點線大小且不顯示節點名稱
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5, vertex.label = NA)

# 顯示有超過 5 個關聯的節點名稱
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5,
     vertex.label = ifelse(degree(TopicNetwork) > 5, V(TopicNetwork)$name, NA), vertex.label.font = 15)

可看出主題 3 較獨立於其他三個主題

網路屬性

節點屬性

vertex_attr(TopicNetwork)

## $name
##  [1] "61"      "56"      "24"      "1"       "2"       "51"      "12"     
##  [8] "37"      "52"      "53"      "5"       "11"      "46"      "57"     
## [15] "7"       "47"      "44"      "42"      "19"      "40"      "10"     
## [22] "15"      "8"       "29"      "16"      "13"      "4"       "20"     
## [29] "21"      "26"      "3"       "43"      "18"      "28"      "58"     
## [36] "60"      "33"      "50"      "35"      "49"      "54"      "17"     
## [43] "23"      "6"       "41"      "27"      "31"      "22"      "62"     
## [50] "48"      "9"       "59"      "14"      "34"      "32"      "30"     
## [57] "36"      "25"      "38"      "45"      "55"      "39"      "topic 1"
## [64] "topic 2" "topic 3" "topic 4"

節點包含各篇故事 (1 ~ 62) 和各主題 (topic1, topic2, topic3, topic4)

設定連結屬性

# 設定連結的 type 為主題分類
E(TopicNetwork)$type <- link$topic
# 設定 weight
E(TopicNetwork)$weight <- link$weight

edge_attr(TopicNetwork)

## $weight
##  [1] 0.9998963 0.9997504 0.9999133 0.9999143 0.9999096 0.9999217 0.9998272
##  [8] 0.9998145 0.9998049 0.9999051 0.5826817 0.9998011 0.9764535 0.9998945
## [15] 0.4776732 0.9995984 0.9993411 0.9998231 0.9999330 0.9999385 0.9632137
## [22] 0.9998963 0.9998592 0.9998806 0.9998579 0.9997841 0.9998456 0.9997408
## [29] 0.9998391 0.9997728 0.9998343 0.5222429 0.8036270 0.9999388 0.9998970
## [36] 0.9999408 0.9999313 0.9998395 0.9998596 0.9999192 0.9999071 0.9997280
## [43] 0.9997462 0.9998621 0.9996210 0.9778185 0.9991582 0.9999135 0.9998624
## [50] 0.9999282 0.9998794 0.9998667 0.9998583 0.8975101 0.9998247 0.9998838
## [57] 0.4171610 0.9998602 0.9559805 0.9998184 0.9997419 0.9998535 0.8095904
## [64] 0.9996808
## 
## $type
##  [1] "topic 1" "topic 1" "topic 1" "topic 1" "topic 1" "topic 1" "topic 1"
##  [8] "topic 1" "topic 1" "topic 1" "topic 1" "topic 1" "topic 1" "topic 1"
## [15] "topic 1" "topic 1" "topic 1" "topic 2" "topic 2" "topic 2" "topic 2"
## [22] "topic 2" "topic 2" "topic 2" "topic 2" "topic 2" "topic 2" "topic 2"
## [29] "topic 2" "topic 2" "topic 2" "topic 2" "topic 2" "topic 3" "topic 3"
## [36] "topic 3" "topic 3" "topic 3" "topic 3" "topic 3" "topic 3" "topic 3"
## [43] "topic 3" "topic 3" "topic 3" "topic 3" "topic 3" "topic 4" "topic 4"
## [50] "topic 4" "topic 4" "topic 4" "topic 4" "topic 4" "topic 4" "topic 4"
## [57] "topic 4" "topic 4" "topic 4" "topic 4" "topic 4" "topic 4" "topic 4"
## [64] "topic 4"

Density

edge_density(TopicNetwork)

## [1] 0.02983683

Reciprocity 相互性

reciprocity(TopicNetwork)

## [1] 1

因為是無向圖，所以 Reciprocity 等於 1

Transitivity 遞移性

transitivity(TopicNetwork, type="global")

## [1] 0

該網路沒有 Transitivity，沒有 triangle loop

Diameter 直徑

diameter(TopicNetwork, directed = F)

## [1] 3.999625

mean_distance

mean_distance(TopicNetwork, directed=F)

## [1] 3.482609

degree

deg <- degree(TopicNetwork, mode = "all")
plot(TopicNetwork, 
     # 依照 degree 大小設定節點大小
     vertex.size = deg * 3,
     # 只顯示 degree 大於 5 的節點名稱
     vertex.label = ifelse(deg > 5, V(TopicNetwork)$name, NA))

# Histogram of node degree
hist(deg, breaks = 1:vcount(TopicNetwork)-1, main = "Histogram of node degree")

# Degree distribution
deg.dist <- degree_distribution(TopicNetwork, cumulative=T, mode="all")
plot( x=0:max(deg), y=1-deg.dist, pch=19, cex=1.2, col="orange", xlab="Degree", ylab="Cumulative Frequency")

由圖可知只有 topic 1, topic 2, topic 3, topic 4 這四個節點會有較多的連結，其他節點大多只有一兩個連結，因為通常一個故事只會歸屬一種主題。

Centrality

Degree Centrality

number of ties

degree(TopicNetwork, mode = "all")

##      61      56      24       1       2      51      12      37      52      53 
##       1       1       1       1       1       1       1       1       1       1 
##       5      11      46      57       7      47      44      42      19      40 
##       2       1       1       1       2       1       1       1       1       1 
##      10      15       8      29      16      13       4      20      21      26 
##       1       1       1       1       1       1       1       1       1       1 
##       3      43      18      28      58      60      33      50      35      49 
##       1       1       1       1       1       1       1       1       1       1 
##      54      17      23       6      41      27      31      22      62      48 
##       1       1       1       1       1       1       1       1       1       1 
##       9      59      14      34      32      30      36      25      38      45 
##       1       1       1       1       1       1       1       1       1       1 
##      55      39 topic 1 topic 2 topic 3 topic 4 
##       1       1      17      16      14      17

centr_degree(TopicNetwork, mode = "all", normalized = T)

## $res
##  [1]  1  1  1  1  1  1  1  1  1  1  2  1  1  1  2  1  1  1  1  1  1  1  1  1  1
## [26]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [51]  1  1  1  1  1  1  1  1  1  1  1  1 17 16 14 17
## 
## $centralization
## [1] 0.2317016
## 
## $theoretical_max
## [1] 4290

各主題 (topic1, topic2, topic3, topic4) 的節點的 Degree Centrality 較高
res : 節點中心度
centralization
theoretical_max : 最大中心化分數

Eigenvector Centrality

centrality proportional to the sum of connection centralities

#eigen_centrality(TopicNetwork, directed = F, weights = NA)
#centr_eigen(TopicNetwork, directed = F, normalized = T)

Betweenness Centrality

centrality based on a broker position connecting others

betweenness(TopicNetwork, directed = F)

##      61      56      24       1       2      51      12      37      52      53 
##       0       0       0       0       0       0       0       0       0       0 
##       5      11      46      57       7      47      44      42      19      40 
##     561       0       0       0     544       0       0       0       0       0 
##      10      15       8      29      16      13       4      20      21      26 
##       0       0       0       0       0       0       0       0       0       0 
##       3      43      18      28      58      60      33      50      35      49 
##       0       0       0       0       0       0       0       0       0       0 
##      54      17      23       6      41      27      31      22      62      48 
##       0       0       0       0       0       0       0       0       0       0 
##       9      59      14      34      32      30      36      25      38      45 
##       0       0       0       0       0       0       0       0       0       0 
##      55      39 topic 1 topic 2 topic 3 topic 4 
##       0       0     936     630      91     664

edge_betweenness(TopicNetwork, directed = F)

##  [1]  50  50  50  50  50  50  50  50  50  50 594  50  50  50 578  50  50  50  50
## [20]  50  50  50  50  50  50  50  50  50  50  50  50 560  50  14  14  14  14  14
## [39]  14  14  14  14  14  14  14  14  14  50  50  50  50  50  50  50  50  50 578
## [58]  50  50  50  50  50  50  50

centr_betw(TopicNetwork, directed = F, normalized = T)

## $res
##  [1]   0   0   0   0   0   0   0   0   0   0 561   0   0   0 544   0   0   0   0
## [20]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [39]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [58]   0   0   0   0   0 936 630  91 664
## 
## $centralization
## [1] 0.4315828
## 
## $theoretical_max
## [1] 135200

各主題 (topic1, topic2, topic3, topic4) 節點的 Betweenness Centrality 最高
故事 5, 7 節點也具一定程度的 Betweenness Centrality
其他故事節點的 Betweenness Centrality 則為 0

Find cliques

net.sym <- as.undirected(TopicNetwork, mode = "collapse", edge.attr.comb = list(weight = "sum", "ignore"))

#cliques(net.sym) # list of cliques       
sapply(cliques(net.sym), length) # clique sizes

##   [1] 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
##  [38] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
##  [75] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 2 1 2 1 2 1 2 2 1
## [112] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2

#largest_cliques(net.sym) # cliques with max number of nodes

vcol <- rep("grey80", vcount(net.sym))
vcol[unlist(largest_cliques(net.sym))] <- "gold"
plot(as.undirected(net.sym), vertex.label=V(net.sym)$name, vertex.color=vcol)

Community detection

ceb <- cluster_edge_betweenness(TopicNetwork)

## Warning in cluster_edge_betweenness(TopicNetwork): At community.c:
## 460 :Membership vector will be selected based on the lowest modularity score.

## Warning in cluster_edge_betweenness(TopicNetwork): At community.c:
## 467 :Modularity calculation with weighted edge betweenness community detection
## might not make sense -- modularity treats edge weights as similarities while
## edge betwenness treats them as distances

#dendPlot(ceb, mode="hclust")
plot(ceb, TopicNetwork)

結論

62 篇的格林童話，做文字與主題分析，會因為很多故事的用字，都使用相同的名詞或名稱，導致分析上，需要再作細部折解才有辨法看出主題的方向。

格林童話文字分析

格林童話文字分析

資料介紹

文本探索

載入套件

讀取資料

文字前處理

查看斷詞結果

排除停用字

統計字頻

文字雲分析

總體文字雲

區分正、負面情緒的文字雲 (採用 Bing 字典)

情緒分析

四個情緒字典介紹

NRC字典

準備資料

NRC 最常出現的負面情緒字

NRC 最常出現的正面情緒字

找出 “king” 最常出現的前三大故事

Afinn 字典

準備資料

每個故事情緒加總變化趨勢

Afinn 最常出現的負面情緒字

Afinn 最常出現的正面情緒字

Bing 字典

準備資料

Bing 最常出現的正/負面情緒字

Loughran 字典

Loughran 中各情緒類型包含多少字

Loughran 最常出現的正/負面情緒字

四字典各故事比較分析

正/負面情緒詞總數差異

準備資料

每個故事情緒詞總數比較

正面-負面情緒差異

準備資料

合併四個字典並劃出折線圖

情緒分析:主要針對NRC字典,Afinn字典,Bing字典,loughran字典

TF-IDF 分析

計算 term frequency

計算 TF-IDF

同一行裡最常一起出現的詞彙

LDA 分析

將資料轉成 DocumentTermMatrix (DTM) 格式

主題-文字分析

4 個主題 Beta 值

選出各主題前 5 高的文字

主題-文字分析-Beta值(seed=2000)

LDAvis分析(分10主題)

文件-主題分析

計算每個主題的故事數量

畫出各篇故事屬於各個 topic 的機率

網路分析

建立網路關係

畫出網路圖

網路屬性

節點屬性

設定連結屬性

Density

Reciprocity 相互性

Transitivity 遞移性

Diameter 直徑

mean_distance

degree

Centrality

Degree Centrality

Eigenvector Centrality

Betweenness Centrality

Find cliques

Community detection

結論