社群媒體分析讀書會作業-2

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼

## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/en_US.UTF-8"

前處理

載入packages

rm(list=ls(all=T))
# 整理
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(tidytext)
library(jiebaR)

## Loading required package: jiebaRD

library(gutenbergr)
library(stringr)
library(tmcn) # 簡轉繁

## # tmcn Version: 0.2-13

library(readr)
# 繪圖
library(wordcloud2)
library(ggridges)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggplot2)
library(forcats)

# 下載 "三國演義" 書籍，並且將text欄位為空的行給清除，以及將重複的語句清除
three_kingdom <- gutenberg_download(23950,mirror = "http://mirrors.xmission.com/gutenberg/") %>% 
  filter(text!="") %>% 
  distinct(gutenberg_id, text)

三國演義每章節的開頭都會有“第X回”的詞，但有些僅以“第X回”表示，有些則在“第X回”後面加上章節名稱，EX.「第一回：宴桃園豪傑三結義，斬黃巾英雄首立功」

按章節整理

# 按章節整理
three_kingdom <- three_kingdom %>% 
  mutate(chapter = cumsum(str_detect(three_kingdom$text, regex("^第.{1,3}回"))))
three_kingdom_doc = paste0(three_kingdom$text,collapse = "") 

# 若要將一個字串分割成多個字串，可以使用 strsplit 函數，第一個參數是輸入的字串，而第二個參數則是分隔字串：
three_kingdom_docVector = unlist(strsplit(three_kingdom_doc ,"[。.？！」「]"), use.names=FALSE)
head(three_kingdom_docVector)

## [1] "第一回：宴桃園豪傑三結義，斬黃巾英雄首立功　　詞曰：　　滾滾長江東逝水，浪花淘盡英雄"
## [2] "是非成敗轉頭空：青山依舊在，幾度夕陽紅"                                              
## [3] "白髮漁樵江渚上，慣看秋月春風"                                                        
## [4] "一壺濁酒喜相逢：古今多少事，都付笑談中"                                              
## [5] "　　話說天下大勢，分久必合，合久必分：周末七國分爭，并入於秦"                        
## [6] "及秦滅之後，楚、漢分爭，又并入於漢"

three_kingdom_doc <- as.data.frame(three_kingdom_docVector)
three_kingdom_doc$id <- 23950
three_kingdom_doc <- three_kingdom_doc %>% 
  transmute(id =id, text = three_kingdom_docVector)
three_kingdom_doc <- filter(three_kingdom_doc,text!= "")

head(three_kingdom_doc)

##      id
## 1 23950
## 2 23950
## 3 23950
## 4 23950
## 5 23950
## 6 23950
##                                                                                   text
## 1 第一回：宴桃園豪傑三結義，斬黃巾英雄首立功　　詞曰：　　滾滾長江東逝水，浪花淘盡英雄
## 2                                               是非成敗轉頭空：青山依舊在，幾度夕陽紅
## 3                                                         白髮漁樵江渚上，慣看秋月春風
## 4                                               一壺濁酒喜相逢：古今多少事，都付笑談中
## 5                         　　話說天下大勢，分久必合，合久必分：周末七國分爭，并入於秦
## 6                                                   及秦滅之後，楚、漢分爭，又并入於漢

根據上方整理出來的規則，我們可以使用正規表示式，將句子區分章節

three_kingdom_doc <- three_kingdom_doc %>% 
  mutate(chapter = cumsum(str_detect(three_kingdom_doc$text, regex("^第.{1,3}回"))))

下載下來的書已經完成斷句

使用三國演義專有名詞字典並且將其由簡體轉繁體

# 停用詞簡轉繁
three_kingdom_dict <- read_csv("three_kingdom.text",col_names =F )

## Parsed with column specification:
## cols(
##   X1 = col_character()
## )

three_kingdom_dict <- toTrad(three_kingdom_dict$X1)
write.table(three_kingdom_dict,file="three_kingdom_dict.txt",sep=",",row.names = F,col.names = F,fileEncoding="UTF-8")
jieba_tokenizer <- worker(user="three_kingdom_dict.txt", stop_word = "stop_words.txt")

斷詞

# 設定斷詞function
three_kingdom_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

# 斷詞
three_kingdom_doc$text <- as.character(three_kingdom_doc$text)
tokens <- three_kingdom_doc %>% 
  unnest_tokens(word, text, token=three_kingdom_tokenizer)
str(tokens)

## 'data.frame':    287947 obs. of  3 variables:
##  $ id     : num  23950 23950 23950 23950 23950 ...
##  $ chapter: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word   : chr  "第一回" "宴" "桃園" "豪傑" ...

head(tokens)

##        id chapter   word
## 1   23950       1 第一回
## 1.1 23950       1     宴
## 1.2 23950       1   桃園
## 1.3 23950       1   豪傑
## 1.4 23950       1     三
## 1.5 23950       1   結義

# 計算詞彙的出現次數，如果詞彙只有一個字則不列入計算 
tokens_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>%  
  filter(sum > 10) %>%
  arrange(desc(sum))

印出最常見的20個詞彙

head(tokens_count, 20)

## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 曹操     932
##  2 孔明     843
##  3 將軍     776
##  4 卻說     642
##  5 玄德     519
##  6 關公     507
##  7 丞相     482
##  8 二人     458
##  9 不可     426
## 10 荊州     418
## 11 孔明曰   386
## 12 玄德曰   386
## 13 不能     384
## 14 如此     372
## 15 張飛     349
## 16 如何     344
## 17 商議     341
## 18 主公     327
## 19 軍士     320
## 20 操曰     307

繪圖 & 分析

出現最多次數的詞彙的文字雲

# 用出現次數大於200的繪圖為文字雲
tokens_count %>% 
  filter(sum>200)%>%
  wordcloud2()

人物分析

劉禪名字的分析

# 劉禪的部分
liuchan = tokens %>% 
  filter(word=="劉禪"|word=="公嗣")%>%
  group_by(chapter) %>% 
  summarize(sum = n()) %>% 
  mutate(name = "劉禪(公嗣)",total = sum(sum))
adou = tokens %>% 
  filter(word== "阿斗")%>%
  group_by(chapter) %>% 
  summarize(sum = n())%>% 
  mutate(name = "阿斗",total = sum(sum))
c = bind_rows(liuchan,adou)
c %>% ggplot(aes(x=chapter,y =sum,color = name))+geom_line()+xlab("chapter")+ylab("出現次數")+theme(text = element_text(family='STHeiti'))

d = c[,3:4] %>% distinct() 
d[1,1] = "劉禪"
d[3,1] = "公嗣"
d[3,2] = 0
d %>% ggplot(aes(x= name , y = total,fill = name))+geom_bar(stat="identity")+theme(text = element_text(family='STHeiti'))

其他人物大都都有較為正式的稱呼，但是「劉禪」字「公嗣」加起來沒「阿斗」，以此可見「劉禪」不配擁有姓名

三國中較為重要的九位人物分析

全書重要人物出現的「總次數」

# "'劉備', '諸葛亮', '曹操', '關雲長', '趙雲', '司馬懿', '周瑜', '張飛', '孫權'"
zhuge_tokens <- tokens_count %>%
  filter(word %in% c('諸葛亮','孔明曰','孔明笑', '孔明'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "諸葛亮")
liubei_tokens <- tokens_count %>%
  filter(word %in% c('劉備', '玄德', '玄德曰', '主公', '劉皇叔', '劉玄德','玄德大', '玄德問', '劉豫州', '玄德聞', '玄德乃'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "劉備")
caoca_tokens <- tokens_count %>%
  filter(word %in% c('曹操', '丞相', '操大怒', '曹丞相', '操大喜'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "曹操")
guanyu_tokens <- tokens_count %>%
  filter(word %in% c('關雲長', '雲長', '關公'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "關羽")
zhangfei_tokens <- tokens_count %>%
  filter(word %in% c('張飛', '翼德'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "張飛")
zhaoyun_tokens <- tokens_count %>%
  filter(word %in% c('趙雲', '子龍', '趙子龍'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "趙雲")
simayi_tokens <- tokens_count %>%
  filter(word %in% c('司馬懿', '司馬', '仲達'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "司馬懿")
zhouyu_tokens <- tokens_count %>%
  filter(word %in% c('公瑾', '周瑜'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "周瑜")
sunquan_tokens <- tokens_count %>%
  filter(word %in% c('仲謀', '孫權'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "孫權")

# 全書重要人物出現的「總次數」
allname <- rbind(zhuge_tokens,liubei_tokens,caoca_tokens,guanyu_tokens,
                 zhangfei_tokens,zhaoyun_tokens,simayi_tokens,zhouyu_tokens,sunquan_tokens)
head(allname,20)

## # A tibble: 9 x 2
##   allsum name  
##    <int> <chr> 
## 1   1432 諸葛亮
## 2   1763 劉備  
## 3   1505 曹操  
## 4    728 關羽  
## 5    364 張飛  
## 6    321 趙雲  
## 7    290 司馬懿
## 8    250 周瑜  
## 9    264 孫權

# Reorder following the value of another column:
plot <- allname %>%
  mutate(name = fct_reorder(name, allsum)) %>%
  ggplot( aes(x=name, y=allsum))+
    geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
    coord_flip() +
    xlab("") +
    theme(text = element_text(family='STHeiti'))
plot

可以非常明顯的看出《三國演義》故事情節圍繞曹操、劉備、諸葛亮三大男主角展開。

各主要人物在「各個章節出現的次數」變化

# 各主要人物在「各個章節出現的次數」變化
# 以各個chapter和word來groupby
CW_tokens_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(chapter,word) %>% 
  summarise(sum = n()) %>%  
  filter(sum > 10) %>%
  arrange(desc(sum))
# head(CW_tokens_count)

CW_zhuge_tokens <- CW_tokens_count %>%
  filter(word %in% c('諸葛亮','孔明曰','孔明笑', '孔明'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "諸葛亮")
CW_liubei_tokens <- CW_tokens_count %>%
  filter(word %in% c('劉備', '玄德', '玄德曰', '主公', '劉皇叔', '劉玄德','玄德大', '玄德問', '劉豫州', '玄德聞', '玄德乃'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "劉備")
CW_caoca_tokens <- CW_tokens_count %>%
  filter(word %in% c('曹操', '丞相', '操大怒', '曹丞相', '操大喜','阿瞞'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "曹操")
CW_guanyu_tokens <- CW_tokens_count %>%
  filter(word %in% c('關雲長', '雲長', '關公'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "關羽")
CW_zhangfei_tokens <- CW_tokens_count %>%
  filter(word %in% c('張飛', '翼德'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "張飛")
CW_zhaoyun_tokens <- CW_tokens_count %>%
  filter(word %in% c('趙雲', '子龍', '趙子龍'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "趙雲")
CW_simayi_tokens <- CW_tokens_count %>%
  filter(word %in% c('司馬懿', '司馬', '仲達'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "司馬懿")
CW_zhouyu_tokens <- CW_tokens_count %>%
  filter(word %in% c('公瑾', '周瑜'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "周瑜")
CW_sunquan_tokens <- CW_tokens_count %>%
  filter(word %in% c('仲謀', '孫權'))%>%
  summarise(allsum = sum(sum))%>%
  mutate(name = "孫權")

CW_allname <- rbind(CW_zhuge_tokens,CW_liubei_tokens,CW_caoca_tokens,CW_guanyu_tokens,
                    CW_zhangfei_tokens,CW_zhaoyun_tokens,CW_simayi_tokens,CW_zhouyu_tokens,CW_sunquan_tokens)
head(CW_allname)

## # A tibble: 6 x 3
##   chapter allsum name  
##     <int>  <int> <chr> 
## 1      34     20 諸葛亮
## 2      36     37 諸葛亮
## 3      40     48 諸葛亮
## 4      41     27 諸葛亮
## 5      42     20 諸葛亮
## 6      43     43 諸葛亮

各主其要人物在各章節的出現的數量的靜態折線圖

top_plot <- 
  CW_allname %>% 
  group_by(name)%>%
  ggplot(aes(x = chapter, y=allsum, fill="type", color=factor(name))) +
  geom_line() + 
  ggtitle("各主其要人物在各章節的出現的數量") + 
  xlab("章節") + 
  ylab("數量") +
  theme(text = element_text(family='STHeiti'))
top_plot

各個人物出現章節密度山脊線圖

ttop_plot <- CW_allname %>% 
  group_by(name)%>%
  ggplot(aes(x = chapter, y=name, fill=name)) +
  geom_density_ridges(alpha=0.6, stat="binline", bins=20) +
  theme_ridges() +
  theme(
    legend.position="none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
    ) +
  xlab("") +
  ylab("Assigned Probability (%)")+
  theme(text = element_text(family='STHeiti'))
ttop_plot

我們可以透過上方的山脊圖，比較清楚的看出，各個人物活躍的章節和時期

三大智力英雄的出現時期

three_smart_plot <- bind_rows(CW_zhuge_tokens, CW_simayi_tokens, CW_zhouyu_tokens) %>%
  ggplot(aes(x = chapter, y=allsum, fill=name,color = name)) +
  geom_point() +
  geom_line() +
  facet_wrap(~name, ncol = 1) + 
  ggtitle("智力型角色:「孔明」v.s.「司馬懿」v.s.「周瑜」") + 
  xlab("章節") + 
  ylab("出現次數") + 
  theme(text = element_text(family = "STHeiti"))
three_smart_plot

司馬懿的值得稱道的事跡有：在五丈原拖死了諸葛孔明，平亂孟達，公孫淵。他最大的貢獻自然是逐步掌握了曹魏的政權，讓他的孫子坐享其成，最後改朝換代, 所以也主要出現在比較後期的時候。

周瑜早年輔佐孫策平定江東，也是吳國開國立業的最大功臣。同時作為赤壁之戰的主帥確實是一手締造出了這三分局勢。而赤壁之戰就主要發生在四十回左右，也正式周瑜出場最多的時候

諸葛亮是劉備死後蜀漢的實際統治者，在內政軍事等方面都有所建，平定南方少數民族和北伐都鞏固了蜀漢政權，延長了三國的局面，是三國後期最具影響力的人物之一，所以在後期出現的也較多，也比較活躍。

各章節長度

以句子數量來計算

plot <- 
  bind_rows(
    three_kingdom %>% 
      group_by(chapter) %>% 
      summarise(count = n(), type="sentences"),
    tokens %>% 
      group_by(chapter) %>% 
      summarise(count = n(), type="words")) %>% 
  group_by(type)%>%
  ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
  geom_line() + 
  ggtitle("各章節的句子總數") + 
  xlab("章節") + 
  ylab("句子數量")+
  theme(text = element_text(family='STHeiti'))
plot

> 第七十一回用了最多的句子和單字，這章就是在講定軍山之戰，也就是劉備稱霸成為漢中王前的一役。

計算前60回合後60回的詞彙在全文中出現比率的差異(總共120章)

frequency <- tokens %>% mutate(part = ifelse(chapter<=60, "First 60", "Last 60")) %>%
  filter(nchar(.$word)>1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(part, proportion) %>%
  gather(part, proportion, `Last 60`)
frequency

## # A tibble: 45,071 x 4
##    word   `First 60` part    proportion
##    <chr>       <dbl> <chr>        <dbl>
##  1 阿斗    0.000290  Last 60  0.0000129
##  2 阿房宮 NA         Last 60  0.0000129
##  3 阿父    0.0000104 Last 60 NA        
##  4 阿附    0.0000104 Last 60  0.0000387
##  5 阿會喃 NA         Last 60  0.000180 
##  6 阿瞞    0.0000207 Last 60 NA        
##  7 阿鴦   NA         Last 60  0.0000129
##  8 阿諛    0.0000104 Last 60 NA        
##  9 阿之原  0.0000104 Last 60 NA        
## 10 哀告    0.0000104 Last 60  0.0000644
## # … with 45,061 more rows

ggplot(frequency, aes(x = proportion, y = `First 60`, color = abs(`First 60` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "First 60", x = "Last 60")+
  theme(text = element_text(family='Heiti TC Light'))

## Warning: Removed 35802 rows containing missing values (geom_point).

## Warning: Removed 35803 rows containing missing values (geom_text).