(一) Topic: The fatal stabbing of railway police officer 李承翰

(二) Research motivation

108/7/3 – The incident. On the evening of July 3, 2019, 25-year-old officer 李承翰 boarded a Taiwan Railways Tze-chiang express to handle a fare-evasion dispute. The passenger, surnamed 鄭, suddenly drew a knife and stabbed him in the abdomen, leaving him bleeding heavily with organs exposed. Worried that other passengers might be hurt, the wounded 李承翰 fought through the pain to keep 鄭 pinned down; he was rushed to hospital but died of his injuries.

109/3/13 – The trial opens. At the afternoon hearing, a psychiatrist testified that 鄭 suffers from schizophrenia, that his behavior at the time was driven by delusions and auditory hallucinations, and that during the attack he had lost the capacity to discern right from wrong along with part of his self-control. The medical team's testimony would be a key factor in whether 鄭 was ultimately found guilty.

109/4/30 – First-instance verdict. Citing 鄭's schizophrenia, the court found him not guilty (subject to appeal), ordered five years of compulsory treatment, and allowed release on NT$500,000 bail during the appeal period, with continued detention if the bail was not posted.

109/5/1 – The evaluating psychiatrist under attack. After the acquittal, forensic psychiatrist 沈正哲 was pilloried online and posted a nearly 2,000-character statement lamenting that his professional judgment had not been respected.

109/6/4 – 李承翰's father, grief-stricken over the case, passed away.

We therefore use data analysis to examine, at each stage of this case, the main content and direction of media coverage and public opinion, and how the turning points shifted the topics and sentiment of the discussion.

First, install the required packages and load them.
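
The library() calls themselves are not shown in this report; the sketch below lists the packages inferred from the functions called later (drop anything your environment does not need).

# Packages inferred from the calls in this report; install any missing ones
# with install.packages() first.
library(readr)        # read_csv(), read_file()
library(dplyr)        # data-wrangling pipelines
library(tidyr)        # spread(), gather()
library(stringr)      # str_detect(), str_extract()
library(ggplot2)      # plotting
library(scales)       # percent_format(), date_format()
library(tidytext)     # unnest_tokens(), cast_dtm(), reorder_within(), tidy()
library(jiebaR)       # worker(), segment(), new_user_word()
library(topicmodels)  # LDA(), posterior()
library(reshape2)     # melt(), acast()
library(RColorBrewer) # brewer.pal()
library(widyr)        # pairwise_count(), pairwise_cor()
library(igraph)       # graph_from_data_frame(), cluster_fast_greedy()
library(ggraph)       # network plots
library(wordcloud)    # comparison.cloud()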

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output
## [1] ""

## Data source: PTT Gossiping board, search keyword 「李承翰_鐵路警」

Posts <- read_csv('./李承翰_鐵路警_articleMetaData.csv')
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )
head(Posts)
## # A tibble: 6 x 10
##   artTitle artDate    artTime  artUrl artPoster artCat commentNum  push   boo
##   <chr>    <date>     <time>   <chr>  <chr>     <chr>       <dbl> <dbl> <dbl>
## 1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 2 [新聞]快訊/~ 2019-07-03 16:32:53 https~ DDDDRR    Gossi~       1496  1048   103
## 3 [問卦]刺死鐵~ 2019-07-03 16:40:34 https~ Elsie1999 Gossi~        290   120    17
## 4 Re:[新聞]~ 2019-07-03 16:42:58 https~ ziggs1222 Gossi~         29     4    20
## 5 [新聞]嘉義鐵~ 2019-07-03 17:16:55 https~ lilskies  Gossi~         29     2    11
## 6 [問卦]鐵路警~ 2019-07-03 17:37:29 https~ v963610   Gossi~         72    18    13
## # ... with 1 more variable: sentence <chr>
Reviews <- read_csv('./李承翰_鐵路警_articleReviews.csv')
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   cmtPoster = col_character(),
##   cmtStatus = col_character(),
##   cmtDate = col_datetime(format = ""),
##   cmtContent = col_character()
## )
## Warning: 16123 parsing failures.
##   row     col   expected          actual                                 file
## 11047 cmtDate date like  2020/3/13 22:55 './李承翰_鐵路警_articleReviews.csv'
## 11048 cmtDate date like  2020/3/13 22:55 './李承翰_鐵路警_articleReviews.csv'
## 11049 cmtDate date like  2020/3/13 22:55 './李承翰_鐵路警_articleReviews.csv'
## 11050 cmtDate date like  2020/3/13 22:55 './李承翰_鐵路警_articleReviews.csv'
## 11051 cmtDate date like  2020/3/13 22:56 './李承翰_鐵路警_articleReviews.csv'
## ..... ....... .......... ............... ....................................
## See problems(...) for more details.
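
The 16,123 failures occur because cmtDate switches to a `2020/3/13 22:55`-style format partway through the file, which the guessed datetime format cannot parse. One possible fix (a sketch, assuming only these two formats appear): read the column as text and let lubridate try both formats.

library(lubridate)  # parse_date_time()

# Hypothetical re-read: force cmtDate to character, then parse both formats;
# rows matching neither format become NA instead of silently failing.
Reviews <- read_csv('./李承翰_鐵路警_articleReviews.csv',
                    col_types = cols(cmtDate = col_character())) %>%
  mutate(cmtDate = parse_date_time(cmtDate, orders = c("Ymd HMS", "Ymd HM")))
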
head(Reviews)
## # A tibble: 6 x 10
##   artTitle artDate    artTime  artUrl artPoster artCat cmtPoster cmtStatus
##   <chr>    <date>     <time>   <chr>  <chr>     <chr>  <chr>     <chr>    
## 1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ gankgf    推       
## 2 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ saya2185  →        
## 3 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
## 4 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
## 5 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ Leo4891   推       
## 6 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ iam0718   推       
## # ... with 2 more variables: cmtDate <dttm>, cmtContent <chr>

Distribution of articles about the case over time

Posts %>% 
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "red") +
  geom_point()

  • The discussion shows three peaks (marked in the sketch below):
  • 1. 108/7/3 – the incident
  • 2. 109/4/30 – the first-instance verdict
  • 3. 109/6/4 – the death of 李承翰's father
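
A minimal sketch reusing the plot above to mark the three peak dates directly on the series:

# Dashed vertical lines at the three peak dates from the timeline above.
peaks <- as.Date(c('2019-07-03', '2020-04-30', '2020-06-04'))

Posts %>% 
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "red") +
  geom_point() +
  geom_vline(xintercept = peaks, linetype = "dashed", color = "gray40")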

Posts Tokenization

# Exclude "Re:" reply articles
Posts <- Posts %>%
  filter(!substr(artTitle, start = 1, stop = 3) %in% c("Re:"))

# Tokenize article by article.
# All PTT articles related to the case are kept; duplicates appearing on the
# same day could be removed with:
#Posts <- Posts %>% 
#        distinct(artTitle, artDate, sentence)

# Load the custom dictionary and the stop-word list
mask_lexicon <- scan(file = "./mask_lexicon.txt", what = character(), sep = '\n', 
                     encoding = 'utf-8', fileEncoding = 'utf-8')
stop_words <- scan(file = "./stop_words.txt", what = character(), sep = '\n', 
                   encoding = 'utf-8', fileEncoding = 'utf-8')

# Use the custom dictionary and remove stop words.
jieba_tokenizer = worker(write = "NOFILE")
new_user_word(jieba_tokenizer, c(mask_lexicon))
## [1] TRUE
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
  })
}
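
A quick spot check on the tokenizer; the example sentence below is hypothetical, and the exact segmentation depends on the contents of mask_lexicon and stop_words, so no output is reproduced here.

# Example input; stop words and single-character tokens are dropped.
chi_tokenizer("鐵路警察在自強號列車上處理補票糾紛")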

tokens <- Posts %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))  # drop tokens containing digits or Latin letters

tokens %>% head(20)
## # A tibble: 20 x 10
##    artTitle artDate    artTime  artUrl artPoster artCat commentNum  push   boo
##    <chr>    <date>     <time>   <chr>  <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  2 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  3 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  4 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  5 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  6 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  7 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  8 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  9 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 10 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 11 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 12 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 13 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 14 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 15 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 16 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 17 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 18 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 19 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## 20 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
## # ... with 1 more variable: word <chr>
# Difference in word-usage proportions before vs. after the first-instance verdict (2020/4/30)
frequency <- tokens %>% 
  mutate(event = ifelse(artDate < as.Date('2020-04-30'), "First", "Last")) %>%
  filter(nchar(word) > 1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(event, word) %>%
  group_by(event) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(event, proportion) %>% 
  gather(event, proportion, `Last`)
frequency
## # A tibble: 4,599 x 4
##    word         First event proportion
##    <chr>        <dbl> <chr>      <dbl>
##  1 电视     NA        Last   0.0000919
##  2 丁允     NA        Last   0.000368 
##  3 丁世傑    0.000122 Last  NA        
##  4 丁怡銘   NA        Last   0.000276 
##  5 丁怡銘今 NA        Last   0.0000919
##  6 丁偉杰   NA        Last   0.000184 
##  7 丁盛豐    0.000122 Last  NA        
##  8 丁勝豐    0.000122 Last  NA        
##  9 了不起   NA        Last   0.0000919
## 10 了會      0.000122 Last  NA        
## # ... with 4,589 more rows
# Plot the comparison chart
ggplot(frequency, aes(x = proportion, y = `First`, color = abs(`First` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position = "none") +
  labs(y = "Before the first-instance verdict", x = "After the first-instance verdict")
## Warning: Removed 3601 rows containing missing values (geom_point).
## Warning: Removed 3602 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

  • Words that appear with notably higher frequency before vs. after the first-instance verdict:

  • Before the verdict: 電擊槍 (taser), 告別式 (funeral), 旅客 (passengers)

  • After the verdict: 法官 (judge), 判決 (verdict), 父親 (father), 李增文 (the officer's father)

  • Before the verdict, media coverage ran in two main directions:

  • 1. The incident itself, from start to finish

  • 2. Death in the line of duty and the work of railway police

  • After the verdict, media coverage ran in two main directions:

  • 1. Readings of the ruling and its social impact

  • 2. The death of 李承翰's father

Posts
## # A tibble: 128 x 10
##    artTitle artDate    artTime  artUrl artPoster artCat commentNum  push   boo
##    <chr>    <date>     <time>   <chr>  <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~        133    57     8
##  2 [新聞]快訊/~ 2019-07-03 16:32:53 https~ DDDDRR    Gossi~       1496  1048   103
##  3 [問卦]刺死鐵~ 2019-07-03 16:40:34 https~ Elsie1999 Gossi~        290   120    17
##  4 [新聞]嘉義鐵~ 2019-07-03 17:16:55 https~ lilskies  Gossi~         29     2    11
##  5 [問卦]鐵路警~ 2019-07-03 17:37:29 https~ v963610   Gossi~         72    18    13
##  6 [問卦]殺鐵路~ 2019-07-03 17:41:58 https~ Lcyy      Gossi~         37    10     6
##  7 [新聞]【鐵路~ 2019-07-03 18:08:00 https~ vic2211   Gossi~        797   473    52
##  8 [新聞]【鐵路~ 2019-07-03 18:28:28 https~ ash99119~ Gossi~        183    70    47
##  9 [新聞]哀!身~ 2019-07-03 18:44:46 https~ DOOHDLIHC Gossi~         60    26     2
## 10 [新聞]台鐵持~ 2019-07-03 19:08:27 https~ friedrich Gossi~        280   103    68
## # ... with 118 more rows, and 1 more variable: sentence <chr>

Reviews Tokenization

allPoster <- c(Posts$artPoster, Reviews$cmtPoster)
#length(unique(allPoster))
# Collect every user who appears:
# a user who has authored at least one article is labeled "poster";
# a user who has only commented is labeled "replyer".
userList <- data.frame(user = unique(allPoster)) %>%
              mutate(type = ifelse(user %in% Posts$artPoster, "poster", "replyer"))
#userList
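
A quick look at how the users split (a sketch; the actual counts depend on the crawled data):

# Number of users who have posted at least once vs. comment-only users.
userList %>% count(type)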

# Join posts and comments by artUrl (skipped here, since the comment table
# already carries the article metadata)
#posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")
posts_Reviews <- Reviews
#posts_Reviews

#Reviews_sentences <- strsplit(Reviews$cmtContent,"[。!;?!?;,:]")

#Reviews_sentences <- data.frame(
#                        artTitle = rep(Reviews$artTitle, sapply(Reviews_sentences, length)),
#                        artDate = rep(Reviews$artDate, sapply(Reviews_sentences, length)), 
#                        artTime = rep(Reviews$artTime, sapply(Reviews_sentences, length)), 
#                        artUrl = rep(Reviews$artUrl, sapply(Reviews_sentences, length)),
#                        cmtContent = unlist(Reviews_sentences)
#                      ) %>%
#                      filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))

#Reviews_sentences$cmtContent <- as.character (Reviews_sentences$cmtContent)


# Load the custom dictionary and the stop-word list (same files as above)
mask_lexicon <- scan(file = "./mask_lexicon.txt", what = character(), sep = '\n', 
                     encoding = 'utf-8', fileEncoding = 'utf-8')
stop_words <- scan(file = "./stop_words.txt", what = character(), sep = '\n', 
                   encoding = 'utf-8', fileEncoding = 'utf-8')


# Use the custom dictionary and remove stop words.
jieba_tokenizer = worker(write = "NOFILE")
new_user_word(jieba_tokenizer, c(mask_lexicon))
## [1] TRUE
Reviews_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
  })
}
tokens <- Reviews %>%
  unnest_tokens(word, cmtContent, token=Reviews_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) #%>%
  #count(artUrl, word) %>%
  #rename(count=n)
tokens %>% head(20)
## # A tibble: 20 x 10
##    artTitle artDate    artTime  artUrl artPoster artCat cmtPoster cmtStatus
##    <chr>    <date>     <time>   <chr>  <chr>     <chr>  <chr>     <chr>    
##  1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ gankgf    推       
##  2 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ saya2185  →        
##  3 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
##  4 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
##  5 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
##  6 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
##  7 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
##  8 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
##  9 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
## 10 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ Leo4891   推       
## 11 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ Leo4891   推       
## 12 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ Leo4891   推       
## 13 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ Leo4891   推       
## 14 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ iam0718   推       
## 15 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ WolfTeac~ →        
## 16 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ WolfTeac~ →        
## 17 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ WolfTeac~ →        
## 18 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ darkbrig~ →        
## 19 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ darkbrig~ →        
## 20 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ darkbrig~ →        
## # ... with 2 more variables: cmtDate <dttm>, word <chr>
# Difference in word-usage proportions before vs. after the first-instance verdict (2020/4/30)
frequency <- tokens %>% 
  mutate(event = ifelse(artDate < as.Date('2020-04-30'), "First", "Last")) %>%
  filter(nchar(word) > 1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(event, word) %>%
  group_by(event) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(event, proportion) %>% 
  gather(event, proportion, `Last`)
frequency
## # A tibble: 14,920 x 4
##    word        First event proportion
##    <chr>       <dbl> <chr>      <dbl>
##  1 晓得   NA         Last   0.0000199
##  2 没事    0.0000334 Last  NA        
##  3 没罪    0.0000334 Last  NA        
##  4 祢們    0.0000334 Last  NA        
##  5 丁世傑  0.0000334 Last  NA        
##  6 丁蟹   NA         Last   0.0000199
##  7 了不起  0.000134  Last   0.0000596
##  8 了吧    0.000267  Last   0.000139 
##  9 了沒    0.0000668 Last   0.000159 
## 10 了呢   NA         Last   0.0000199
## # ... with 14,910 more rows
# Plot the comparison chart
ggplot(frequency, aes(x = proportion, y = `First`, color = abs(`First` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position = "none") +
  labs(y = "Before the first-instance verdict", x = "After the first-instance verdict")
## Warning: Removed 11314 rows containing missing values (geom_point).
## Warning: Removed 11315 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

Reviews
## # A tibble: 27,169 x 10
##    artTitle artDate    artTime  artUrl artPoster artCat cmtPoster cmtStatus
##    <chr>    <date>     <time>   <chr>  <chr>     <chr>  <chr>     <chr>    
##  1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ gankgf    推       
##  2 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ saya2185  →        
##  3 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
##  4 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ xu3       →        
##  5 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ Leo4891   推       
##  6 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ iam0718   推       
##  7 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ WolfTeac~ →        
##  8 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ darkbrig~ →        
##  9 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ Mradult   推       
## 10 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro      Gossi~ darkbrig~ →        
## # ... with 27,159 more rows, and 2 more variables: cmtDate <dttm>,
## #   cmtContent <chr>

Compute the total term frequency of every word in the corpus

tokens <- Reviews %>%
  unnest_tokens(word, cmtContent, token = Reviews_tokenizer) %>%
  #filter(!str_detect(word, regex("[0-9a-zA-Z]"))) #%>%
  count(artDate, artUrl, word) %>%
  rename(count = n)

word_count <- tokens %>%
  select(artUrl, word, count) %>% 
  group_by(artUrl, word) %>% 
  summarise(count = sum(count)) %>%
  filter(count > 3) %>%  # drop words that appear too rarely
  arrange(desc(count))

word_count
## # A tibble: 3,286 x 3
## # Groups:   artUrl [127]
##    artUrl                                                   word   count
##    <chr>                                                    <chr>  <int>
##  1 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html rip      308
##  2 https://www.ptt.cc/bbs/Gossiping/M.1591280234.A.753.html 法官     237
##  3 https://www.ptt.cc/bbs/Gossiping/M.1591280741.A.9DA.html 法官     180
##  4 https://www.ptt.cc/bbs/Gossiping/M.1563244035.A.811.html qq       165
##  5 https://www.ptt.cc/bbs/Gossiping/M.1591283225.A.36E.html 法官     142
##  6 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html r.i.p    137
##  7 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html r.i.p.   116
##  8 https://www.ptt.cc/bbs/Gossiping/M.1591334261.A.915.html 裝病     107
##  9 https://www.ptt.cc/bbs/Gossiping/M.1591341952.A.398.html 速食店   107
## 10 https://www.ptt.cc/bbs/Gossiping/M.1562246171.A.40E.html 警察      98
## # ... with 3,276 more rows
# Keep only words that appear in more than 3 (artDate, artUrl) groups
reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 3) %>% 
  pull(word)

mask_removed <- tokens %>% 
  filter(word %in% reserved_word)

tokens_dtm <- mask_removed %>% cast_dtm(artUrl, word, count)
tokens_dtm
## <<DocumentTermMatrix (documents: 154, terms: 2739)>>
## Non-/sparse entries: 29979/391827
## Sparsity           : 93%
## Maximal term length: 6
## Weighting          : term frequency (tf)

Fit an LDA model with 4 topics

report_lda <- LDA(tokens_dtm, k = 4, control = list(seed = 1234))
# Remove overly common words before ranking the top terms per topic
tidy(report_lda, matrix = "beta") %>%
  filter(! term %in% c("警察","垃圾","台灣","法官","殺人","以後","韓粉","蔡英文","25","問題","霸氣","政府","幹話","消費","幹你娘","")) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
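
k = 4 is a modeling choice rather than something the data dictates; a hedged sanity check is to compare VEM perplexity (lower is better) across a few candidate values of k:

# Fit LDA for several k and report training-set perplexity; this is only a
# rough guide, and the interpretability of the topics matters more in practice.
ks <- c(2, 3, 4, 6, 8)
sapply(ks, function(k) {
  perplexity(LDA(tokens_dtm, k = k, control = list(seed = 1234)))
})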

For each of the four topics, examine its share of the discussion at the three peaks of the event

# Topic labels assigned after reading each topic's top terms:
# acquittal of a mentally ill killer / support for police use of weapons /
# judicial reform / condolences for the deceased
topic_name_a = c('精神病殺人無罪','支持用槍','司法改革','願死者安息')

tmResult_a <- posterior(report_lda)
doc_pro_a <- tmResult_a$topics 
dim(doc_pro_a) 
## [1] 154   4
#reviews
document_topics_a <- doc_pro_a[Reviews$artUrl,]
document_topics_df_a =data.frame(document_topics_a)
colnames(document_topics_df_a) = topic_name_a
rownames(document_topics_df_a) = NULL
news_topic_a = cbind(Reviews,document_topics_df_a)


mycolors_t <- colorRampPalette(brewer.pal(6, "Set3"))(12)


news_topic_a %>%
  filter(artDate=='2019-07-03'|artDate=='2020-04-30'|artDate=='2020-06-04')%>%
  group_by(artDate = format(artDate,'%Y%m%d')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity" ,  width = 0.8) + 
  ylab("proportion") + 
  scale_fill_manual(values=mycolors_t)+
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5))

Next, for the four topics, examine how the frequent repliers distribute across them

# Keep comments strongly assigned (probability > 0.8) to one of the four
# topic columns (columns 11-14 of news_topic_a), excluding boo (噓) comments.
alytopic_a <- news_topic_a[news_topic_a[11] > 0.8 | news_topic_a[12] > 0.8 | news_topic_a[13] > 0.8 | news_topic_a[14] > 0.8,] %>%
       filter(cmtStatus != "噓")

# Label each remaining comment with its dominant topic (1-4).
alytopic_re <- alytopic_a %>%
  mutate(topicna = ifelse(as.double(unlist(alytopic_a[11])) > 0.8, "1",
                    ifelse(as.double(unlist(alytopic_a[12])) > 0.8, "2",
                    ifelse(as.double(unlist(alytopic_a[13])) > 0.8, "3",
                    ifelse(as.double(unlist(alytopic_a[14])) > 0.8, "4", "0"))))) %>%
  select(artUrl, cmtPoster, topicna)


alytopic_graf <- alytopic_re %>%
  group_by(cmtPoster, artUrl) %>% 
  filter(n() > 5) %>%   # keep users with more than 5 comments in the same article
  ungroup() %>%
  select(topicna, cmtPoster) %>% 
  unique()

g_alytopicAll <- graph_from_data_frame(d = alytopic_graf, directed = F)
ceb2 <- cluster_fast_greedy(g_alytopicAll)

# Topic nodes ("1"-"4") are drawn as larger, labeled squares; users as circles.
V(g_alytopicAll)$shape <- ifelse(V(g_alytopicAll)$name %in% c("1","2","3","4"), "square", "circle")
V(g_alytopicAll)$label <- ifelse(V(g_alytopicAll)$name %in% c("1","2","3","4"), V(g_alytopicAll)$name, "")
V(g_alytopicAll)$size <- ifelse(V(g_alytopicAll)$name %in% c("1","2","3","4"), 12, 8)


plot(ceb2,g_alytopicAll)
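
The detected communities can also be inspected numerically (a sketch; the results depend on the data):

# Number of communities found, and how many members each one has.
length(ceb2)
sizes(ceb2)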

Use the LIWC dictionary to decide whether each word in the corpus is positive or negative

# Positive-word dictionary (txt, comma-separated)
P <- read_file("./positive.txt")

# Negative-word dictionary (txt)
N <- read_file("./negative.txt")
# Split each string on commas;
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build data frames with two columns, word and sentiment,
# where the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")

LIWC = rbind(P, N)
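
A couple of sanity checks on the combined dictionary (the counts depend on the dictionary files used):

# LIWC should have two columns (word, sentiment); check the class balance.
str(LIWC)
table(LIWC$sentiment)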

Join with the LIWC sentiment dictionary: corpus words found in LIWC are labeled positive or negative

word_count %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 512 x 4
## # Groups:   artUrl [77]
##    artUrl                                                  word  count sentiment
##    <chr>                                                   <chr> <int> <fct>    
##  1 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.ht~ 垃圾     78 negative 
##  2 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.ht~ 難過     60 negative 
##  3 https://www.ptt.cc/bbs/Gossiping/M.1591327929.A.29C.ht~ 心痛     60 negative 
##  4 https://www.ptt.cc/bbs/Gossiping/M.1591280741.A.9DA.ht~ 難過     57 negative 
##  5 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.ht~ 死刑     55 negative 
##  6 https://www.ptt.cc/bbs/Gossiping/M.1562206442.A.05D.ht~ 難過     47 negative 
##  7 https://www.ptt.cc/bbs/Gossiping/M.1591280741.A.9DA.ht~ 垃圾     46 negative 
##  8 https://www.ptt.cc/bbs/Gossiping/M.1591283225.A.36E.ht~ 垃圾     43 negative 
##  9 https://www.ptt.cc/bbs/Gossiping/M.1591283225.A.36E.ht~ 難過     41 negative 
## 10 https://www.ptt.cc/bbs/Gossiping/M.1562506779.A.39D.ht~ 大膽     37 positive 
## # ... with 502 more rows

Count the daily totals of positive and negative words

tokens %>% inner_join(LIWC) %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) 
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

tokens %>%
  filter(artDate == as.Date('2019/07/03')) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

Positive words are clearly far less frequent than negative words.

Sentiment word cloud

sent_cloud <- word_count %>%
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(count=sum(count)) %>%
  acast(word ~ sentiment, value.var = "count", fill = 0) %>%
  comparison.cloud(colors = c("gray80", "gray20"),
                   max.words = 100)
## Joining, by = "word"

Word Correlation

# Count word pairs that co-occur within the same article
# (the feature column should be artUrl, the document identifier)

word_pairs <- word_count %>%
  pairwise_count(word, artUrl, sort = TRUE)

word_pairs
## # A tibble: 41,880 x 4
## # Groups:   artUrl [88]
##    artUrl                                                   item1  item2      n
##    <chr>                                                    <chr>  <chr>  <dbl>
##  1 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 殺人   老人       1
##  2 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 老人   殺人       1
##  3 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 殺警   判死       1
##  4 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 判死   殺警       1
##  5 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 補票   司法       1
##  6 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 司法   補票       1
##  7 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 媽的   年輕人     1
##  8 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 年輕人 媽的       1
##  9 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 希望   兇手       1
## 10 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 兇手   希望       1
## # ... with 41,870 more rows
ptt_word_cors <- word_count %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

ptt_word_cors_filter <- ptt_word_cors %>%
  # pick a few focal words and find each one's most correlated terms
  filter(item1 %in% c("警察", "法官", "醫生", "家屬", "精神病")) %>%
  group_by(item1) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, ncol = 2, scales = "free") +
  coord_flip() +
  # set a Chinese-capable font so CJK labels render correctly
  theme(text = element_text(family = "Heiti TC Light"))
## Selecting by correlation
ptt_word_cors_filter

# Show pairs with correlation above 0.4
set.seed(2020)

ptt_word_cors %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # Chinese-capable font so labels render correctly
  theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

Conclusion: