• Data source: posts and replies collected from the PTT Gossiping, HatePolitics, and nCoV2019 boards via a text-collection platform
  • Datasets: Corrected_articleMetaData.csv, Corrected_articleReviews.csv
  • Keywords: 蔡英文, 陳時中, 柯文哲, 侯友宜
  • Time range: 2021-05-01 ~ 2021-06-05

Part 1. Background

After roughly a month under the nationwide Level 3 alert, online word-of-mouth rankings of political figures prompted us to examine, in the context of the pandemic, both the volume of discussion around these politicians and its positive or negative sentiment. We use PTT's Gossiping, HatePolitics, and nCoV2019 boards as the target boards for the analysis.

System settings

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output

Install the required packages

packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels",
             "ggplot2", "stringr", "reshape2", "wordcloud2", "purrr", "scales",
             "text2vec", "udpipe", "servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load the libraries

library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(reshape2)
library(wordcloud2)
library(purrr)
library(scales)

Part 2. Social-network, word-cloud, and sentiment analysis of political figures during the pandemic

Load the posts and user replies

hou_posts <- read_csv("../data/PTT_侯友宜_articleMetaData.csv") # 353 posts
hou_reviews <- read_csv("../data/PTT_侯友宜_articleReviews.csv") # 40,073 replies

chen_posts <- read_csv("../data/c_articleMetaData.csv") # 1,169 posts
chen_reviews <- read_csv("../data/c_articleReviews.csv") # 150,864 replies

kp_posts <- read_csv("../data/kp_articleMetaData.csv") # 1,230 posts
kp_reviews <- read_csv("../data/kp_articleReviews.csv") # 159,201 replies

tsai_posts <- read_csv("../data/tsai_articleMetaData.csv") # 703 posts
tsai_reviews <- read_csv("../data/tsai_articleReviews.csv") # 93,729 replies

Daily post-count line charts

侯友宜

hou_posts %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
    geom_line(color = "#69b3a2") +
    # vertical line marking the key date
    geom_vline(aes(xintercept = as.numeric(artDate[which(artDate == as.Date('2021-05-22'))[1]])), colour = "red") +
    geom_area(fill = "#69b3a2", size = 1, alpha = .7) + 
    guides(fill = guide_legend(reverse = TRUE))

    # geom_point(size = 1, shape = 22, colour = "darkred", fill = "pink")

Discussion spiked on 5/22, likely because the central government began announcing "retroactively corrected" (校正回歸) case numbers that day while the outbreak in New Taipei spread rapidly; the speed caught everyone off guard, and the corrected backlog announced that day even exceeded the day's new confirmed cases.

陳時中

chen_posts %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
    geom_line(color = "#69b3a2") +
    # vertical line marking the key date
    geom_vline(aes(xintercept = as.numeric(artDate[which(artDate == as.Date('2021-05-28'))[1]])), colour = "red") +
    geom_area(fill = "#69b3a2", size = 1, alpha = .7) + 
    guides(fill = guide_legend(reverse = TRUE))

The chart shows a discussion peak on 5/28; news reports indicate the main driver was the arrival of the first batch of Moderna vaccines in Taiwan. President 蔡英文 also publicly credited commander 陳時中 for the shipment, making her support for him explicit.

柯文哲

kp_posts %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
    geom_line(color = "#69b3a2") +
    # vertical line marking the key date
    geom_vline(aes(xintercept = as.numeric(artDate[which(artDate == as.Date('2021-05-20'))[1]])), colour = "red") +
    geom_area(fill = "#69b3a2", size = 1, alpha = .7) + 
    guides(fill = guide_legend(reverse = TRUE))

Discussion spiked on 5/20, likely because the central government's retroactive case corrections began shortly afterwards while the outbreak in Taipei spread rapidly; the speed caught everyone off guard, and the corrected backlog announced in a single day even exceeded that day's new confirmed cases.

蔡英文

tsai_posts %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
    geom_line(color = "#69b3a2") +
    # vertical line marking the key date
    geom_vline(aes(xintercept = as.numeric(artDate[which(artDate == as.Date('2021-05-31'))[1]])), colour = "darkred") +
    geom_area(fill = "#69b3a2", size = 1, alpha = .7) + 
    guides(fill = guide_legend(reverse = TRUE))

Discussion spiked on 5/31, likely because President 蔡英文 held a press conference addressing epidemic control, vaccine procurement and administration, and domestic vaccine development. The Presidential Office also stated its position on alleged stock speculation by interested parties and urged accusers to produce evidence.

LDA topic classification

Tokenization

侯友宜 tokens

Sentence segmentation

# sentence segmentation (replace runs of blank lines with "。")
hou_meta <- hou_posts %>%
              mutate(sentence=gsub("[\n]{2,}", "。", sentence))

# split on full- and half-width exclamation marks, question marks,
# semicolons, and the full-width period
hou_sentences <- strsplit(hou_meta$sentence,"[。!;?!?;]")

# pair each sentence with the URL of its source article
# and collect the result into a data frame
hou_sentences <- data.frame(
                        artUrl = rep(hou_meta$artUrl, sapply(hou_sentences, length)),
                        sentence = unlist(hou_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
                       # drop sentences that are only whitespace (\t, \n, spaces)

hou_sentences$sentence <- as.character(hou_sentences$sentence)

Word segmentation

# word segmentation
# load hou_lexicon (a user dictionary of terms that should stay intact)
hou_lexicon <- scan(file = "../dict/hou_lexicon.txt", what=character(),sep='\n',
                   encoding='utf-8',fileEncoding='utf-8')
# load stop words
stop_words <- scan(file = "../dict/stop_words.txt", what=character(),sep='\n',
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "../dict/stop_words.txt", what = character(), sep =
## "\n", : invalid input found on input connection '../dict/stop_words.txt'
# initialize a jieba segmentation engine with default parameters
jieba_tokenizer = worker()

# register the user dictionary with the segmenter
new_user_word(jieba_tokenizer, c(hou_lexicon))
## [1] TRUE
# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# tokenize the sentences with the segmenter initialized above
tokens <- hou_sentences %>%
    mutate(sentence = gsub("[[:punct:]]", "",sentence)) %>%
    mutate(sentence = gsub("[0-9a-zA-Z]", "",sentence)) %>%
    unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  count(artUrl, word) %>% # term frequency per article
  rename(count=n)
# tokens
# save.image(file = "../data/token_result.rdata")

The tokenization result can be cached so it does not have to be recomputed on later runs.

load("../data/token_result.rdata")

Clean the tokenization results

◎ Keep only words that occur more than three times in the corpus
◎ Once the counts are arranged as artUrl, word, n they can be cast to a DTM
P.S. group_by() + count() drops the per-article token structure, so the retained vocabulary is stored separately in reserved_word

freq = 3
# keep words whose corpus frequency exceeds freq
reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > freq) %>% 
  pull(word) # pull() keeps only the words; unlist() would also mix the counts into the vector

hou_removed <- tokens %>% 
  filter(word %in% reserved_word)

# hou_dtm: nrow = number of articles; ncol = number of distinct terms
hou_dtm <- hou_removed %>% cast_dtm(artUrl, word, count) 

陳時中 tokens

Sentence segmentation

# sentence segmentation (replace runs of blank lines with "。")
chen_meta <- chen_posts %>%
              mutate(sentence=gsub("[\n]{2,}", "。", sentence))

# split on full- and half-width exclamation marks, question marks,
# semicolons, and the full-width period
chen_sentences <- strsplit(chen_meta$sentence,"[。!;?!?;]")

# pair each sentence with the URL of its source article
# and collect the result into a data frame
chen_sentences <- data.frame(
                        artUrl = rep(chen_meta$artUrl, sapply(chen_sentences, length)),
                        sentence = unlist(chen_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
                       # drop sentences that are only whitespace

chen_sentences$sentence <- as.character(chen_sentences$sentence)

Word segmentation

# The word-segmentation pipeline is identical to the 侯友宜 section above
# (substituting chen_sentences; the cached run reused hou_lexicon.txt as the
# user dictionary) and was saved with save.image(file = "../data/token_result_chen.rdata").
load("../data/token_result_chen.rdata")
freq = 3
# keep words whose corpus frequency exceeds freq
reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > freq) %>% 
  pull(word)

chen_removed <- tokens %>% 
  filter(word %in% reserved_word)

# chen_dtm: nrow = number of articles; ncol = number of distinct terms
chen_dtm <- chen_removed %>% cast_dtm(artUrl, word, count) 

柯文哲 tokens

Sentence segmentation

# sentence segmentation (replace runs of blank lines with "。")
kp_meta <- kp_posts %>%
              mutate(sentence=gsub("[\n]{2,}", "。", sentence))

# split on full- and half-width exclamation marks, question marks,
# semicolons, and the full-width period
kp_sentences <- strsplit(kp_meta$sentence,"[。!;?!?;]")

# pair each sentence with the URL of its source article
# and collect the result into a data frame
kp_sentences <- data.frame(
                        artUrl = rep(kp_meta$artUrl, sapply(kp_sentences, length)),
                        sentence = unlist(kp_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
                       # drop sentences that are only whitespace

kp_sentences$sentence <- as.character(kp_sentences$sentence)

Word segmentation

# The word-segmentation pipeline is identical to the 侯友宜 section above
# (substituting kp_sentences; the cached run reused hou_lexicon.txt as the
# user dictionary) and was saved with save.image(file = "../data/kp_token_result.rdata").

The tokenization result can be cached so it does not have to be recomputed.

load("../data/kp_token_result.rdata")
freq = 3
# keep words whose corpus frequency exceeds freq
kp_reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > freq) %>% 
  pull(word)

kp_removed <- tokens %>% 
  filter(word %in% kp_reserved_word)

# kp_dtm: nrow = number of articles; ncol = number of distinct terms
kp_dtm <- kp_removed %>% cast_dtm(artUrl, word, count) 

蔡英文 tokens

Sentence segmentation

# sentence segmentation (replace runs of blank lines with "。")
tsai_meta <- tsai_posts %>%
              mutate(sentence=gsub("[\n]{2,}", "。", sentence))

# split on full- and half-width exclamation marks, question marks,
# semicolons, and the full-width period
tsai_sentences <- strsplit(tsai_meta$sentence,"[。!;?!?;]")

# pair each sentence with the URL of its source article
# and collect the result into a data frame
tsai_sentences <- data.frame(
                        artUrl = rep(tsai_meta$artUrl, sapply(tsai_sentences, length)),
                        sentence = unlist(tsai_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
                       # drop sentences that are only whitespace

tsai_sentences$sentence <- as.character(tsai_sentences$sentence)

Word segmentation

# The word-segmentation pipeline is identical to the 侯友宜 section above
# (substituting tsai_sentences; the cached run reused hou_lexicon.txt as the
# user dictionary) and was saved with save.image(file = "../data/tsai_token_result.rdata").
load("../data/tsai_token_result.rdata")
freq = 3
# keep words whose corpus frequency exceeds freq
tsai_reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > freq) %>% 
  pull(word)

tsai_removed <- tokens %>% 
  filter(word %in% tsai_reserved_word)

# tsai_dtm: nrow = number of articles; ncol = number of distinct terms
tsai_dtm <- tsai_removed %>% cast_dtm(artUrl, word, count) 

LDA topic analysis

侯友宜 LDA

# ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda <- LDA(hou_dtm, k = topic, control = list(seed = 2021))
#   ldas =c(ldas,lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(ldas,file = "../data/ldas_result.rdata") # save the fitted models
# }

Load the LDA results

load("../data/ldas_result.rdata")
topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

侯友宜 LDAvis

library(text2vec)
## Warning: package 'text2vec' was built under R version 4.0.5
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
## The following object is masked from 'package:igraph':
## 
##     normalize
# (sentence-level tokenization pipeline as in the 侯友宜 token section, omitted;
#  for LDAvis the articles are re-tokenized at the whole-document level below)
library(udpipe)
## Warning: package 'udpipe' was built under R version 4.0.5
tokens <- hou_posts %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  # drop alphanumeric tokens, except "AZ" (the vaccine)
  filter(!str_detect(word, regex("[0-9a-zA-Z]")) | str_detect(word, regex("[Aa][Zz]")))
dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 353 292
set.seed(2019)

topic_n = 4

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [17:00:20.774] early stopping at 100 iteration 
## INFO  [17:00:20.900] early stopping at 50 iteration
lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 words per topic
##       [,1]     [,2]   [,3]       [,4]      
##  [1,] "防疫"   "疫情" "完整"     "中央"    
##  [2,] "侯友宜" "警戒" "新聞"     "公布"    
##  [3,] "陳時中" "熱區" "民調"     "個案"    
##  [4,] "旅館"   "三級" "醫護"     "指揮中心"
##  [5,] "方艙"   "管制" "施打"     "確診"    
##  [6,] "封城"   "口罩" "疫苗"     "疫苗"    
##  [7,] "醫院"   "做好" "市長"     "記者會"  
##  [8,] "民進黨" "市場" "新聞標題" "校正回歸"
##  [9,] "篩檢"   "場所" "政治"     "希望"    
## [10,] "部署"   "活動" "調查"     "時間"
lda_model$plot()
## Loading required namespace: servr

Fit LDA on the DTM prepared above

# LDA with k = 4 topics
hou_lda <- LDA(hou_dtm, k = 4, control = list(seed = 123))

• tidy(hou_lda, matrix = "beta")  # topic-term beta (phi) values
• tidy(hou_lda, matrix = "gamma") # document-topic gamma (theta) values

topics_words <- tidy(hou_lda, matrix = "beta") # note: use "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
##    topic term      phi
##    <int> <chr>   <dbl>
##  1     2 新北   0.0308
##  2     4 侯友宜 0.0293
##  3     1 侯友宜 0.0285
##  4     2 侯友宜 0.0267
##  5     2 中央   0.0255
##  6     2 疫苗   0.0250
##  7     4 市長   0.0220
##  8     3 侯友宜 0.0209
##  9     2 防疫   0.0203
## 10     3 防疫   0.0196

陳時中 LDA

# chen_ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
#   start_time <- Sys.time()
#   chen_lda <- LDA(chen_dtm, k = topic, control = list(seed = 2021))
#   chen_ldas =c(chen_ldas,chen_lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(chen_ldas,file = "../data/chen_ldas_result.rdata")
# }

Load the LDA results

load("../data/chen_ldas_result.rdata")
topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(chen_ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

LDAvis

# word segmentation
# load the user dictionary (reusing hou_lexicon.txt)
chen_lexicon <- scan(file = "../dict/hou_lexicon.txt", what=character(),sep='\n',
                   encoding='utf-8',fileEncoding='utf-8')
# load stop words
stop_words <- scan(file = "../dict/stop_words.txt", what=character(),sep='\n',
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "../dict/stop_words.txt", what = character(), sep =
## "\n", : invalid input found on input connection '../dict/stop_words.txt'
# initialize a jieba segmentation engine with default parameters
jieba_tokenizer = worker()

# register the user dictionary with the segmenter
new_user_word(jieba_tokenizer, c(chen_lexicon))
## [1] TRUE
# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# tokenize the sentences with the segmenter initialized above
chen_tokens <- chen_sentences %>%
    mutate(sentence = gsub("[[:punct:]]", "",sentence)) %>%
    mutate(sentence = gsub("[0-9a-zA-Z]", "",sentence)) %>%
    unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  count(artUrl, word) %>% # term frequency per article
  rename(count=n)
library(udpipe)
chen_tokens <- chen_posts %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  # drop alphanumeric tokens, except "AZ" (the vaccine)
  filter(!str_detect(word, regex("[0-9a-zA-Z]")) | str_detect(word, regex("[Aa][Zz]")))
dtf <- document_term_frequencies(chen_tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 1169  898
set.seed(2019)

topic_n = 3

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [17:01:19.148] early stopping at 130 iteration 
## INFO  [17:01:19.275] early stopping at 20 iteration
lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 words per topic
##       [,1]   [,2]     [,3]      
##  [1,] "疫苗" "陳時中" "確診"    
##  [2,] "採購" "台灣"   "個案"    
##  [3,] "國產" "防疫"   "本土"    
##  [4,] "萬劑" "柯文哲" "指揮中心"
##  [5,] "試驗" "疫情"   "陳時中"  
##  [6,] "施打" "中央"   "感染"    
##  [7,] "接種" "市長"   "檢疫"    
##  [8,] "az"   "部長"   "疫情"    
##  [9,] "美國" "不是"   "相關"    
## [10,] "原廠" "不要"   "華航"
lda_model$plot()
## Loading required namespace: servr

Fit LDA on the DTM prepared above

# LDA with k = 3 topics
chen_lda <- LDA(chen_dtm, k = 3, control = list(seed = 123))

• tidy(chen_lda, matrix = "beta")  # topic-term beta (phi) values
• tidy(chen_lda, matrix = "gamma") # document-topic gamma (theta) values

topics_words <- tidy(chen_lda, matrix = "beta") # note: use "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
##    topic term      phi
##    <int> <chr>   <dbl>
##  1     2 新北   0.0302
##  2     1 侯友宜 0.0297
##  3     2 侯友宜 0.0267
##  4     2 中央   0.0257
##  5     1 疫苗   0.0226
##  6     2 確診   0.0224
##  7     3 侯友宜 0.0215
##  8     3 防疫   0.0205
##  9     2 防疫   0.0192
## 10     1 市長   0.0174

柯文哲 LDA

# kp_ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
#   start_time <- Sys.time()
#   kp_lda <- LDA(kp_dtm, k = topic, control = list(seed = 2021))
#   kp_ldas =c(kp_ldas,kp_lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(kp_ldas,file = "../data/kp_ldas_result.rdata")
# }

Load the LDA results

load("../data/kp_ldas_result.rdata")
topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(kp_ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

LDAvis

# word segmentation
# load the user dictionary (reusing hou_lexicon.txt)
kp_lexicon <- scan(file = "../dict/hou_lexicon.txt", what=character(),sep='\n',
                   encoding='utf-8',fileEncoding='utf-8')
# load stop words
stop_words <- scan(file = "../dict/stop_words.txt", what=character(),sep='\n',
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "../dict/stop_words.txt", what = character(), sep =
## "\n", : invalid input found on input connection '../dict/stop_words.txt'
# initialize a jieba segmentation engine with default parameters
jieba_tokenizer = worker()

# register the user dictionary with the segmenter
new_user_word(jieba_tokenizer, c(kp_lexicon))
## [1] TRUE
# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# tokenize the sentences with the segmenter initialized above
tokens <- kp_sentences %>%
    mutate(sentence = gsub("[[:punct:]]", "",sentence)) %>%
    mutate(sentence = gsub("[0-9a-zA-Z]", "",sentence)) %>%
    unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  count(artUrl, word) %>% # term frequency per article
  rename(count=n)
library(udpipe)
kp_tokens <- kp_posts %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  # drop alphanumeric tokens, except "AZ" (the vaccine)
  filter(!str_detect(word, regex("[0-9a-zA-Z]")) | str_detect(word, regex("[Aa][Zz]")))
kp_dtf <- document_term_frequencies(kp_tokens, document = "artUrl", term = "word")
kp_dtm <- document_term_matrix(x = kp_dtf)
kp_dtm_clean <- dtm_remove_lowfreq(kp_dtm, minfreq = 30)
dim(kp_dtm_clean)
## [1] 1230  810
set.seed(2019)

topic_n = 3

kp_lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = kp_lda_model$fit_transform(kp_dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [17:02:17.946] early stopping at 150 iteration 
## INFO  [17:02:18.109] early stopping at 30 iteration
kp_lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 words per topic
##       [,1]     [,2]     [,3]    
##  [1,] "柯文哲" "疫苗"   "確診"  
##  [2,] "台灣"   "新聞"   "防疫"  
##  [3,] "不是"   "完整"   "疫情"  
##  [4,] "民進黨" "柯文哲" "醫院"  
##  [5,] "政治"   "台北"   "感染"  
##  [6,] "美國"   "市長"   "旅館"  
##  [7,] "出來"   "記者"   "柯文哲"
##  [8,] "不會"   "媒體"   "台北市"
##  [9,] "覺得"   "萬劑"   "本土"  
## [10,] "問題"   "施打"   "醫療"
kp_lda_model$plot()
## Loading required namespace: servr

Fit LDA on the DTM prepared above

# LDA with k = 3 topics
kp_lda <- LDA(kp_dtm, k = 3, control = list(seed = 123))

• tidy(kp_lda, matrix = "beta")  # topic-term beta (phi) values
• tidy(kp_lda, matrix = "gamma") # document-topic gamma (theta) values

kp_topics_words <- tidy(kp_lda, matrix = "beta") # note: use "beta" in tidy() to extract the phi matrix
colnames(kp_topics_words) <- c("topic", "term", "phi")
kp_topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
##    topic term       phi
##    <int> <chr>    <dbl>
##  1     1 疫苗   0.0401 
##  2     2 柯文哲 0.0292 
##  3     1 柯文哲 0.0267 
##  4     3 柯文哲 0.0231 
##  5     3 防疫   0.0132 
##  6     3 確診   0.0122 
##  7     1 台灣   0.0115 
##  8     3 疫情   0.0114 
##  9     2 市長   0.0101 
## 10     1 中央   0.00985

蔡英文 LDA

# tsai_ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
#   start_time <- Sys.time()
#   tsai_lda <- LDA(tsai_dtm, k = topic, control = list(seed = 2021))
#   tsai_ldas =c(tsai_ldas,tsai_lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(tsai_ldas,file = "../data/tsai_ldas_result.rdata")
# }

Load the LDA results

load("../data/tsai_ldas_result.rdata")
topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(tsai_ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

LDAvis

# word segmentation
# load the user dictionary (reusing hou_lexicon.txt)
tsai_lexicon <- scan(file = "../dict/hou_lexicon.txt", what=character(),sep='\n',
                   encoding='utf-8',fileEncoding='utf-8')
# load stop words
stop_words <- scan(file = "../dict/stop_words.txt", what=character(),sep='\n',
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "../dict/stop_words.txt", what = character(), sep =
## "\n", : invalid input found on input connection '../dict/stop_words.txt'
# initialize a jieba segmentation engine with default parameters
jieba_tokenizer = worker()

# register the user dictionary with the segmenter
new_user_word(jieba_tokenizer, c(tsai_lexicon))
## [1] TRUE
# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# tokenize the sentences with the segmenter initialized above
tokens <- tsai_sentences %>%
    mutate(sentence = gsub("[[:punct:]]", "",sentence)) %>%
    mutate(sentence = gsub("[0-9a-zA-Z]", "",sentence)) %>%
    unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  count(artUrl, word) %>% # term frequency per article
  rename(count=n)
library(udpipe)
tsai_tokens <- tsai_posts %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  # drop alphanumeric tokens, except "AZ" (the vaccine)
  filter(!str_detect(word, regex("[0-9a-zA-Z]")) | str_detect(word, regex("[Aa][Zz]")))
tsai_dtf <- document_term_frequencies(tsai_tokens, document = "artUrl", term = "word")
tsai_dtm <- document_term_matrix(x = tsai_dtf)
tsai_dtm_clean <- dtm_remove_lowfreq(tsai_dtm, minfreq = 30)
dim(tsai_dtm_clean)
## [1] 703 601
set.seed(2019)

topic_n = 3

tsai_lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = tsai_lda_model$fit_transform(tsai_dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [17:03:07.934] early stopping at 130 iteration 
## INFO  [17:03:08.134] early stopping at 60 iteration
tsai_lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 words per topic
##       [,1]       [,2]     [,3]    
##  [1,] "防疫"     "民進黨" "疫苗"  
##  [2,] "疫情"     "蔡英文" "台灣"  
##  [3,] "蔡英文"   "經濟"   "蔡英文"
##  [4,] "完整"     "民主"   "國產"  
##  [5,] "新聞"     "總統"   "中國"  
##  [6,] "博士"     "民眾"   "政府"  
##  [7,] "訊息"     "執政"   "萬劑"  
##  [8,] "指揮中心" "政府"   "國際"  
##  [9,] "確診"     "政策"   "美國"  
## [10,] "備註"     "公投"   "採購"
tsai_lda_model$plot()
## Loading required namespace: servr

Fit LDA on the DTM prepared above

# LDA with k = 5 topics
tsai_lda <- LDA(tsai_dtm, k = 5, control = list(seed = 123))

• tidy(tsai_lda, matrix = "beta")  # topic-term beta (phi) values
• tidy(tsai_lda, matrix = "gamma") # document-topic gamma (theta) values

tsai_topics_words <- tidy(tsai_lda, matrix = "beta") # note: use "beta" in tidy() to extract the phi matrix
colnames(tsai_topics_words) <- c("topic", "term", "phi")
tsai_topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
##    topic term      phi
##    <int> <chr>   <dbl>
##  1     5 蔡英文 0.0366
##  2     4 疫苗   0.0346
##  3     2 疫苗   0.0294
##  4     1 疫苗   0.0238
##  5     4 蔡英文 0.0238
##  6     1 蔡英文 0.0207
##  7     3 蔡英文 0.0201
##  8     2 蔡英文 0.0197
##  9     4 防疫   0.0194
## 10     4 疫情   0.0180

Extract representative terms for each figure

Representative terms for 侯友宜

removed_word = c("侯友宜","市長","新北市","完整")

# common terms in each topic
tidy(hou_lda, matrix = "beta") %>% # topic-term beta values
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>% # top 10 terms by beta
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text=element_text(family="黑體-繁 中黑", size=14))+
  scale_x_reordered()

We can summarize:
topic 1 = "侯友宜 pressing the central government to allocate vaccines to New Taipei"
topic 2 = "comparisons with other politicians and mockery of their pandemic responses"
topic 3 = "quarantine-hotel capacity and the state of the field hospitals"
topic 4 = "lockdown and a possible upgrade to a Level 4 alert"

Representative topic per article

Each article is assigned the topic with its largest gamma value.

# use matrix = "gamma" in tidy() to get the theta matrix
hou_topics <- tidy(hou_lda, matrix="gamma") %>% # document-topic gamma
                  group_by(document) %>%
                  top_n(1, wt=gamma)
hou_topics
## # A tibble: 353 x 3
## # Groups:   document [353]
##    document                                                 topic gamma
##    <chr>                                                    <int> <dbl>
##  1 https://www.ptt.cc/bbs/Gossiping/M.1619936049.A.17B.html     1 0.775
##  2 https://www.ptt.cc/bbs/Gossiping/M.1619937748.A.74A.html     1 0.846
##  3 https://www.ptt.cc/bbs/Gossiping/M.1619938634.A.C28.html     1 0.994
##  4 https://www.ptt.cc/bbs/Gossiping/M.1619951400.A.666.html     1 0.978
##  5 https://www.ptt.cc/bbs/Gossiping/M.1620630471.A.57E.html     1 0.503
##  6 https://www.ptt.cc/bbs/Gossiping/M.1620722098.A.8F7.html     1 0.988
##  7 https://www.ptt.cc/bbs/Gossiping/M.1620825846.A.985.html     1 0.987
##  8 https://www.ptt.cc/bbs/Gossiping/M.1620990728.A.A89.html     1 0.603
##  9 https://www.ptt.cc/bbs/Gossiping/M.1620993004.A.3AA.html     1 0.877
## 10 https://www.ptt.cc/bbs/Gossiping/M.1621062643.A.A99.html     1 0.547
## # ... with 343 more rows

Exploring the articles

posts_topic <- merge(x = hou_posts, y = hou_topics, by.x = "artUrl", by.y="document")

# inspect what each topic is about
set.seed(123)
posts_topic %>% # topic 1
  filter(topic==1) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(10)
##                                                    artTitle
## 1  Re:[新聞]單日激增106例!侯友宜曝:中央不同意增設方艙醫院
## 2                [討論]侯友宜說政治人物不應該介入警政人事?
## 3                                        [黑特]侯友宜的能力
## 4           Re:[新聞]新北首座「大型疫苗站」 侯友宜:一天可
## 5                [新聞]憂「時間落差」成防疫破口!侯友宜喊:
## 6                         Re:[討論]看來侯友宜2024會先閃過了
## 7                    [討論]認真說覺得侯友宜的防疫做的蠻爛的
## 8                                      [討論]侯友宜是草包嗎
## 9                                    [討論]黃33該學習侯友宜
## 10                                 [討論]侯友宜有金身護體?
##                                                         artUrl
## 1     https://www.ptt.cc/bbs/Gossiping/M.1621323861.A.C9B.html
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1619862653.A.AAB.html
## 3  https://www.ptt.cc/bbs/HatePolitics/M.1621823505.A.C13.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1622436170.A.97F.html
## 5      https://www.ptt.cc/bbs/nCoV2019/M.1621669835.A.E36.html
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1622806351.A.C76.html
## 7  https://www.ptt.cc/bbs/HatePolitics/M.1621559019.A.844.html
## 8  https://www.ptt.cc/bbs/HatePolitics/M.1621562409.A.981.html
## 9  https://www.ptt.cc/bbs/HatePolitics/M.1622614288.A.1B0.html
## 10 https://www.ptt.cc/bbs/HatePolitics/M.1621576919.A.B21.html
posts_topic %>% # topic 2
  filter(topic==2) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(10)
##                                                  artTitle
## 1            [新聞]民進黨酸「裝忙」 侯友宜深夜發文嘆:別
## 2             [新聞]「1.8萬劑疫苗打完了!」侯友宜:新北警
## 3           Re:[新聞]侯友宜喊疫苗別刁難總統府駁斥:無此問
## 4       [新聞]遭質疑防疫旅館6都倒數侯友宜斥:以訛<U+B631>
## 5                  [討論]侯友宜知道自己是防疫六都倒數嗎?
## 6              [新聞]張小燕發文力挺侯友宜:疫苗是救命關鍵
## 7  [新聞]是否公布防疫旅館名單?侯友宜:重點是防疫有沒有做好
## 8              [新聞]侯友宜:防疫旅館絕對夠盼假消息莫以訛
## 9                              Re:[討論]侯友宜打臉柯韓粉?
## 10             [新聞]放話5天內打完卻還剩2萬劑疫苗侯友宜:
##                                                         artUrl
## 1     https://www.ptt.cc/bbs/Gossiping/M.1621477470.A.B44.html
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1622575258.A.C49.html
## 3     https://www.ptt.cc/bbs/Gossiping/M.1622395747.A.F29.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1621428294.A.E3E.html
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1621651163.A.07D.html
## 6      https://www.ptt.cc/bbs/nCoV2019/M.1622435051.A.1FA.html
## 7  https://www.ptt.cc/bbs/HatePolitics/M.1620225903.A.3FD.html
## 8      https://www.ptt.cc/bbs/nCoV2019/M.1621521400.A.9F2.html
## 9  https://www.ptt.cc/bbs/HatePolitics/M.1621668217.A.57A.html
## 10 https://www.ptt.cc/bbs/HatePolitics/M.1622731924.A.39A.html
posts_topic %>% # topic 3
  filter(topic==3) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(10)
##                                           artTitle
## 1                 [新聞]侯友宜:其他區確診不亞蘆洲
## 2     [新聞]羅致政:請問侯友宜是在「瞎忙還是裝忙?
## 3      [新聞]本土+16 侯友宜:八大行業、公共場所全
## 4       [新聞]侯友宜「封城說」新北綠議員批引起恐慌
## 5     [新聞]快訊/雙北一起封城? 侯友宜:中央要下
## 6  Re:[新聞]在路上不戴口罩?侯友宜撂重話:一律拍照
## 7       [新聞]侯友宜喊封城、準四級部署、蓋方艙醫院
## 8       [新聞]中和確診數多侯友宜:將增設社區篩檢站
## 9     [新聞]感染源不明!侯友宜強烈建議中央:針對高
## 10        [新聞]疫情升溫侯友宜:朝升準四級警戒整備
##                                                         artUrl
## 1     https://www.ptt.cc/bbs/Gossiping/M.1621077971.A.B59.html
## 2     https://www.ptt.cc/bbs/Gossiping/M.1621479569.A.ED1.html
## 3     https://www.ptt.cc/bbs/Gossiping/M.1620805537.A.0A2.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1620827445.A.8A7.html
## 5     https://www.ptt.cc/bbs/Gossiping/M.1622260208.A.67B.html
## 6     https://www.ptt.cc/bbs/Gossiping/M.1621244276.A.298.html
## 7     https://www.ptt.cc/bbs/Gossiping/M.1621406990.A.9CB.html
## 8  https://www.ptt.cc/bbs/HatePolitics/M.1621330474.A.B2A.html
## 9  https://www.ptt.cc/bbs/HatePolitics/M.1620716593.A.9DC.html
## 10     https://www.ptt.cc/bbs/nCoV2019/M.1621242375.A.2BE.html
posts_topic %>% # topic 4
  filter(topic==4) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(10)
##                                          artTitle
## 1      [新聞]淡水高爾夫球場天天預約客滿侯友宜:太
## 2             Re:[新聞]鄭文燦人氣高侯友宜攻守兼具
## 3    [新聞]嚇!企業突被告知做「封城準備」侯友宜:
## 4        [討論]大家猜看看今天侯友宜會公佈足跡嗎?
## 5      [新聞]全國縣市長施政滿意度公布侯友宜、鄭文
## 6            [新聞]侯友宜指「中央下令不公布足跡」
## 7   [新聞]快訊/柯文哲、侯友宜共識:雙北停課至5月
## 8  [問卦]侯友宜:三級管制全力以赴建議納入桃園基隆
## 9      [新聞]疫情依然嚴峻侯友宜呼籲:不影響經濟下
## 10     [新聞]侯友宜指「中央下令不公布足跡」 指揮
##                                                         artUrl
## 1      https://www.ptt.cc/bbs/nCoV2019/M.1622023307.A.02B.html
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1620271962.A.06E.html
## 3  https://www.ptt.cc/bbs/HatePolitics/M.1621655946.A.CCE.html
## 4  https://www.ptt.cc/bbs/HatePolitics/M.1621834861.A.850.html
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1620202167.A.0DE.html
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1621764409.A.789.html
## 7  https://www.ptt.cc/bbs/HatePolitics/M.1621219090.A.7B7.html
## 8     https://www.ptt.cc/bbs/Gossiping/M.1621071612.A.6F3.html
## 9     https://www.ptt.cc/bbs/Gossiping/M.1621352955.A.C94.html
## 10    https://www.ptt.cc/bbs/Gossiping/M.1621762547.A.F4C.html

Discussion of each topic

  • Topic 1:
    > Mostly discussions of 侯友宜 pressing the central government for vaccine allocation to New Taipei, e.g. 「新北巿民接續離世侯友宜哽咽:中央不要再刁難民間買疫苗」、「金門縣長籲開放中國疫苗 侯友宜:盼中央支持」、「5百萬疫苗要來了 侯友宜:馬上施打」

  • Topic 2:
    > Mainly comparisons with other politicians and mockery of their pandemic responses, e.g. 「感謝上蒼,我們還有柯文哲與侯友宜」、「侯友宜把輕症患者往桃園塞」、「新北投侯友宜的是不是後悔了?」

  • Topic 3:
    > Discussions of quarantine-hotel numbers and the field hospitals, e.g. 「遭質疑防疫旅館6都倒數 侯友宜斥:以訛1」、「侯友宜的方艙醫院蓋的怎樣了呢?」、「怒批侯友宜「落後部署」 羅致政細數防疫」

  • Topic 4:
    > Discussions of lockdown and a possible Level 4 alert, e.g. 「侯友宜說該封城就封柯文哲:別亂喊」、「侯友宜轟封城指引模糊不清 陳時中:應早點提出來」、「疫情飆升 侯友宜:準4級警戒」

Topic distribution by date

Plotting each day's topic distribution shows that topic 3 was the most widely discussed.

posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate, topic) %>%
  summarise(count = n()) %>% # posts per day and topic (sum(topic) would weight by topic id)
  ggplot(aes(x = artDate, y = count, fill = as.factor(topic))) +
  geom_col(position = "fill") 
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

posts_topic %>%
  group_by(artCat, topic) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artCat, y = count, fill = as.factor(topic))) +
  geom_col(position = "dodge") 
## `summarise()` has grouped output by 'artCat'. You can override using the `.groups` argument.

The charts show that on Gossiping topic 4 (lockdown and a possible Level 4 alert) dominated; on HatePolitics topic 2 (comparing politicians and mocking their pandemic responses) led, which fits that board's character; nCoV2019 had relatively few posts, and its discussion centered on topic 1, central epidemic control and local vaccine rollout, while the politically charged topic 2 was barely raised there. The topic mix thus tracks each board's orientation closely, as the sketch after this paragraph quantifies.

Representative terms for 陳時中

removed_word = c("不是","每天","出來","覺得","陳時中") 

# common terms in each topic
tidy(chen_lda, matrix = "beta") %>% # topic-term beta values
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>% # top 10 terms by beta
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text=element_text(family="黑體-繁 中黑", size=14))+
  scale_x_reordered()

We can summarize:
topic 1 = "procurement of international vaccines, domestic vaccine development, and related discussion"
topic 2 = "confirmed cases and their movement histories"
topic 3 = "the state of the outbreak and political topics"

Representative topic per article

Each article is assigned the topic with its largest gamma value.

# use matrix = "gamma" in tidy() to get the theta matrix
chen_topics <- tidy(chen_lda, matrix="gamma") %>% # document-topic gamma
                  group_by(document) %>%
                  top_n(1, wt=gamma)
chen_topics
## # A tibble: 353 x 3
## # Groups:   document [353]
##    document                                                 topic gamma
##    <chr>                                                    <int> <dbl>
##  1 https://www.ptt.cc/bbs/Gossiping/M.1619865250.A.7D8.html     1 0.421
##  2 https://www.ptt.cc/bbs/Gossiping/M.1619936049.A.17B.html     1 0.914
##  3 https://www.ptt.cc/bbs/Gossiping/M.1619937748.A.74A.html     1 0.995
##  4 https://www.ptt.cc/bbs/Gossiping/M.1619938634.A.C28.html     1 0.994
##  5 https://www.ptt.cc/bbs/Gossiping/M.1619951400.A.666.html     1 0.979
##  6 https://www.ptt.cc/bbs/Gossiping/M.1620124236.A.A43.html     1 0.997
##  7 https://www.ptt.cc/bbs/Gossiping/M.1620630471.A.57E.html     1 0.997
##  8 https://www.ptt.cc/bbs/Gossiping/M.1620722098.A.8F7.html     1 0.988
##  9 https://www.ptt.cc/bbs/Gossiping/M.1620825846.A.985.html     1 0.988
## 10 https://www.ptt.cc/bbs/Gossiping/M.1620990728.A.A89.html     1 0.517
## # ... with 343 more rows

Exploring the articles

chen_posts_topic <- merge(x = chen_posts, y = chen_topics, by.x = "artUrl", by.y="document")

# inspect what each topic is about
set.seed(123)
chen_posts_topic %>% # topic 1
  filter(topic==1) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(10)
##                                           artTitle
## 1  Re:[新聞]侯友宜批指揮中心嚴重疏忽 陳時中語氣強
## 2     [討論]陳時中侯友宜柯文哲的成敗其實是三位一體
## 3  Re:[新聞]侯友宜批指揮中心嚴重疏忽 陳時中語氣強
## 4         [新聞]柯文哲不忍了氣炸開槓侯友宜、陳時中
## 5    Re:[新聞]侯友宜轟封城指引模糊不清陳時中:應早
## 6           [討論]一年前陳時中妙答侯友宜,很有意義
## 7     [新聞]侯友宜批指揮中心嚴重疏忽 陳時中語氣強
## 8     [新聞]民間捐疫苗陳時中設條件 侯友宜痛批:在
## 9    Re:[新聞]侯友宜轟封城指引模糊不清陳時中:應早
## 10              [新聞]新北市長民調侯友宜大勝陳時中
##                                                         artUrl
## 1     https://www.ptt.cc/bbs/Gossiping/M.1619951400.A.666.html
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1622356132.A.E4C.html
## 3     https://www.ptt.cc/bbs/Gossiping/M.1619937748.A.74A.html
## 4  https://www.ptt.cc/bbs/HatePolitics/M.1620813825.A.32C.html
## 5     https://www.ptt.cc/bbs/Gossiping/M.1621240174.A.D17.html
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1621245042.A.FA0.html
## 7     https://www.ptt.cc/bbs/Gossiping/M.1619936049.A.17B.html
## 8     https://www.ptt.cc/bbs/Gossiping/M.1622360835.A.AF9.html
## 9     https://www.ptt.cc/bbs/Gossiping/M.1621238565.A.62E.html
## 10    https://www.ptt.cc/bbs/Gossiping/M.1621062643.A.A99.html
chen_posts_topic %>% # topic 2
  filter(topic==2) %>%
  select(artTitle, artUrl) %>%
  unique() 
##                                     artTitle
## 1 [新聞]侯友宜批案1120指揮中心嚴重疏忽陳時中
## 2 [新聞]愛心挺醫護抗疫侯友宜:向陳時中爭取AZ
##                                                        artUrl
## 1    https://www.ptt.cc/bbs/Gossiping/M.1619863015.A.891.html
## 2 https://www.ptt.cc/bbs/HatePolitics/M.1621690115.A.B4F.html
chen_posts_topic %>% # topic 3
  filter(topic==3) %>%
  select(artTitle, artUrl) %>%
  unique()
##                                       artTitle
## 1   [新聞]侯友宜轟封城指引模糊不清陳時中:應早
## 2 [新聞]不甩陳時中、侯友宜!高爾夫球場遭重罰10
##                                                     artUrl
## 1 https://www.ptt.cc/bbs/Gossiping/M.1621236761.A.598.html
## 2 https://www.ptt.cc/bbs/Gossiping/M.1622168145.A.E6D.html

Discussion of each topic

  • Topic 1:
    > Mainly the procurement of international vaccines and domestic vaccine development, plus related discussion, e.g. 「日本考慮給台灣疫苗」、「地方可否自購疫苗?」、「美衛生部長視訊陳時中表達願助台灣取得疫苗」

  • Topic 2:
    > Confirmed cases and their movement histories, e.g. 「第三級可能性降低!」、「校正回歸降至89例」、「陽性率最高6.3%」

  • Topic 3:
    > The state of the outbreak and political topics, e.g. 「陳時中到底是政客還是政治素人?」、「華航將停飛!陳時中:機師全召回14天檢疫」、「陳時中是不是打臉他自己說的話?」、「陳時中、蔡英文一年後對比惹心疼!」

Topic distribution by date

Plotting each day's topic distribution shows that topic 3 was the most widely discussed.

chen_posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate, topic) %>%
  summarise(count = n()) %>% # posts per day and topic (sum(topic) would weight by topic id)
  ggplot(aes(x = artDate, y = count, fill = as.factor(topic))) +
  geom_col(position = "fill") 
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

chen_posts_topic %>%
  group_by(artCat, topic) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artCat, y = count, fill = as.factor(topic))) +
  geom_col(position = "dodge") 
## `summarise()` has grouped output by 'artCat'. You can override using the `.groups` argument.

The charts show Gossiping leaned toward topic 3, the state of the outbreak and political topics; HatePolitics likewise centered on topic 3, mocking politicians' pandemic policies; nCoV2019 mainly discussed topic 2, the comparatively factual matter of confirmed cases and movement histories, while the politically colored topic 3 was rarely raised there.

Representative terms for 柯文哲

removed_word = c("柯文哲","市長","台北","台北市","不會","完整","問題","覺得","可能","不是","不要","中央") 

# common terms in each topic
tidy(kp_lda, matrix = "beta") %>% # topic-term beta values
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>% # top 10 terms by beta
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text=element_text(family="黑體-繁 中黑", size=14))+
  scale_x_reordered()

We can summarize:
topic 1 = "vaccine origins and the order of vaccination"
topic 2 = "politics and media coverage"
topic 3 = "pandemic-control policies and the current situation"

Representative topic per article

Each article is assigned the topic with its largest gamma value.

# use matrix = "gamma" in tidy() to get the theta matrix
kp_topics <- tidy(kp_lda, matrix="gamma") %>% # document-topic gamma
                  group_by(document) %>%
                  top_n(1, wt=gamma)
kp_topics
## # A tibble: 1,230 x 3
## # Groups:   document [1,230]
##    document                                                    topic gamma
##    <chr>                                                       <int> <dbl>
##  1 https://www.ptt.cc/bbs/HatePolitics/M.1619858975.A.CF0.html     1 0.923
##  2 https://www.ptt.cc/bbs/HatePolitics/M.1619861153.A.473.html     1 0.803
##  3 https://www.ptt.cc/bbs/HatePolitics/M.1619919198.A.E39.html     1 0.995
##  4 https://www.ptt.cc/bbs/HatePolitics/M.1619959284.A.E69.html     1 0.738
##  5 https://www.ptt.cc/bbs/HatePolitics/M.1619964761.A.259.html     1 0.744
##  6 https://www.ptt.cc/bbs/HatePolitics/M.1620013932.A.C12.html     1 0.996
##  7 https://www.ptt.cc/bbs/HatePolitics/M.1620028023.A.0CE.html     1 0.794
##  8 https://www.ptt.cc/bbs/HatePolitics/M.1620035490.A.130.html     1 0.994
##  9 https://www.ptt.cc/bbs/HatePolitics/M.1620092084.A.BC7.html     1 0.630
## 10 https://www.ptt.cc/bbs/HatePolitics/M.1620124296.A.372.html     1 0.790
## # ... with 1,220 more rows

Exploring the articles

kp_posts_topic <- merge(x = kp_posts, y = kp_topics, by.x = "artUrl", by.y="document")

# inspect what each topic is about
set.seed(123)
kp_posts_topic %>% # topic 1
  filter(topic==1) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(20)
##                                           artTitle
## 1                     [討論]侯友宜這次被柯文哲屌打
## 2  Re:[新聞]「再拖3個月、餓死比病死多」柯P:應考慮
## 3   Re:[新聞]【諾富特風暴】柯P開酸:不知機組員散在
## 4       [新聞]能否再撐兩個月等疫苗柯文哲:請中央政
## 5     [討論]我們可以看到柯文哲接下來改口稱讚中央嗎
## 6         [討論]突然覺得柯文哲是比較正常的政治人物
## 7   Re:[問卦]台灣的首長是不是只有柯P敢記者會嗆AIT?
## 8                           [討論]柯文哲真的很可惜
## 9    Re:[新聞]柯文哲稱航空人員應第一順位打疫苗李秉
## 10    [新聞]又槓上!被陳時中反指有立委關說 柯文哲
## 11            [新聞]醫護壓力破表柯文哲喊話加油撐住
## 12              [討論]雙北控制不力柯文哲還不下台?
## 13          Re:[問卦]你有被側翼洗腦過柯文哲變了嗎?
## 14  Re:[新聞]國產疫苗7月上路? 柯文哲吐真心話:我
## 15  Re:[新聞]國產疫苗7月上路? 柯文哲吐真心話:我
## 16      Re:[討論]柯文哲這波裝清高可以再收服幾%柯粉
## 17      [新聞]疫情擴散!柯文哲預言:北市狀況「三週
## 18   Re:[黑特]北市記者群質疑柯文哲開記者會只為作秀
## 19            [討論]柯文哲舔中這麼久,30萬劑也沒有
## 20      [新聞]15萬劑難消化?柯文哲痛批顏若芳:民進
##                                                         artUrl
## 1  https://www.ptt.cc/bbs/HatePolitics/M.1622644142.A.617.html
## 2     https://www.ptt.cc/bbs/Gossiping/M.1622461640.A.C0D.html
## 3     https://www.ptt.cc/bbs/Gossiping/M.1620318323.A.861.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1622634177.A.A2D.html
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1622737054.A.91E.html
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1621696841.A.A11.html
## 7     https://www.ptt.cc/bbs/Gossiping/M.1622023393.A.3D2.html
## 8  https://www.ptt.cc/bbs/HatePolitics/M.1621516893.A.5F9.html
## 9  https://www.ptt.cc/bbs/HatePolitics/M.1620271019.A.312.html
## 10 https://www.ptt.cc/bbs/HatePolitics/M.1620485056.A.978.html
## 11     https://www.ptt.cc/bbs/nCoV2019/M.1622124670.A.296.html
## 12 https://www.ptt.cc/bbs/HatePolitics/M.1622345719.A.8D0.html
## 13    https://www.ptt.cc/bbs/Gossiping/M.1622296828.A.DAB.html
## 14    https://www.ptt.cc/bbs/Gossiping/M.1621907817.A.794.html
## 15    https://www.ptt.cc/bbs/Gossiping/M.1621909305.A.05E.html
## 16 https://www.ptt.cc/bbs/HatePolitics/M.1620727968.A.ADC.html
## 17    https://www.ptt.cc/bbs/Gossiping/M.1622636559.A.BFE.html
## 18 https://www.ptt.cc/bbs/HatePolitics/M.1622774691.A.FF5.html
## 19 https://www.ptt.cc/bbs/HatePolitics/M.1622179842.A.BFE.html
## 20    https://www.ptt.cc/bbs/Gossiping/M.1622199050.A.289.html
kp_posts_topic %>% # topic 2
  filter(topic==2) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(20)
##                                              artTitle
## 1             [討論]柯文哲真正的問題是他"光說不做"...
## 2                         [討論]柯粉有玩柯文哲遊戲嗎?
## 3        [新聞]媒體問台北車站某大樓有確診者?柯文哲脫
## 4       Re:[新聞]拒陳家欽拔北市分局長柯文哲:拔這個拔
## 5       Re:[新聞]北市封城兵推柯文哲曝科技大老來電:停
## 6         [討論]科P為何不執行好市長的社子島曼哈頓計畫
## 7               [轉錄]海納百川》柯P終究輸給了醬缸文化
## 8      Re:[新聞]外勞要給特赦令!柯文哲宣布5萬名外勞篩
## 9       Re:[爆卦]LIVE北市防疫升至三級柯文哲臨時記者會
## 10          [新聞]柯文哲稱民調都是用買的侯友宜:民調是
## 11          Re:[討論]柯:中央如果對付新冠像對付柯文哲
## 12        [黑特]免費的簡訊實聯制柯文哲說要0.7元造謠??
## 13 [新聞]侯友宜防疫表現獲49.1%肯定居冠柯文哲31.4%居次
## 14        Re:[新聞]四級兵推柯文哲:停班難、停工不得了
## 15        Re:[新聞]民調低迷!柯文哲:我只是不肯買而已
## 16     Re:[爆卦]柯文哲:沒有甚麼地方沒有準確把數字回報
## 17               [討論]柯文哲為什麼急著查阿公店確診?
## 18       [新聞]疫情嚴峻!柯文哲啟動「捐款專戶」:用在
## 19     Re:[新聞]柯文哲PO換燈泡、遭解讀要蘇貞昌下台綠:
## 20    Re:[新聞]快訊/「由不得你!」柯文哲:所有醫護人
##                                                         artUrl
## 1  https://www.ptt.cc/bbs/HatePolitics/M.1622344134.A.670.html
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1621784013.A.F3B.html
## 3     https://www.ptt.cc/bbs/Gossiping/M.1620017660.A.41B.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1619875373.A.D12.html
## 5     https://www.ptt.cc/bbs/Gossiping/M.1622348717.A.8E2.html
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1620746980.A.146.html
## 7  https://www.ptt.cc/bbs/HatePolitics/M.1620212379.A.1EB.html
## 8     https://www.ptt.cc/bbs/Gossiping/M.1620961230.A.6B1.html
## 9     https://www.ptt.cc/bbs/Gossiping/M.1621067236.A.FBE.html
## 10    https://www.ptt.cc/bbs/Gossiping/M.1620124236.A.A43.html
## 11 https://www.ptt.cc/bbs/HatePolitics/M.1622363526.A.24F.html
## 12 https://www.ptt.cc/bbs/HatePolitics/M.1622685347.A.010.html
## 13 https://www.ptt.cc/bbs/HatePolitics/M.1621923011.A.AFD.html
## 14    https://www.ptt.cc/bbs/Gossiping/M.1622433020.A.E73.html
## 15    https://www.ptt.cc/bbs/Gossiping/M.1620023818.A.8E1.html
## 16    https://www.ptt.cc/bbs/Gossiping/M.1621694033.A.883.html
## 17 https://www.ptt.cc/bbs/HatePolitics/M.1620878013.A.085.html
## 18 https://www.ptt.cc/bbs/HatePolitics/M.1621833902.A.D2B.html
## 19    https://www.ptt.cc/bbs/Gossiping/M.1620004029.A.B27.html
## 20 https://www.ptt.cc/bbs/HatePolitics/M.1621504635.A.49A.html
kp_posts_topic %>% # topic 3
  filter(topic==3) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(20)
##                                                       artTitle
## 1                    [新聞]柯P建議改公布快篩陽性梁文傑酸:就像
## 2     Re:[新聞]稱公佈足跡意義不大柯文哲解釋原因再提2大防疫新招
## 3                 Re:[新聞]機組人員3天居家檢疫柯文哲:最擔心立
## 4                 [新聞]北市12區確診數曝光!柯文哲:萬華控制住
## 5                Re:[新聞]民望低?柯文哲:我只是不肯買民調不然
## 6                  [新聞]18小時內湧928人報名!柯文哲發出醫護召
## 7                Re:[爆卦]柯文哲:陽性率降、確診不必馬上送醫院
## 8                 [新聞]「雙北停課」防病毒擴散!柯文哲:社區已
## 9               Re:[新聞]北市快篩陽性率11%降至4.7%柯文哲忍不住
## 10 Re:[新聞]蘇貞昌盼全國同步停班停課標準柯文哲:他喜歡宣布就給
## 11                  [新聞]柯文哲宣布:20家急救責任醫院加開快篩
## 12                  [新聞]北市釋出醫療能量柯文哲:新北案例可能
## 13                            [轉錄]柯文哲FB:台北市第四級兵推
## 14                                [討論]柯P是不是越來越入戲...
## 15                  [新聞]柯文哲讚徵召退離醫護「護國天使」北市
## 16                [新聞]快訊/「由不得你!」柯文哲:所有醫護人
## 17                          [新聞]柯文哲:症狀輕微隔離服藥即可
## 18                  [新聞]台北市三級警戒柯文哲:相信民眾素質可
## 19             Re:[新聞]幕後》柯文哲向陳時中拍桌才爭取到設快篩
## 20               [新聞]快訊/「校正回歸」確診+269 柯文哲:採+
##                                                         artUrl
## 1     https://www.ptt.cc/bbs/Gossiping/M.1621740051.A.F6A.html
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1621599185.A.B22.html
## 3  https://www.ptt.cc/bbs/HatePolitics/M.1620362262.A.DF0.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1622193970.A.269.html
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1620101820.A.D38.html
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1621493577.A.595.html
## 7     https://www.ptt.cc/bbs/Gossiping/M.1621326076.A.240.html
## 8     https://www.ptt.cc/bbs/Gossiping/M.1621332092.A.BA1.html
## 9     https://www.ptt.cc/bbs/Gossiping/M.1621352493.A.C59.html
## 10    https://www.ptt.cc/bbs/Gossiping/M.1621320888.A.1BE.html
## 11    https://www.ptt.cc/bbs/Gossiping/M.1621759376.A.4DC.html
## 12    https://www.ptt.cc/bbs/Gossiping/M.1622802652.A.940.html
## 13 https://www.ptt.cc/bbs/HatePolitics/M.1622385448.A.0FD.html
## 14 https://www.ptt.cc/bbs/HatePolitics/M.1622290564.A.66B.html
## 15    https://www.ptt.cc/bbs/Gossiping/M.1621949841.A.26E.html
## 16 https://www.ptt.cc/bbs/HatePolitics/M.1621501331.A.DA2.html
## 17 https://www.ptt.cc/bbs/HatePolitics/M.1621326289.A.1B2.html
## 18    https://www.ptt.cc/bbs/Gossiping/M.1621070455.A.889.html
## 19    https://www.ptt.cc/bbs/Gossiping/M.1621507384.A.A64.html
## 20 https://www.ptt.cc/bbs/HatePolitics/M.1621670405.A.4A9.html

Discussion of each topic

  • Topic 1:
    > Focused on vaccine origins and the order of vaccination; the tone is largely subjective and, beyond straight news, often politically charged, e.g. 「侯友宜這次被柯文哲屌打」、「能否再撐兩個月等疫苗」、「突然覺得柯文哲是比較正常的政治人物」、「北市記者群質疑柯文哲開記者會只為作秀」

  • Topic 2:
    > Politics and media coverage, e.g. 「柯文哲稱民調都是用買的」、「免費的簡訊實聯制柯文哲說要0.7元造謠??」、「侯友宜防疫表現獲49.1%肯定居冠柯文哲31.4%居次」、「民調低迷!柯文哲:我只是不肯買而已」

  • Topic 3:
    > Pandemic-control policies and the current situation, e.g. 「稱公佈足跡意義不大柯文哲解釋原因再提2大防疫新招」、「北市12區確診數曝光!柯文哲:萬華控制住」、「柯文哲宣布:20家急救責任醫院加開快篩」、「雙北停課防病毒擴散!」

Topic distribution by date

Plotting each day's topic distribution shows that topic 3 was the most widely discussed.

kp_posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate, topic) %>%
  summarise(count = n()) %>% # posts per day and topic (sum(topic) would weight by topic id)
  ggplot(aes(x = artDate, y = count, fill = as.factor(topic))) +
  geom_col(position = "fill") 
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

kp_posts_topic %>%
  group_by(artCat, topic) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artCat, y = count, fill = as.factor(topic))) +
  geom_col(position = "dodge") 
## `summarise()` has grouped output by 'artCat'. You can override using the `.groups` argument.

The charts show Gossiping focused on topics 1 and 3, vaccine origins, the order of vaccination, and pandemic policy; HatePolitics centered on topics 1 and 2, debating 柯文哲's conduct, which fits that board; nCoV2019 had relatively few posts, mostly on topic 3, the pandemic situation and related policies, while the political topic 2 was barely raised there. Again the topic mix tracks each board's orientation closely.

Representative terms for 蔡英文

removed_word = c("蔡英文","總統","不是","每天","出來","覺得","完整","不要","不會","有沒有","已經","可能","不能","疫苗","問題","一下","一定","根本") 

# common terms in each topic
tidy(tsai_lda, matrix = "beta") %>% # topic-term beta values
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>% # top 10 terms by beta
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text=element_text(family="黑體-繁 中黑", size=14))+
  scale_x_reordered()

We can summarize:
topic 1 = "the outbreak, the blackouts, and questions of accountability"
topic 2 = "vaccine sourcing and the partisan disputes it triggered"
topic 3 = "polls and approval ratings"
topic 4 = "pandemic-control strategy and related coverage"
topic 5 = "President 蔡英文's academic-credentials controversy"

Representative topic per article

Each article is assigned the topic with its largest gamma value.

# use matrix = "gamma" in tidy() to get the theta matrix
tsai_topics <- tidy(tsai_lda, matrix="gamma") %>% # document-topic gamma
                  group_by(document) %>%
                  top_n(1, wt=gamma)
tsai_topics
## # A tibble: 703 x 3
## # Groups:   document [703]
##    document                                                    topic gamma
##    <chr>                                                       <int> <dbl>
##  1 https://www.ptt.cc/bbs/HatePolitics/M.1619876677.A.C0F.html     1 0.715
##  2 https://www.ptt.cc/bbs/HatePolitics/M.1620003896.A.88D.html     1 0.910
##  3 https://www.ptt.cc/bbs/HatePolitics/M.1620146828.A.8B6.html     1 0.999
##  4 https://www.ptt.cc/bbs/HatePolitics/M.1620191140.A.DCA.html     1 0.905
##  5 https://www.ptt.cc/bbs/HatePolitics/M.1620463594.A.CF4.html     1 0.864
##  6 https://www.ptt.cc/bbs/HatePolitics/M.1620708904.A.6DC.html     1 0.753
##  7 https://www.ptt.cc/bbs/HatePolitics/M.1620734535.A.D6F.html     1 0.779
##  8 https://www.ptt.cc/bbs/HatePolitics/M.1620781459.A.B63.html     1 0.999
##  9 https://www.ptt.cc/bbs/HatePolitics/M.1620783169.A.94C.html     1 0.509
## 10 https://www.ptt.cc/bbs/HatePolitics/M.1620792700.A.49C.html     1 0.999
## # ... with 693 more rows

Exploring the articles

tsai_posts_topic <- merge(x = tsai_posts, y = tsai_topics, by.x = "artUrl", by.y="document")

# inspect what each topic is about
set.seed(123)
tsai_posts_topic %>% # topic 1
  filter(topic==1) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(20)
##                                                     artTitle
## 1                 [新聞]亞洲物理奧林匹亞改線上舉辦蔡英文:精
## 2                 [新聞]日本擬提供台灣AZ疫苗 蔡英文:由衷感
## 3               [黑特]蔡英文政府創校正回歸這名詞丟了我國的臉
## 4                 [新聞]柯文哲嗆蔡英文:疫苗不要講空泛數字逼
## 5                [新聞]時力立委.議員聯合喊話:請蔡英文站上第
## 6                 [討論]年輕人不要給蔡英文愛將陳時中造成困擾
## 7                      Re:[討論]任內兩次全台跳電的只有蔡英文
## 8                     [討論]蔡英文該使用緊急命令建設火力發電
## 9                 [討論]為了逼菜陰魂出來講話,故意造成屎傷!
## 10                [轉錄]【重磅快評】蔡英文說停電無關缺電,卻
## 11              [新聞]藍營施壓蔡英文?80歲新黨主席郁慕明赴大
## 12                        [討論]蔡英文:台灣不缺電是不爭事實
## 13                [新聞]柯文哲嗆蔡英文股票下降才露臉綠委:不
## 14 Re:[新聞]陳時中、蔡英文「一年後對比」惹心疼!網淚:這次一
## 15                [新聞]【全台大停電】興達電廠機組跳電蔡英文
## 16               [轉錄]蔡英文:今天是農曆3月23日,媽祖的誕辰
## 17                        [黑特]蔡英文應親自主持防疫指揮中心
## 18             Re:[新聞]柯文哲批蔡英文「高端股票跌停才出面」
## 19                       [討論]明天就是蔡英文520就職紀念日~~
## 20                 [討論]【老鵝金曲改編】缺電英(原曲:田馥甄/
##                                                         artUrl
## 1     https://www.ptt.cc/bbs/Gossiping/M.1621231605.A.EB4.html
## 2     https://www.ptt.cc/bbs/Gossiping/M.1622244773.A.216.html
## 3  https://www.ptt.cc/bbs/HatePolitics/M.1621855723.A.3F5.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1622039483.A.E78.html
## 5      https://www.ptt.cc/bbs/nCoV2019/M.1622449039.A.0EB.html
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1622777855.A.6D7.html
## 7  https://www.ptt.cc/bbs/HatePolitics/M.1620955861.A.E9F.html
## 8  https://www.ptt.cc/bbs/HatePolitics/M.1620961017.A.501.html
## 9  https://www.ptt.cc/bbs/HatePolitics/M.1622639150.A.D30.html
## 10 https://www.ptt.cc/bbs/HatePolitics/M.1620964404.A.FC8.html
## 11 https://www.ptt.cc/bbs/HatePolitics/M.1622516220.A.CFD.html
## 12 https://www.ptt.cc/bbs/HatePolitics/M.1621299548.A.DA1.html
## 13    https://www.ptt.cc/bbs/Gossiping/M.1622609352.A.A05.html
## 14    https://www.ptt.cc/bbs/Gossiping/M.1621563284.A.445.html
## 15    https://www.ptt.cc/bbs/Gossiping/M.1620892459.A.F74.html
## 16 https://www.ptt.cc/bbs/HatePolitics/M.1620146828.A.8B6.html
## 17 https://www.ptt.cc/bbs/HatePolitics/M.1620708904.A.6DC.html
## 18 https://www.ptt.cc/bbs/HatePolitics/M.1622734126.A.2D4.html
## 19 https://www.ptt.cc/bbs/HatePolitics/M.1621412628.A.0C8.html
## 20 https://www.ptt.cc/bbs/HatePolitics/M.1621781666.A.A8B.html
tsai_posts_topic %>% # 主題二
  filter(topic==2) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(20)
##                                          artTitle
## 1   Re:[新聞]呂秀蓮喊話蔡英文:每個人都要見閻王很
## 2     [新聞]親筆信箋謝黨員蔡英文:不忘初衷落實排黑
## 3      [新聞]國台辦對蔡英文說:民進黨「有何資格談
## 4      [討論]國產疫苗背後的利益多龐大?蔡英文踹共!
## 5        [討論]日本人的徒孫蔡英文就是在賭最後一把
## 6      [新聞]快訊/蔡英文為黑道入黨事件道歉了 下
## 7     [新聞]蔡英文:台灣人做到2件事就能有效嚇阻中
## 8      Re:[新聞]蔡英文執政滿5年民進黨列出10大政績
## 9   Re:[爆卦]蔡英文總統的疫苗進度報告圖來了(多圖)
## 10  Re:[新聞]馬英九竟嗆「蔡英文賣台」 經濟部吐真
## 11  Re:[新聞]馬英九竟嗆「蔡英文賣台」 經濟部吐真
## 12   [新聞]向原廠買疫苗才有保證?郭正亮嗆蔡英文:
## 13 Re:[新聞]蔡英文:已購買近3000萬劑疫苗8月底累積
## 14           [討論]蔡英文又要出來救國產疫苗了嗎?
## 15   [新聞]向原廠買疫苗才有保證?郭正亮嗆蔡英文:
## 16     [討論]跪求陳時中、蔡英文買1000萬劑大陸疫苗
## 17            Re:[討論]蔡英文選的贏當年馬英九嗎?
## 18    [新聞]疫苗購買進度3大QA一次看 蔡英文政府拒
## 19             [討論]假設蔡英文跟馬英九一樣舔中..
## 20  Re:[新聞]快訊/疫情拉警報!總統蔡英文13:30發
##                                                         artUrl
## 1     https://www.ptt.cc/bbs/Gossiping/M.1622492403.A.4FA.html
## 2     https://www.ptt.cc/bbs/Gossiping/M.1620277427.A.195.html
## 3     https://www.ptt.cc/bbs/Gossiping/M.1620736406.A.734.html
## 4  https://www.ptt.cc/bbs/HatePolitics/M.1622009685.A.16A.html
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1622015850.A.05C.html
## 6     https://www.ptt.cc/bbs/Gossiping/M.1620201474.A.A49.html
## 7     https://www.ptt.cc/bbs/Gossiping/M.1620212646.A.373.html
## 8     https://www.ptt.cc/bbs/Gossiping/M.1621520315.A.A23.html
## 9     https://www.ptt.cc/bbs/Gossiping/M.1622481745.A.CB1.html
## 10    https://www.ptt.cc/bbs/Gossiping/M.1620532737.A.1BE.html
## 11    https://www.ptt.cc/bbs/Gossiping/M.1620540814.A.93D.html
## 12    https://www.ptt.cc/bbs/Gossiping/M.1622124358.A.831.html
## 13    https://www.ptt.cc/bbs/Gossiping/M.1622076858.A.30D.html
## 14 https://www.ptt.cc/bbs/HatePolitics/M.1622510641.A.AC8.html
## 15 https://www.ptt.cc/bbs/HatePolitics/M.1622127193.A.BDA.html
## 16 https://www.ptt.cc/bbs/HatePolitics/M.1622172112.A.115.html
## 17 https://www.ptt.cc/bbs/HatePolitics/M.1621356952.A.F6B.html
## 18    https://www.ptt.cc/bbs/Gossiping/M.1621397740.A.114.html
## 19 https://www.ptt.cc/bbs/HatePolitics/M.1620517319.A.8FD.html
## 20    https://www.ptt.cc/bbs/Gossiping/M.1620799110.A.5D1.html
tsai_posts_topic %>% # 主題三
  filter(topic==3) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(20)
##                                           artTitle
## 1  Re:[新聞]蔡英文被疫苗之亂一路挨打忍到莫德納上飛
## 2            Re:[討論]假設蔡英文跟馬英九一樣舔中..
## 3       [新聞]遠見民調》總統、內閣民調創新高蔡英文
## 4             [討論]如果我染病掛了和蔡英文說我愛他
## 5               [黑特]蔡英文蘇貞昌該引咎辭職了吧?
## 6      [新聞]蔡英文承諾當爸媽「神隊友」 要貫徹0-6
## 7                 [討論]請問如何罷免蔡英文民進黨?
## 8       [新聞]進入社區感染蔡英文臉書貼反公投羅智強
## 9   Re:[新聞]5天停電兩次蔡英文再致歉:或許有些人為
## 10   Re:[新聞]民進黨公投口號來了蔡英文:四個不同意
## 11        [討論]疫情變第三級假博士蔡英文應立刻下台
## 12   Re:[新聞]陳時中、蔡英文「一年後對比」惹心疼!
## 13     [新聞]蔡英文支持度跌破5成 藍營分析:基本盤
## 14                [討論]逃兵販毒當民進黨蔡英文幹部
## 15      [新聞]總統蔡英文民調滿意度創新高3成6民眾卻
## 16     [新聞]蔡英文宣布資安即國安2.0戰略將成立資通
## 17             [討論]為什麼蔡英文對生技這麼癡迷啊?
## 18         [新聞]蔡英文執政滿5年民進黨列出10大政績
## 19 Re:[新聞]快訊/「防疫不是陳時中的事」 蔡英文:
## 20              [問卦]蔡英文執政到底做了什麼???
##                                                         artUrl
## 1  https://www.ptt.cc/bbs/HatePolitics/M.1622120616.A.254.html
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1620529847.A.167.html
## 3  https://www.ptt.cc/bbs/HatePolitics/M.1620628501.A.ED1.html
## 4  https://www.ptt.cc/bbs/HatePolitics/M.1620796574.A.0F2.html
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1619924153.A.7BB.html
## 6     https://www.ptt.cc/bbs/Gossiping/M.1620566241.A.D47.html
## 7  https://www.ptt.cc/bbs/HatePolitics/M.1621258168.A.708.html
## 8     https://www.ptt.cc/bbs/Gossiping/M.1620746621.A.461.html
## 9     https://www.ptt.cc/bbs/Gossiping/M.1621403889.A.E05.html
## 10    https://www.ptt.cc/bbs/Gossiping/M.1620711902.A.CA8.html
## 11 https://www.ptt.cc/bbs/HatePolitics/M.1621053971.A.C9D.html
## 12    https://www.ptt.cc/bbs/Gossiping/M.1621568410.A.894.html
## 13    https://www.ptt.cc/bbs/Gossiping/M.1621986723.A.B59.html
## 14 https://www.ptt.cc/bbs/HatePolitics/M.1619836156.A.A7D.html
## 15    https://www.ptt.cc/bbs/Gossiping/M.1620627254.A.299.html
## 16    https://www.ptt.cc/bbs/Gossiping/M.1620127457.A.EED.html
## 17 https://www.ptt.cc/bbs/HatePolitics/M.1622566733.A.E22.html
## 18    https://www.ptt.cc/bbs/Gossiping/M.1621516308.A.E42.html
## 19    https://www.ptt.cc/bbs/Gossiping/M.1620943904.A.1EF.html
## 20    https://www.ptt.cc/bbs/Gossiping/M.1622032929.A.CFB.html
tsai_posts_topic %>% # 主題四
  filter(topic==4) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(20)
##                                                    artTitle
## 1                                 Re:[爆卦]蔡英文:我很健康
## 2  [新聞]總統府:蔡英文、蘇貞昌與陳時中密切聯繫要求守住疫情
## 3                           Re:[爆卦]蔡英文總統發表談話LIVE
## 4              [新聞]快訊/蔡英文17:00發表談話 親上火線談
## 5               [新聞]41萬劑疫苗今起配送全國蔡英文:我已說過
## 6                [新聞]地方政府十萬火急要疫苗蔡英文:疫苗購
## 7                [新聞]蔡英文:黨公職緊盯假訊息不要打擊防疫
## 8                  [新聞]蔡英文也曾籲馬英九買國際疫苗?總統
## 9               [新聞]蔡英文520喊話:我會領導國軍跟大家一起
## 10              [新聞]駁斥確診謠言!蔡英文報告「防疫2重點」
## 11            Re:[新聞]快訊/明天要上班了 蔡英文拜託大家「
## 12               [新聞]網傳蔡英文確診還沒宣布刑事局追查假訊
## 13                       [討論]蔡英文:衛署不應護航國光疫苗
## 14                          [轉錄]蔡英文FB:少出門、拜託大家
## 15             [新聞]疫情增溫假訊息充斥蔡英文臉書:散播疫情
## 16               [新聞]蘇貞昌向蔡英文報告紓困:總統全力支持
## 17               [新聞]蔡英文指示全力處理:沒有所謂刁難問題
## 18               [新聞]地方頻喊要中國疫苗蔡英文13:30中常會
## 19          Re:[新聞]快訊/蔡英文:已做內部清查無炒股問題 
## 20               [爆卦]蔡英文總統的疫苗進度報告圖來了(多圖)
##                                                         artUrl
## 1     https://www.ptt.cc/bbs/Gossiping/M.1621695224.A.A27.html
## 2     https://www.ptt.cc/bbs/Gossiping/M.1620727056.A.45C.html
## 3     https://www.ptt.cc/bbs/Gossiping/M.1620955964.A.207.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1622449336.A.AC6.html
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1622098978.A.8DF.html
## 6     https://www.ptt.cc/bbs/Gossiping/M.1622020354.A.366.html
## 7     https://www.ptt.cc/bbs/Gossiping/M.1621410135.A.276.html
## 8     https://www.ptt.cc/bbs/Gossiping/M.1622549781.A.A95.html
## 9     https://www.ptt.cc/bbs/Gossiping/M.1621489594.A.37B.html
## 10    https://www.ptt.cc/bbs/Gossiping/M.1621699173.A.EE4.html
## 11 https://www.ptt.cc/bbs/HatePolitics/M.1621349425.A.5BB.html
## 12    https://www.ptt.cc/bbs/Gossiping/M.1621613501.A.258.html
## 13 https://www.ptt.cc/bbs/HatePolitics/M.1622523415.A.1EE.html
## 14 https://www.ptt.cc/bbs/HatePolitics/M.1621938078.A.524.html
## 15 https://www.ptt.cc/bbs/HatePolitics/M.1621240270.A.394.html
## 16    https://www.ptt.cc/bbs/Gossiping/M.1622270796.A.4F7.html
## 17    https://www.ptt.cc/bbs/Gossiping/M.1622444176.A.62F.html
## 18 https://www.ptt.cc/bbs/HatePolitics/M.1622002471.A.4A0.html
## 19    https://www.ptt.cc/bbs/Gossiping/M.1622458499.A.E1D.html
## 20    https://www.ptt.cc/bbs/Gossiping/M.1622481332.A.B9A.html
tsai_posts_topic %>% # 主題五
  filter(topic==5) %>%
  select(artTitle, artUrl) %>%
  unique() %>%
  sample_n(20)
##                                                 artTitle
## 1           [討論]蔡英文被國際媒體恥笑!疫情消失的四百天!
## 2                             [討論]蔡英文是不是佛系老闆
## 3        Re:[新聞]快訊/台灣最強防疫顧問來了! 蔡英文蛀
## 4                 [新聞]宅神送花槓上民進黨蔡英文回應了:
## 5                 [黑特]獅子會傳播疫情,貓控蔡英文下台!
## 6            [討論]假博士蔡英文想用疫情遮掩自己的論文門?
## 7                        [討論]蔡英文年輕時有漂亮過嗎???
## 8                               [創作]蔡英文拿菸燙賴清德
## 9  Re:[新聞]本周末國中會考如期舉行蔡英文請考生和家長安心
## 10       Re:[新聞]向上海復星買疫苗?蔡英文:只有跟原廠買
## 11          [新聞]蔡英文發布影片:謝謝你在疫情下每個堅持
## 12         [新聞]繼續保持「空無一人」!蔡英文推銷神級APP
## 13            Re:[討論]蔡英文論文以當前證據到底是真是假?
## 14            [新聞]今年Q1經濟成長率估達8.16% 蔡英文:
## 15            [新聞]民眾罵聲連連!蔡英文向全民道歉 一周
## 16         Re:[新聞]快訊/明天要上班了 蔡英文拜託大家「
## 17     [新聞]停電又停水!蔡英文發言…網友罵爆:廢話一堆!
## 18                [黑特]蔡英文的分流上班像風涼話or笑話?
## 19            [新聞]感謝「台灣人的堅持」!蔡英文臉書影片
## 20                  [黑特]蔡英文為什麼這次不聽柯建銘的話
##                                                         artUrl
## 1  https://www.ptt.cc/bbs/HatePolitics/M.1622593236.A.E51.html
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1621861472.A.898.html
## 3     https://www.ptt.cc/bbs/Gossiping/M.1622133872.A.6CC.html
## 4     https://www.ptt.cc/bbs/Gossiping/M.1622623848.A.4FF.html
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1620994382.A.69D.html
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1620815231.A.97C.html
## 7  https://www.ptt.cc/bbs/HatePolitics/M.1622704608.A.07F.html
## 8  https://www.ptt.cc/bbs/HatePolitics/M.1621589355.A.6EF.html
## 9     https://www.ptt.cc/bbs/Gossiping/M.1621122551.A.20C.html
## 10    https://www.ptt.cc/bbs/Gossiping/M.1622018600.A.240.html
## 11    https://www.ptt.cc/bbs/Gossiping/M.1622253969.A.205.html
## 12 https://www.ptt.cc/bbs/HatePolitics/M.1621143073.A.287.html
## 13 https://www.ptt.cc/bbs/HatePolitics/M.1620782352.A.4CD.html
## 14 https://www.ptt.cc/bbs/HatePolitics/M.1619849141.A.CF6.html
## 15    https://www.ptt.cc/bbs/Gossiping/M.1621263793.A.28B.html
## 16 https://www.ptt.cc/bbs/HatePolitics/M.1621178028.A.6CB.html
## 17 https://www.ptt.cc/bbs/HatePolitics/M.1621023219.A.C49.html
## 18 https://www.ptt.cc/bbs/HatePolitics/M.1621846990.A.FB2.html
## 19 https://www.ptt.cc/bbs/HatePolitics/M.1622213556.A.71D.html
## 20 https://www.ptt.cc/bbs/HatePolitics/M.1622291884.A.801.html

各主題細目討論

  • Topic 1:
    > Mainly covers pandemic and blackout news and questions of accountability, e.g. 「暴增633例11人死亡創新高」、「5天停電兩次蔡英文再致歉」、「蔡英文應出面說明有無進入4級警戒」、「蔡英文應對本土病例暴增下台」

  • Topic 2:
    > Debates over whether to accept vaccines offered by China, e.g. 「蔡英文:國產疫苗是國家戰略」、「蔡英文不用92共識就能賺中國錢」、「馬英九竟嗆蔡英文賣台」、「民進黨內:蔡英文不願面對光環盡失」

  • Topic 3:
    > Approval ratings and polls around the anniversary of Tsai's second term, e.g. 「TVBS蔡英文總統連任周年滿意度民調」、「民調黨公投口號來了」、「蔡英文2022將無敵慘敗」、「蔡英文面臨疫情3大挑戰」、「蔡英文蘇貞昌該引咎辭職了吧?」、「看完蔡英文最新民調 綠營人士嘆:2022民進黨慘了」

  • Topic 4:
    > The current state of the outbreak and the vaccine rollout, e.g. 「國產疫苗是國家戰略」、「蔡英文3項指示要求執政黨以身作則」、「蘇貞昌向蔡英文報告紓困」、「感謝醫界投入第一線快篩」、「日本挑六四贈疫苗!」

  • Topic 5:
    > Mainly debates whether President Tsai Ing-wen's academic credentials are genuine, e.g. 「假博士蔡英文賣台計畫曝光!」、「歐崇敬:蔡英文無心向學 怎麽可能拿到博士」、「假博士蔡英文想用疫情遮掩自己」、「LSE校長秘書:查無蔡英文口委資料」

日期主題分布

Plot each day's topic distribution; topic 5 appears to be the most widely discussed.

tsai_posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate, topic) %>%
  summarise(count = n()) %>% # count articles per topic; summing the topic index would over-weight high-numbered topics
  ggplot(aes(x = artDate, y = count, fill = as.factor(topic))) +
  geom_col(position = "fill") 
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

tsai_posts_topic %>%
  group_by(artCat,topic) %>%
  summarise(sum = n())  %>%
  ggplot(aes(x= artCat,y=sum,fill=as.factor(topic))) +
  geom_col(position="dodge") 
## `summarise()` has grouped output by 'artCat'. You can override using the `.groups` argument.

The chart above shows that Gossiping is dominated by topic 4, epidemic prevention; HatePolitics is dominated by topic 5, the question of whether President Tsai's degree is forged, which fits that board's political focus; nCoV2019 has relatively few posts, suggesting that online discussion of President Tsai leans toward the political side rather than pandemic coverage itself.

社群網路圖

A social network graph examines the relationships between nodes and links: nodes are the individual participants in the network, and links are the relationships between them.
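As a minimal illustration (the account names here are hypothetical), a commenter-to-poster edge list becomes a graph object like this:

# toy edge list: each row is one "cmtPoster commented on artPoster" relation
toy_link <- data.frame(cmtPoster = c("userA", "userB", "userA"),
                       artPoster = c("posterX", "posterX", "posterY"))
toy_net <- graph_from_data_frame(d = toy_link, directed = TRUE)
degree(toy_net) # each user's degree = number of links attached to it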

資料合併

# 文章和留言
hou_reviews <- hou_reviews %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)
# 把文章和留言依據artUrl innerJoin
posts_Reviews <- merge(x = hou_posts, y = hou_reviews, by = "artUrl")

# join the articles with their assigned topics
posts_Reviews <- merge(x = posts_Reviews, y = hou_topics, by.x = "artUrl", by.y="document")


chen_reviews <- chen_reviews %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)
chen_posts_Reviews <- merge(x = chen_posts, y = chen_reviews, by = "artUrl")
chen_posts_Reviews <- merge(x = chen_posts_Reviews, y = chen_topics, by.x = "artUrl", by.y="document")

kp_reviews <- kp_reviews %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)
kp_posts_Reviews <- merge(x = kp_posts, y = kp_reviews, by = "artUrl")
kp_posts_Reviews <- merge(x = kp_posts_Reviews, y = kp_topics, by.x = "artUrl", by.y="document")

tsai_reviews <- tsai_reviews %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)
tsai_posts_Reviews <- merge(x = tsai_posts, y = tsai_reviews, by = "artUrl")
tsai_posts_Reviews <- merge(x = tsai_posts_Reviews, y = tsai_topics, by.x = "artUrl", by.y="document")

取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位

link <- posts_Reviews %>% select(cmtPoster, artPoster, artUrl)

chen_link <- chen_posts_Reviews %>% select(cmtPoster, artPoster, artUrl)

kp_link <- kp_posts_Reviews %>% select(cmtPoster, artPoster, artUrl)

tsai_link <- tsai_posts_Reviews %>% select(cmtPoster, artPoster, artUrl)

基本網路圖

建立網路關係

reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH 9619316 DN-- 11213 40073 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from 9619316 (vertex names):
##  [1] greensaru   ->bunengs ioupoiu     ->bunengs sustaining  ->bunengs
##  [4] ray2501     ->bunengs james732    ->bunengs diefishfish ->bunengs
##  [7] PippenRock  ->bunengs a210510     ->bunengs light20735  ->bunengs
## [10] ray2501     ->bunengs rix20150621 ->bunengs a210510     ->bunengs
## [13] aqsss       ->bunengs makimakimaki->bunengs lesnaree2   ->bunengs
## [16] a1091100075 ->bunengs grace132004 ->bunengs tanby       ->bunengs
## [19] soar1983    ->bunengs sd09090     ->bunengs FrankCastle ->bunengs
## [22] demangel    ->bunengs a29976137   ->bunengs uanniy      ->bunengs
## + ... omitted several edges
reviewNetwork <- graph_from_data_frame(d=chen_link, directed=T)
reviewNetwork
## IGRAPH 961dea8 DN-- 1875 2614 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from 961dea8 (vertex names):
##  [1] piercingX   ->bunengs rustic5566  ->bunengs AndyMAX     ->bunengs
##  [4] fcuk9981    ->bunengs rjaws       ->bunengs naly0617    ->bunengs
##  [7] qaz223gy    ->bunengs kissung     ->bunengs em4         ->bunengs
## [10] wsjerry     ->bunengs lolic       ->bunengs SMG2016     ->bunengs
## [13] kenryu      ->bunengs hyperyoujo  ->bunengs zukidelko   ->bunengs
## [16] pchion2002  ->bunengs aidansky0989->bunengs source0209  ->bunengs
## [19] thatislife  ->bunengs shoes922    ->bunengs baek6415    ->bunengs
## [22] takechance  ->bunengs suge        ->bunengs Croy        ->bunengs
## + ... omitted several edges
reviewNetwork <- graph_from_data_frame(d=kp_link, directed=T)
reviewNetwork
## IGRAPH 9626bcb DN-- 23442 159201 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from 9626bcb (vertex names):
##  [1] reina07    ->urban       ga652206   ->urban       roroccc    ->urban      
##  [4] ju06080719 ->urban       beachrabbit->urban       a210510    ->Wojnarowski
##  [7] highyes    ->Wojnarowski BLACKLIONS ->Wojnarowski linqqq007  ->Wojnarowski
## [10] zankuro    ->Wojnarowski victoryman ->Wojnarowski bouedx     ->Wojnarowski
## [13] sdg235     ->Wojnarowski luismars   ->Wojnarowski scum5566   ->Wojnarowski
## [16] a29976137  ->Wojnarowski philip2364 ->Wojnarowski DustToDust ->Wojnarowski
## [19] xiaohan    ->Wojnarowski emuless    ->Wojnarowski sugoi5566  ->Wojnarowski
## [22] j31712     ->Wojnarowski a210510    ->Wojnarowski ooxxman    ->Wojnarowski
## + ... omitted several edges
reviewNetwork <- graph_from_data_frame(d=tsai_link, directed=T)
reviewNetwork
## IGRAPH 9638d9f DN-- 18043 93729 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from 9638d9f (vertex names):
##  [1] ThreekRoger ->lucifax    Findagreen  ->lucifax    loveponpon  ->lucifax   
##  [4] k23         ->lucifax    hunt5566    ->lucifax    ccyaztfe    ->lucifax   
##  [7] syearth     ->lucifax    hesashiaaron->lucifax    scum5566    ->lucifax   
## [10] zzro        ->lucifax    dahlia7357  ->lucifax    a0952864901 ->lucifax   
## [13] ojizz4u     ->lucifax    Beanoodle   ->lucifax    airyptt     ->lucifax   
## [16] GingerMeow  ->lucifax    yilin11     ->lucifax    hy654       ->Janice1998
## [19] ppptttqaz   ->Janice1998 louis5265   ->Janice1998 polestar0505->Janice1998
## [22] firemothra  ->Janice1998 puyo        ->Janice1998 em4         ->Janice1998
## + ... omitted several edges

資料篩選

資料篩選的方式:

  • articles: filtered by post date and comment count (commentNum)
  • links and nodes: filtered by degree (see the short sketch after this list)
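A minimal sketch of degree-based pruning (the threshold of 3 is arbitrary, and this assumes the reviewNetwork object built above):

deg <- degree(reviewNetwork) # number of links touching each user
# keep only accounts connected to at least 3 others
core_net <- induced_subgraph(reviewNetwork, vids = V(reviewNetwork)[deg >= 3])
core_net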

侯友宜

# 看一下留言數大概都多少(方便後面篩選)
hou_posts %>%
 # filter(commentNum<500) %>%
  ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

依據發文數或回覆數篩選post和review

# 帳號發文篇數
post_count = hou_posts %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
post_count
## # A tibble: 235 x 2
##    artPoster    count
##    <chr>        <int>
##  1 myfriend         9
##  2 win8719          7
##  3 Anus9527         6
##  4 kapasky          6
##  5 McCain           6
##  6 tingdou          6
##  7 yoning           6
##  8 cygerger         5
##  9 engxia           5
## 10 haehae311444     5
## # ... with 225 more rows
# 帳號回覆總數
review_count = hou_reviews %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
review_count
## # A tibble: 11,115 x 2
##    cmtPoster   count
##    <chr>       <int>
##  1 TheoEpstein   220
##  2 borriss       152
##  3 ZHKWB         144
##  4 MVPGGYY       116
##  5 myfriend      113
##  6 MauriceHu     108
##  7 win8719       103
##  8 yufion         98
##  9 gogoegg        97
## 10 kterry01       90
## # ... with 11,105 more rows
# 發文者
poster_select <- post_count %>% filter(count >= 2)
hou_posts <- hou_posts %>%  filter(hou_posts$artPoster %in% poster_select$artPoster)

# 回覆者
reviewer_select <- review_count %>%  filter(count >= 20)
hou_reviews <- hou_reviews %>%  filter(hou_reviews$cmtPoster %in% reviewer_select$cmtPoster)
# 檢視參與人數
length(unique(posts_Reviews$artPoster)) # 發文者數量 235
## [1] 235
length(unique(posts_Reviews$cmtPoster)) # 回覆者數量 11115
## [1] 11115
allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster) # 總參與人數 11213
length(unique(allPoster))
## [1] 11213

Label every user who has appeared

  • poster: has published at least one article (may also have commented)
  • replyer: has only commented
# 如果曾發過文標註為poster,反之標註為replyer
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%hou_posts$artPoster, "poster", "replyer"))
head(userList,3)
##       user    type
## 1  bunengs  poster
## 2 kuso2005 replyer
## 3  nianhua  poster
# 篩選回應數和發文次數
table(hou_posts$commentNum>=100)
## 
## FALSE  TRUE 
##   121    54

以日期篩選社群

Since 5/22 drew the most discussion, we pick out that day's articles and comments across the three boards.

link <- posts_Reviews %>%
      group_by(cmtPoster, artUrl) %>% 
      filter(n()>3) %>% 
      filter(commentNum > 200) %>%
      # filter(artCat=="Gossiping") %>% 
      filter(artDate == as.Date('2021-05-22')) %>%
      # 這個順序是因為graph_from_data_frame 有規定(若有方向)第一個欄位是from 第二個欄位是to, 後面的欄位就是描述這個關係的東西
      select(cmtPoster, artPoster, artUrl) %>% 
      unique()
link
## # A tibble: 99 x 3
## # Groups:   cmtPoster, artUrl [99]
##    cmtPoster  artPoster artUrl                                                  
##    <chr>      <chr>     <chr>                                                   
##  1 hosen      xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
##  2 A80211ab   xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
##  3 curryhats  xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
##  4 clkdtm32   xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
##  5 jokebbs    xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
##  6 i376ers    xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
##  7 chiang0829 xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
##  8 trywish    xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
##  9 xager      xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
## 10 s13140709  xingting  https://www.ptt.cc/bbs/Gossiping/M.1621669033.A.120.html
## # ... with 89 more rows

篩選在link裡面有出現的使用者

filtered_user <- userList %>%
          # 篩選link中有出現的使用者
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(filtered_user,3)
##         user    type
## 1   xingting replyer
## 2      monmo replyer
## 3 freeclouds replyer

使用者是否受到歡迎

PTT comments come in three kinds: pushes (推), boos (噓), and neutral arrows (→). Since only the pushes and boos matter here, the arrows are dropped; that leaves fewer rows, so the filtering thresholds are adjusted accordingly.
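Before dropping the arrows, a quick tabulation (a sketch run on the posts_Reviews table built earlier) shows how the three comment types are distributed:

table(posts_Reviews$cmtStatus) # counts of 推 / 噓 / → in the Hou data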

filter_degree = 10 # 使用者degree

# 過濾留言者對發文者的推噓程度
link <- posts_Reviews %>%
      # filter(artCat=="Gossiping") %>% 
      filter(commentNum > 200) %>%
      filter(cmtStatus!="→") %>%
      group_by(cmtPoster, artUrl) %>%
      filter( n() > 3) %>%
      ungroup() %>% 
      select(cmtPoster, artPoster, artUrl, cmtStatus) %>% 
      unique()

# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述

# 篩選link中有出現的使用者
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))

# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")


# colour each edge by the comment's push (推) or boo (噓) status
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")

# 畫出社群網路圖
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

# 加入標示
op <- par(family = "黑體-繁 中黑")
legend("bottomright", c("發文者","回文者"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=1)

par(op)

We can see that this discussion consists almost entirely of pushes, with relatively few boos.
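To put a number on that observation, a small sketch over the link table built above (which retains cmtStatus):

link %>%
  count(cmtStatus) %>% # rows per comment type
  mutate(share = n / sum(n)) # proportion of pushes vs boos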

網路圖

library(networkD3)
links = link
nodes = filtered_user
nodes$id = 0:(length(nodes$user) - 1)

# 整理資料格式
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)]
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1

# 畫圖
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source", 
             Target = "target", NodeID = "nodeID", Group = "group", 
             opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
             
             colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
             linkColour = ifelse(links$cmtStatus == "推", "palegreen","lightcoral")  # 設定推噓顏色
             )
## Links is a tbl_df. Converting to a plain data frame.
# 一號(36天發2篇文)
hou_leader1 = hou_posts %>% filter(artPoster=="uesnba")
summary(hou_leader1)
##    artTitle            artDate             artTime            artUrl         
##  Length:2           Min.   :2021-05-12   Length:2          Length:2          
##  Class :character   1st Qu.:2021-05-17   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-22   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-22   Mode  :numeric                      
##                     3rd Qu.:2021-05-27                                       
##                     Max.   :2021-06-01                                       
##   artPoster            artCat            commentNum          push      
##  Length:2           Length:2           Min.   : 257.0   Min.   :117.0  
##  Class :character   Class :character   1st Qu.: 451.5   1st Qu.:257.8  
##  Mode  :character   Mode  :character   Median : 646.0   Median :398.5  
##                                        Mean   : 646.0   Mean   :398.5  
##                                        3rd Qu.: 840.5   3rd Qu.:539.2  
##                                        Max.   :1035.0   Max.   :680.0  
##       boo           sentence        
##  Min.   : 24.00   Length:2          
##  1st Qu.: 49.25   Class :character  
##  Median : 74.50   Mode  :character  
##  Mean   : 74.50                     
##  3rd Qu.: 99.75                     
##  Max.   :125.00
hou_leader1$artDate = as.Date(hou_leader1$artDate)
hou_leader1= hou_leader1 %>% mutate(days = as.Date(cut(artDate, "days")))
hou_leader1time = hou_leader1 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="uesnba" ) 
# 二號(36天發6篇文)
hou_leader2 = hou_posts %>% filter(artPoster=="tingdou")
summary(hou_leader2)
##    artTitle            artDate             artTime            artUrl         
##  Length:6           Min.   :2021-05-02   Length:6          Length:6          
##  Class :character   1st Qu.:2021-05-14   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-17   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-16   Mode  :numeric                      
##                     3rd Qu.:2021-05-22                                       
##                     Max.   :2021-05-25                                       
##   artPoster            artCat            commentNum           push      
##  Length:6           Length:6           Min.   :  21.00   Min.   :  3.0  
##  Class :character   Class :character   1st Qu.:  24.25   1st Qu.: 11.0  
##  Mode  :character   Mode  :character   Median :  75.00   Median : 42.0  
##                                        Mean   : 287.67   Mean   :141.8  
##                                        3rd Qu.: 230.00   3rd Qu.:100.0  
##                                        Max.   :1266.00   Max.   :644.0  
##       boo           sentence        
##  Min.   :  3.00   Length:6          
##  1st Qu.:  4.25   Class :character  
##  Median :  6.00   Mode  :character  
##  Mean   : 27.83                     
##  3rd Qu.: 25.00                     
##  Max.   :117.00
hou_leader2$artDate = as.Date(hou_leader2$artDate)
hou_leader2= hou_leader2 %>% mutate(days = as.Date(cut(artDate, "days")))
hou_leader2time = hou_leader2 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="tingdou" ) 
# 三號(36天發5篇文)
hou_leader3 = hou_posts %>% filter(artPoster=="haehae311444")
summary(hou_leader3)
##    artTitle            artDate             artTime            artUrl         
##  Length:5           Min.   :2021-05-18   Length:5          Length:5          
##  Class :character   1st Qu.:2021-05-22   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-25   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-23   Mode  :numeric                      
##                     3rd Qu.:2021-05-26                                       
##                     Max.   :2021-05-27                                       
##   artPoster            artCat            commentNum         push    
##  Length:5           Length:5           Min.   :112.0   Min.   : 45  
##  Class :character   Class :character   1st Qu.:120.0   1st Qu.: 52  
##  Mode  :character   Mode  :character   Median :242.0   Median :108  
##                                        Mean   :356.6   Mean   :112  
##                                        3rd Qu.:343.0   3rd Qu.:125  
##                                        Max.   :966.0   Max.   :230  
##       boo      sentence        
##  Min.   :0   Length:5          
##  1st Qu.:0   Class :character  
##  Median :0   Mode  :character  
##  Mean   :0                     
##  3rd Qu.:0                     
##  Max.   :0
hou_leader3$artDate = as.Date(hou_leader3$artDate)
hou_leader3= hou_leader3 %>% mutate(days = as.Date(cut(artDate, "days")))
hou_leader3time = hou_leader3 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="haehae311444" ) 

整合他們的發文趨勢圖

hou_leader = rbind(hou_leader1time,hou_leader2time,hou_leader3time)
hou_leader %>% ggplot(aes(x= days,y=num,fill=poster))  +geom_bar(stat = "identity")+
  facet_wrap(~poster, ncol = 2, scales = "free") 

# 一號
# 先做斷句(以全形或半形驚歎號、問號、分號以及句號爲依據進行斷句)
devotion_sentences <- strsplit(hou_leader1$sentence,"[。!;?!?;]")

# 將每句句子,與所屬的文章連結配對起來,整理成一個dataframe
devotion_sentences <- data.frame(
  id = rep(hou_leader1$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)


# 進行斷詞,並計算各詞彙在各文章中出現的次數
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  wordcloud2()
  # filter(sum>3)  %>%
  # arrange(desc(sum))  %>% wordcloud2(minSize = 3)

(wordcloud output)

# 二號
# 先做斷句(以全形或半形驚歎號、問號、分號以及句號爲依據進行斷句)
devotion_sentences <- strsplit(hou_leader2$sentence,"[。!;?!?;]")

# 將每句句子,與所屬的文章連結配對起來,整理成一個dataframe
devotion_sentences <- data.frame(
  id = rep(hou_leader2$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)


# 進行斷詞,並計算各詞彙在各文章中出現的次數
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  filter(!(word == "一共")) %>% 
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

(wordcloud output)

# 三號
# 先做斷句(以全形或半形驚歎號、問號、分號以及句號爲依據進行斷句)
devotion_sentences <- strsplit(hou_leader3$sentence,"[。!;?!?;]")

# 將每句句子,與所屬的文章連結配對起來,整理成一個dataframe
devotion_sentences <- data.frame(
  id = rep(hou_leader3$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# 進行斷詞,並計算各詞彙在各文章中出現的次數
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  filter(!(word %in% c("必要","一定","一起","發現"))) %>% 
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

(wordcloud output)

P <- read_file("../dict/liwc/positive.txt") # 正向字典txt檔
N <- read_file("../dict/liwc/negative.txt") # 負向字典txt檔
# split each string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047

# 把兩個字典拼在一起
LIWC = rbind(P, N)
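# A toy illustration of how the dictionary is applied (these words are
# hypothetical examples, not necessarily in the real LIWC lists):
# inner_join keeps only tokens found in the dictionary and attaches polarity.
toy_dict <- data.frame(word = c("開心", "生氣"),
                       sentiment = c("positive", "negative"))
toy_tokens <- data.frame(word = c("開心", "今天"))
toy_tokens %>% inner_join(toy_dict, by = "word") # only 開心 survives, tagged positive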
hou_3 = hou_posts %>% filter(artPoster=="uesnba"|artPoster=="tingdou"|artPoster=="haehae311444")
devotion_gram <- hou_3 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer)
devotion_gram
## # A tibble: 2,420 x 10
##    artTitle  artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##    <chr>     <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
##  2 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
##  3 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
##  4 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
##  5 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
##  6 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
##  7 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
##  8 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
##  9 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
## 10 [新聞]疫~ 2021-05-18 08:41:17 https:~ haehae31~ nCoV2~        242   108     0
## # ... with 2,410 more rows, and 1 more variable: word <chr>
sentiment_count = devotion_gram %>%
  select(artDate,word,artPoster) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment,artPoster) %>%
  summarise(count=n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate', 'sentiment'. You can override using the `.groups` argument.
sentiment_count %>%
  mutate(sentiment_count=ifelse(sentiment == "positive",1*count,-1*count)) %>% 
  ggplot(aes(artDate, sentiment_count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~artPoster, ncol = 1, scales = "free_y")

  # # 畫圖的部分
  # ggplot(aes(x= artDate,y=count,fill=sentiment)) +
  # scale_color_manual() +
  # geom_col(position="dodge") + 
  # scale_x_date(labels = date_format("%m/%d")) +
  # labs(title = "sentiment of ptt",color = "情緒類別") +
  # facet_wrap(~artPoster, ncol = 1, scales="free_y")  # scale可以調整比例尺

侯友宜意見領袖

haehae311444, tingdou, uesnba

haehae311444: mostly reposts news and Facebook content related to Hou You-yi; judging from the PTT posts this user approves of Hou's actions, so the sentiment words are largely positive, and the negative ones mostly target other politicians.
tingdou: posts mainly on HatePolitics, so the sentiment words lean clearly negative, chiefly mocking the epidemic-prevention measures.
uesnba: posts few articles, mostly reposted news with images, so positive and negative sentiment are roughly balanced.

陳時中

資料篩選的方式:

  • 文章:文章日期、留言數(commentNum)
  • link、node:degree
# 看一下留言數大概都多少(方便後面篩選)
chen_posts %>%
 # filter(commentNum<500) %>%
  ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

依據發文數或回覆數篩選post和review

# 帳號發文篇數
chen_post_count = chen_posts %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
chen_post_count
## # A tibble: 707 x 2
##    artPoster   count
##    <chr>       <int>
##  1 Pietro         27
##  2 osalucard      19
##  3 joanzkow       17
##  4 chirex         10
##  5 yataiml001     10
##  6 B0858B          9
##  7 djdjdj          9
##  8 Wojnarowski     9
##  9 amilkamilk      8
## 10 ben108472       8
## # ... with 697 more rows
# 帳號回覆總數
chen_review_count = chen_reviews %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
chen_review_count
## # A tibble: 22,864 x 2
##    cmtPoster   count
##    <chr>       <int>
##  1 TheoEpstein  1156
##  2 LI40          480
##  3 test23786     470
##  4 yufion        312
##  5 Pietro        296
##  6 suba1121      286
##  7 demitri       284
##  8 djdjdj        234
##  9 alan0204      233
## 10 zeumax        209
## # ... with 22,854 more rows
# 發文者
chen_poster_select <- chen_post_count %>% filter(count >= 2)
chen_posts <- chen_posts %>%  filter(chen_posts$artPoster %in% chen_poster_select$artPoster)

# 回覆者
chen_reviewer_select <- chen_review_count %>%  filter(count >= 20)
chen_reviews <- chen_reviews %>%  filter(chen_reviews$cmtPoster %in% chen_reviewer_select$cmtPoster)
# 檢視參與人數
length(unique(chen_posts_Reviews$artPoster)) # number of unique posters: 14
## [1] 14
length(unique(chen_posts_Reviews$cmtPoster)) # number of unique commenters: 1864
## [1] 1864
allPoster <- c(chen_posts_Reviews$artPoster, chen_posts_Reviews$cmtPoster) # total participants: 1875
length(unique(allPoster))
## [1] 1875

Label every user who has appeared

  • poster: has published at least one article (may also have commented)
  • replyer: has only commented
# 如果曾發過文標註為poster,反之標註為replyer
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%chen_posts$artPoster, "poster", "replyer"))
head(userList,3)
##      user   type
## 1 bunengs poster
## 2 nianhua poster
## 3 tingdou poster
# 篩選回應數和發文次數
table(chen_posts$commentNum>=100)
## 
## FALSE  TRUE 
##   809   360

以日期篩選社群

Since 5/28 drew the most discussion, we pick out that day's articles and comments across the three boards.

chen_link <- chen_posts_Reviews %>%
      group_by(cmtPoster, artUrl) %>% 
      filter(n()>3) %>% 
      filter(commentNum > 200) %>%
      # filter(artCat=="Gossiping") %>%
      filter(artDate == as.Date('2021-05-28')) %>%
      # 這個順序是因為graph_from_data_frame 有規定(若有方向)第一個欄位是from 第二個欄位是to, 後面的欄位就是描述這個關係的東西
      select(cmtPoster, artPoster, artUrl) %>% 
      unique()
chen_link
## # A tibble: 0 x 3
## # Groups:   cmtPoster, artUrl [0]
## # ... with 3 variables: cmtPoster <chr>, artPoster <chr>, artUrl <chr>

Filter the users who appear in chen_link. Note that with the 5/28 thresholds above chen_link comes back empty, so filtered_user is empty here too; the network below is therefore built from the looser, non-date-restricted links created in the next step.

filtered_user <- userList %>%
          # 篩選link中有出現的使用者
          filter(user%in%chen_link$cmtPoster | user%in%chen_link$artPoster) %>%
          arrange(desc(type))

使用者是否受到歡迎

PTT comments come in three kinds: pushes (推), boos (噓), and neutral arrows (→). Since only the pushes and boos matter here, the arrows are dropped; that leaves fewer rows, so the filtering thresholds are adjusted accordingly.

filter_degree = 12 # 使用者degree

# 過濾留言者對發文者的推噓程度
chen_link <- chen_posts_Reviews %>%
      # filter(artCat=="Gossiping") %>%
      filter(commentNum > 500) %>%
      filter(cmtStatus!="→") %>%
      group_by(cmtPoster, artUrl) %>%
      filter( n() > 3) %>%
      ungroup() %>% 
      select(cmtPoster, artPoster, artUrl, cmtStatus) %>% 
      unique()

# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述

# 篩選link中有出現的使用者
filtered_user <- userList %>%
          filter(user%in%chen_link$cmtPoster | user%in%chen_link$artPoster) %>%
          arrange(desc(type))

# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=chen_link, v=filtered_user, directed=F)

# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")


# colour each edge by the comment's push (推) or boo (噓) status
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")

# 畫出社群網路圖
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

# 加入標示
op <- par(family = "黑體-繁 中黑")
legend("bottomright", c("發文者","回文者"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=1)

par(op)

We can again see that this discussion consists almost entirely of pushes, with relatively few boos.

網路圖

library(networkD3)
links = chen_link
nodes = filtered_user
nodes$id = 0:(length(nodes$user) - 1)

# 整理資料格式
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)]
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1

# 畫圖
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source", 
             Target = "target", NodeID = "nodeID", Group = "group", 
             opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
             
             colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
             linkColour = ifelse(links$cmtStatus == "推", "palegreen","lightcoral")  # 設定推噓顏色
             )
## Links is a tbl_df. Converting to a plain data frame.
# 一號(36天發9篇文)
chen_leader1 = chen_posts %>% filter(artPoster=="Wojnarowski")
summary(chen_leader1)
##    artTitle            artDate             artTime            artUrl         
##  Length:9           Min.   :2021-05-10   Length:9          Length:9          
##  Class :character   1st Qu.:2021-05-22   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-23   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-23   Mode  :numeric                      
##                     3rd Qu.:2021-05-27                                       
##                     Max.   :2021-06-03                                       
##   artPoster            artCat            commentNum         push       
##  Length:9           Length:9           Min.   : 16.0   Min.   :  1.00  
##  Class :character   Class :character   1st Qu.:106.0   1st Qu.: 42.00  
##  Mode  :character   Mode  :character   Median :214.0   Median : 58.00  
##                                        Mean   :293.8   Mean   : 72.78  
##                                        3rd Qu.:500.0   3rd Qu.:112.00  
##                                        Max.   :722.0   Max.   :188.00  
##       boo           sentence        
##  Min.   :  0.00   Length:9          
##  1st Qu.:  6.00   Class :character  
##  Median :  8.00   Mode  :character  
##  Mean   : 64.33                     
##  3rd Qu.: 37.00                     
##  Max.   :420.00
chen_leader1$artDate = as.Date(chen_leader1$artDate)
chen_leader1= chen_leader1 %>% mutate(days = as.Date(cut(artDate, "days")))
chen_leader1time = chen_leader1 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="Wojnarowski" ) 
# 二號(36天發8篇文)
chen_leader2 = chen_posts %>% filter(artPoster=="ocean0817")
summary(chen_leader2)
##    artTitle            artDate             artTime            artUrl         
##  Length:8           Min.   :2021-05-23   Length:8          Length:8          
##  Class :character   1st Qu.:2021-05-27   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-29   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-29   Mode  :numeric                      
##                     3rd Qu.:2021-05-31                                       
##                     Max.   :2021-06-03                                       
##   artPoster            artCat            commentNum          push      
##  Length:8           Length:8           Min.   :  61.0   Min.   : 25.0  
##  Class :character   Class :character   1st Qu.: 122.2   1st Qu.: 44.0  
##  Mode  :character   Mode  :character   Median : 208.5   Median : 75.5  
##                                        Mean   : 437.0   Mean   :114.6  
##                                        3rd Qu.: 632.8   3rd Qu.:149.2  
##                                        Max.   :1438.0   Max.   :346.0  
##       boo      sentence        
##  Min.   :0   Length:8          
##  1st Qu.:0   Class :character  
##  Median :0   Mode  :character  
##  Mean   :0                     
##  3rd Qu.:0                     
##  Max.   :0
chen_leader2$artDate = as.Date(chen_leader2$artDate)
chen_leader2= chen_leader2 %>% mutate(days = as.Date(cut(artDate, "days")))
chen_leader2time = chen_leader2 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="ocean0817" ) 
# 三號(36天發6篇文)
chen_leader3 = chen_posts %>% filter(artPoster=="zkow")
summary(chen_leader3)
##    artTitle            artDate             artTime            artUrl         
##  Length:6           Min.   :2021-05-14   Length:6          Length:6          
##  Class :character   1st Qu.:2021-05-16   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-19   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-21   Mode  :numeric                      
##                     3rd Qu.:2021-05-26                                       
##                     Max.   :2021-06-03                                       
##   artPoster            artCat            commentNum          push       
##  Length:6           Length:6           Min.   :   7.0   Min.   :  2.00  
##  Class :character   Class :character   1st Qu.: 100.2   1st Qu.: 38.75  
##  Mode  :character   Mode  :character   Median : 167.0   Median : 68.00  
##                                        Mean   : 343.3   Mean   : 95.33  
##                                        3rd Qu.: 242.8   3rd Qu.: 86.75  
##                                        Max.   :1373.0   Max.   :312.00  
##       boo         sentence        
##  Min.   : 0.0   Length:6          
##  1st Qu.: 0.0   Class :character  
##  Median : 3.0   Mode  :character  
##  Mean   : 7.5                     
##  3rd Qu.:10.5                     
##  Max.   :27.0
chen_leader3$artDate = as.Date(chen_leader3$artDate)
chen_leader3= chen_leader3 %>% mutate(days = as.Date(cut(artDate, "days")))
chen_leader3time = chen_leader3 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="zkow" ) 

整合他們的發文趨勢圖

chen_leader = rbind(chen_leader1time,chen_leader2time,chen_leader3time)
chen_leader %>% ggplot(aes(x= days,y=num,fill=poster))  +geom_bar(stat = "identity")+
  facet_wrap(~poster, ncol = 2, scales = "free") 

# 一號
# 先做斷句(以全形或半形驚歎號、問號、分號以及句號爲依據進行斷句)
devotion_sentences <- strsplit(chen_leader1$sentence,"[。!;?!?;]")

# 將每句句子,與所屬的文章連結配對起來,整理成一個dataframe
devotion_sentences <- data.frame(
  id = rep(chen_leader1$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)


# 進行斷詞,並計算各詞彙在各文章中出現的次數
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
devotion_words %>%
  filter(!(word == "陳時中")) %>% 
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>2)  %>%
  arrange(desc(sum))  %>% wordcloud2(minSize = 3)

(wordcloud output)

# 二號
# 先做斷句(以全形或半形驚歎號、問號、分號以及句號爲依據進行斷句)
devotion_sentences <- strsplit(chen_leader2$sentence,"[。!;?!?;]")

# 將每句句子,與所屬的文章連結配對起來,整理成一個dataframe
devotion_sentences <- data.frame(
  id = rep(chen_leader2$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)


# 進行斷詞,並計算各詞彙在各文章中出現的次數
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  filter(!(word == "一共")) %>% 
  count(id, word, sort = TRUE)
devotion_words %>%
  filter(!(word == "陳時中")) %>% 
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

(wordcloud output)

# 三號
# 先做斷句(以全形或半形驚歎號、問號、分號以及句號爲依據進行斷句)
devotion_sentences <- strsplit(chen_leader3$sentence,"[。!;?!?;]")

# 將每句句子,與所屬的文章連結配對起來,整理成一個dataframe
devotion_sentences <- data.frame(
  id = rep(chen_leader3$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# 進行斷詞,並計算各詞彙在各文章中出現的次數
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  # filter(!(word %in% c("必要","一定","一起","發現"))) %>% 
  count(id, word, sort = TRUE)
devotion_words %>%
  filter(!(word == "陳時中")) %>% 
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

(wordcloud output)

chen_3 = chen_posts %>% filter(artPoster=="Wojnarowski"|artPoster=="ocean0817"|artPoster=="zkow")
devotion_gram <- chen_3 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer)
devotion_gram
## # A tibble: 4,988 x 10
##    artTitle  artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##    <chr>     <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
##  2 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
##  3 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
##  4 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
##  5 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
##  6 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
##  7 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
##  8 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
##  9 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
## 10 [新聞]費~ 2021-05-10 03:53:18 https:~ Wojnarow~ HateP~        214    58    91
## # ... with 4,978 more rows, and 1 more variable: word <chr>
sentiment_count = devotion_gram %>%
  select(artDate,word,artPoster) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment,artPoster) %>%
  summarise(count=n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate', 'sentiment'. You can override using the `.groups` argument.
sentiment_count %>%
  mutate(sentiment_count=ifelse(sentiment == "positive",1*count,-1*count)) %>% 
  ggplot(aes(artDate, sentiment_count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~artPoster, ncol = 1, scales = "free_y")

  # # 畫圖的部分
  # ggplot(aes(x= artDate,y=count,fill=sentiment)) +
  # scale_color_manual() +
  # geom_col(position="dodge") + 
  # scale_x_date(labels = date_format("%m/%d")) +
  # labs(title = "sentiment of ptt",color = "情緒類別") +
  # facet_wrap(~artPoster, ncol = 1, scales="free_y")  # scale可以調整比例尺

陳時中意見領袖

ocean0817, Wojnarowski, zkow

ocean0817: posts mainly on nCoV2019 and ends every article with 「推文前請詳閱板規,請勿謾罵、引戰、張貼非允許媒體之連結」, so negative sentiment words are rare; average pushes: 114, average boos: 0.
Wojnarowski: mainly reposts news on HatePolitics with his own commentary appended, hence mostly negative sentiment; average pushes: 72, average boos: 64.
zkow: mainly posts question threads on Gossiping, so positive and negative sentiment are fairly balanced; average pushes: 95, average boos: 7.
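The average push/boo figures quoted above can be reproduced with a short sketch (assuming the push and boo columns shown in the earlier summaries):

chen_posts %>%
  filter(artPoster %in% c("ocean0817", "Wojnarowski", "zkow")) %>%
  group_by(artPoster) %>%
  summarise(avg_push = mean(push), avg_boo = mean(boo)) # per-leader averages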

柯文哲

資料篩選的方式:

  • 文章:文章日期、留言數(commentNum)
  • link、node:degree
# 看一下留言數大概都多少(方便後面篩選)
kp_posts %>%
 # filter(commentNum<500) %>%
  ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

依據發文數或回覆數篩選post和review

# 帳號發文篇數
post_count = kp_posts %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
post_count
## # A tibble: 649 x 2
##    artPoster  count
##    <chr>      <int>
##  1 xamous        30
##  2 sunyeah       28
##  3 TeacherLin    16
##  4 godroid       13
##  5 tml7415       11
##  6 a15661263     10
##  7 Pietro        10
##  8 yniori        10
##  9 ffreakk        9
## 10 kcbill         9
## # ... with 639 more rows
# 帳號回覆總數
review_count = kp_reviews %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
review_count
## # A tibble: 23,248 x 2
##    cmtPoster    count
##    <chr>        <int>
##  1 Tiphareth      480
##  2 black205       438
##  3 vow70          437
##  4 quiet93        351
##  5 nike00000000   344
##  6 birdy590       283
##  7 MVPGGYY        277
##  8 creay222       275
##  9 rustic5566     270
## 10 romber         268
## # ... with 23,238 more rows
# 發文者
poster_select <- post_count %>% filter(count >= 2)
kp_posts <- kp_posts %>%  filter(kp_posts$artPoster %in% poster_select$artPoster)

# 回覆者
reviewer_select <- review_count %>%  filter(count >= 20)
kp_reviews <- kp_reviews %>%  filter(kp_reviews$cmtPoster %in% reviewer_select$cmtPoster)
# 檢視參與人數
length(unique(kp_posts_Reviews$artPoster)) # 發文者數量 648
## [1] 648
length(unique(kp_posts_Reviews$cmtPoster)) # 回覆者數量 23248
## [1] 23248
allPoster <- c(kp_posts_Reviews$artPoster, kp_posts_Reviews$cmtPoster) # 總參與人數 23442
length(unique(allPoster))
## [1] 23442

Label every user who has appeared

  • poster: has published at least one article (may also have commented)
  • replyer: has only commented
# 如果曾發過文標註為poster,反之標註為replyer
kp_userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%kp_posts$artPoster, "poster", "replyer"))
# 篩選回應數和發文次數
table(kp_posts$commentNum>=100)
## 
## FALSE  TRUE 
##   585   217

以日期篩選社群

Since 5/20 drew the most discussion, we pick out that day's articles and comments across the three boards.

link <- kp_posts_Reviews %>%
      group_by(cmtPoster, artUrl) %>% 
      filter(n()>3) %>% 
      filter(commentNum > 200) %>%
      # filter(artCat=="Gossiping") %>% 
      filter(artDate == as.Date('2021-05-20')) %>%
      # 這個順序是因為graph_from_data_frame 有規定(若有方向)第一個欄位是from 第二個欄位是to, 後面的欄位就是描述這個關係的東西
      select(cmtPoster, artPoster, artUrl) %>% 
      unique()
link
## # A tibble: 135 x 3
## # Groups:   cmtPoster, artUrl [135]
##    cmtPoster   artPoster artUrl                                                 
##    <chr>       <chr>     <chr>                                                  
##  1 Behind4     humbler   https://www.ptt.cc/bbs/Gossiping/M.1621491305.A.17F.ht~
##  2 Hall        humbler   https://www.ptt.cc/bbs/Gossiping/M.1621491305.A.17F.ht~
##  3 vow70       humbler   https://www.ptt.cc/bbs/Gossiping/M.1621491305.A.17F.ht~
##  4 cosmos506   humbler   https://www.ptt.cc/bbs/Gossiping/M.1621491305.A.17F.ht~
##  5 loveponpon  humbler   https://www.ptt.cc/bbs/Gossiping/M.1621491305.A.17F.ht~
##  6 Danto18     humbler   https://www.ptt.cc/bbs/Gossiping/M.1621491305.A.17F.ht~
##  7 protect6090 humbler   https://www.ptt.cc/bbs/Gossiping/M.1621491305.A.17F.ht~
##  8 novashine   zzahoward https://www.ptt.cc/bbs/Gossiping/M.1621491573.A.55C.ht~
##  9 hohohoha    zzahoward https://www.ptt.cc/bbs/Gossiping/M.1621491573.A.55C.ht~
## 10 yehpi       zzahoward https://www.ptt.cc/bbs/Gossiping/M.1621491573.A.55C.ht~
## # ... with 125 more rows

篩選在link裡面有出現的使用者

kp_filtered_user <- kp_userList %>%
          # 篩選link中有出現的使用者
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(kp_filtered_user,3)
##           user    type
## 1      Behind4 replyer
## 2     s9234032 replyer
## 3 johnny790218 replyer

使用者是否受到歡迎

PTT comments come in three kinds: pushes (推), boos (噓), and neutral arrows (→). Since only the pushes and boos matter here, the arrows are dropped; that leaves fewer rows, so the filtering thresholds are adjusted accordingly.

filter_degree = 10 # 使用者degree

# 過濾留言者對發文者的推噓程度
link <- kp_posts_Reviews %>%
      # filter(artCat=="Gossiping") %>% 
      filter(commentNum > 1000) %>%
      filter(cmtStatus!="→") %>%
      group_by(cmtPoster, artUrl) %>%
      filter( n() > 3) %>%
      ungroup() %>% 
      select(cmtPoster, artPoster, artUrl, cmtStatus) %>% 
      unique()

# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述

# 篩選link中有出現的使用者
kp_filtered_user <- kp_userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))

# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=kp_filtered_user, directed=F)

# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")


# colour each edge by the comment's push (推) or boo (噓) status
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")

# 畫出社群網路圖
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

# 加入標示
op <- par(family = "黑體-繁 中黑")
legend("bottomright", c("發文者","回文者"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=1)

par(op)

We can see that pushes clearly dominate this discussion, with very few boos.

網路圖

library(networkD3)
links = link
nodes = kp_filtered_user
nodes$id = 0:(length(nodes$user) - 1)

# 整理資料格式
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)]
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1

# 畫圖
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source", 
             Target = "target", NodeID = "nodeID", Group = "group", 
             opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
             
             colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
             linkColour = ifelse(links$cmtStatus == "推", "palegreen","lightcoral")  # 設定推噓顏色
             )
## Links is a tbl_df. Converting to a plain data frame.
# 一號(21天發9篇文)
kp_leader1 = kp_posts %>% filter(artPoster=="kcbill")
summary(kp_leader1)
##    artTitle            artDate             artTime            artUrl         
##  Length:9           Min.   :2021-05-14   Length:9          Length:9          
##  Class :character   1st Qu.:2021-05-17   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-18   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-22   Mode  :numeric                      
##                     3rd Qu.:2021-05-29                                       
##                     Max.   :2021-06-03                                       
##   artPoster            artCat            commentNum        push      
##  Length:9           Length:9           Min.   :  49   Min.   : 10.0  
##  Class :character   Class :character   1st Qu.:  78   1st Qu.: 29.0  
##  Mode  :character   Mode  :character   Median : 109   Median : 48.0  
##                                        Mean   : 263   Mean   :111.8  
##                                        3rd Qu.: 196   3rd Qu.: 77.0  
##                                        Max.   :1217   Max.   :614.0  
##       boo           sentence        
##  Min.   :  0.00   Length:9          
##  1st Qu.:  2.00   Class :character  
##  Median :  9.00   Mode  :character  
##  Mean   : 19.89                     
##  3rd Qu.: 15.00                     
##  Max.   :117.00
kp_leader1$artDate = as.Date(kp_leader1$artDate)
kp_leader1= kp_leader1 %>% mutate(days = as.Date(cut(artDate, "days")))
kp_leader1time = kp_leader1 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="kcbill" ) 
# Leader 2 (5 posts in 25 days)
kp_leader2 = kp_posts %>% filter(artPoster=="KZS")
summary(kp_leader2)
##    artTitle            artDate             artTime            artUrl         
##  Length:5           Min.   :2021-05-05   Length:5          Length:5          
##  Class :character   1st Qu.:2021-05-09   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-12   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-15   Mode  :numeric                      
##                     3rd Qu.:2021-05-19                                       
##                     Max.   :2021-05-30                                       
##   artPoster            artCat            commentNum          push      
##  Length:5           Length:5           Min.   : 308.0   Min.   :154.0  
##  Class :character   Class :character   1st Qu.: 382.0   1st Qu.:198.0  
##  Mode  :character   Mode  :character   Median : 410.0   Median :248.0  
##                                        Mean   : 629.4   Mean   :338.4  
##                                        3rd Qu.: 613.0   3rd Qu.:291.0  
##                                        Max.   :1434.0   Max.   :801.0  
##       boo        sentence        
##  Min.   : 33   Length:5          
##  1st Qu.: 38   Class :character  
##  Median : 43   Mode  :character  
##  Mean   : 72                     
##  3rd Qu.: 65                     
##  Max.   :181
kp_leader2$artDate = as.Date(kp_leader2$artDate)
kp_leader2= kp_leader2 %>% mutate(days = as.Date(cut(artDate, "days")))
kp_leader2time = kp_leader2 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="KZS" ) 
# Leader 3 (6 posts in 26 days)
kp_leader3 = kp_posts %>% filter(artPoster=="sunchen0201")
summary(kp_leader3)
##    artTitle            artDate             artTime            artUrl         
##  Length:6           Min.   :2021-05-03   Length:6          Length:6          
##  Class :character   1st Qu.:2021-05-13   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-18   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-17   Mode  :numeric                      
##                     3rd Qu.:2021-05-21                                       
##                     Max.   :2021-05-29                                       
##   artPoster            artCat            commentNum          push      
##  Length:6           Length:6           Min.   :  39.0   Min.   : 17.0  
##  Class :character   Class :character   1st Qu.:  49.0   1st Qu.: 22.0  
##  Mode  :character   Mode  :character   Median : 568.0   Median :337.5  
##                                        Mean   : 596.2   Mean   :384.7  
##                                        3rd Qu.:1128.2   3rd Qu.:725.8  
##                                        Max.   :1211.0   Max.   :844.0  
##       boo          sentence        
##  Min.   : 2.00   Length:6          
##  1st Qu.: 4.00   Class :character  
##  Median :34.00   Mode  :character  
##  Mean   :37.33                     
##  3rd Qu.:64.75                     
##  Max.   :85.00
kp_leader3$artDate = as.Date(kp_leader3$artDate)
kp_leader3= kp_leader3 %>% mutate(days = as.Date(cut(artDate, "days")))
kp_leader3time = kp_leader3 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="sunchen0201" ) 

Combine their posting trends into one chart

kp_leader = rbind(kp_leader1time,kp_leader2time,kp_leader3time)
kp_leader %>% ggplot(aes(x= days,y=num,fill=poster))  +geom_bar(stat = "identity")+
  facet_wrap(~poster, ncol = 2, scales = "free") 

# Leader 1
# Split into sentences (on full- or half-width exclamation marks, question marks, semicolons, and periods)
kp_devotion_sentences <- strsplit(kp_leader1$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of its source article and collect them into a dataframe
kp_devotion_sentences <- data.frame(
  id = rep(kp_leader1$artUrl,sapply(kp_devotion_sentences, length)), 
  sentence = unlist(kp_devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
kp_devotion_sentences$sentence <- as.character(kp_devotion_sentences$sentence)


# Tokenize and count how often each word appears in each article
devotion_words <- kp_devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  filter(!(word == "柯文哲")) %>%
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

[wordcloud: kcbill]
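Rather than filtering out unwanted words one by one after tokenization, jiebaR can drop them at segmentation time via a stop-word file. A sketch, assuming a hypothetical file `stop_words.txt` with one word per line; `chi_tokenizer_sw` mirrors the `chi_tokenizer` used above (assumed here to keep tokens longer than one character):

# Write a small stop-word list (hypothetical) and build a worker that uses it
writeLines(c("柯文哲", "一定", "一起", "發現"), "stop_words.txt")
seg_worker <- worker(stop_word = "stop_words.txt")

# Tokenizer that keeps tokens longer than one character
chi_tokenizer_sw <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, seg_worker)
    tokens[nchar(tokens) > 1]
  })
}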

# Leader 2
# Split into sentences (on full- or half-width exclamation marks, question marks, semicolons, and periods)
kp_devotion_sentences <- strsplit(kp_leader2$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of its source article and collect them into a dataframe
kp_devotion_sentences <- data.frame(
  id = rep(kp_leader2$artUrl,sapply(kp_devotion_sentences, length)), 
  sentence = unlist(kp_devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
kp_devotion_sentences$sentence <- as.character(kp_devotion_sentences$sentence)


# Tokenize and count how often each word appears in each article
devotion_words <- kp_devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  filter(!(word == "一共")) %>% 
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

[wordcloud: KZS]

# Leader 3
# Split into sentences (on full- or half-width exclamation marks, question marks, semicolons, and periods)
kp_devotion_sentences <- strsplit(kp_leader3$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of its source article and collect them into a dataframe
kp_devotion_sentences <- data.frame(
  id = rep(kp_leader3$artUrl,sapply(kp_devotion_sentences, length)), 
  sentence = unlist(kp_devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
kp_devotion_sentences$sentence <- as.character(kp_devotion_sentences$sentence)

# Tokenize and count how often each word appears in each article
devotion_words <- kp_devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  filter(!(word %in% c("必要","一定","一起","發現","柯文哲"))) %>% 
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

[wordcloud: sunchen0201]

kp_3 = kp_posts %>% filter(artPoster=="kcbill"|artPoster=="KZS"|artPoster=="sunchen0201")
kp_devotion_gram <- kp_3 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer)
kp_devotion_gram
## # A tibble: 3,826 x 10
##    artTitle  artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##    <chr>     <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
##  2 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
##  3 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
##  4 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
##  5 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
##  6 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
##  7 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
##  8 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
##  9 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
## 10 [討論]柯~ 2021-05-14 12:43:19 https:~ kcbill    HateP~         78    48     2
## # ... with 3,816 more rows, and 1 more variable: word <chr>
sentiment_count = kp_devotion_gram %>%
  select(artDate,word,artPoster) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment,artPoster) %>%
  summarise(count=n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate', 'sentiment'. You can override using the `.groups` argument.
sentiment_count %>%
  mutate(sentiment_count=ifelse(sentiment == "positive",1*count,-1*count)) %>% 
  ggplot(aes(artDate, sentiment_count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~artPoster, ncol = 1, scales = "free_y")

  # # Alternative plotting code
  # ggplot(aes(x= artDate,y=count,fill=sentiment)) +
  # scale_color_manual() +
  # geom_col(position="dodge") + 
  # scale_x_date(labels = date_format("%m/%d")) +
  # labs(title = "sentiment of ptt",color = "sentiment type") +
  # facet_wrap(~artPoster, ncol = 1, scales="free_y")  # "scales" adjusts each facet's axis range
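The faceted bars show daily swings; a net score per poster (positive minus negative word counts) summarises the same `sentiment_count` in one row per leader. A minimal sketch, assuming the LIWC labels are "positive"/"negative" as used above:

# Net sentiment per opinion leader: positive minus negative word counts
sentiment_count %>%
  group_by(artPoster, sentiment) %>%
  summarise(total = sum(count), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = total, values_fill = 0) %>%
  mutate(net = positive - negative)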

Ko Wen-je's opinion leaders

kcbill, KZS, sunchen0201

kcbill mostly posts discussion threads (討論) on the HatePolitics board, averaging 111 pushes and 19 boos per post.
KZS posts question threads (問卦) on the Gossiping board, averaging 338 pushes and 72 boos. Both mainly comment on the epidemic itself and rarely use sentiment-laden words.
sunchen0201 mainly posts news with images and closes every post with 「我們社會如此可恥,而我的國家滿是傷痛」 ("our society is so shameful, and my country is full of wounds"), so negative sentiment words outnumber positive ones in each of his posts; he averages 384 pushes and 37 boos.
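These averages can be reproduced straight from the data; a quick check on `kp_3` (defined above):

# Average pushes and boos per Ko opinion leader
kp_3 %>%
  group_by(artPoster) %>%
  summarise(posts = n(), mean_push = mean(push), mean_boo = mean(boo))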

Tsai Ing-wen

How the data is filtered:

  • articles: by date and by number of comments (commentNum)
  • link, node: by degree
# Check the typical comment counts (to make the later filtering easier)
tsai_posts %>%
 # filter(commentNum<500) %>%
  ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
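Exact cut points are hard to read off a histogram; quantiles give them directly and help justify the commentNum thresholds used below:

# Distribution of comment counts at a few quantiles
quantile(tsai_posts$commentNum, probs = c(0.5, 0.75, 0.9, 0.99))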

Filter posts and reviews by posting and reply counts

# Number of posts per account
post_count = tsai_posts %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
post_count
## # A tibble: 435 x 2
##    artPoster   count
##    <chr>       <int>
##  1 iamtony        35
##  2 Pattaya        15
##  3 tontontonni    11
##  4 B0858B          8
##  5 markban         8
##  6 EVEA            7
##  7 omanorboyo      7
##  8 zzyyxx77        7
##  9 Jesusmaycry     6
## 10 jordanlove      6
## # ... with 425 more rows
# Total number of replies per account
review_count = tsai_reviews %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
review_count
## # A tibble: 17,875 x 2
##    cmtPoster   count
##    <chr>       <int>
##  1 TheoEpstein   232
##  2 Annis812      184
##  3 TZUYIC        166
##  4 waijr         157
##  5 saltlake      154
##  6 trywish       152
##  7 rustic5566    140
##  8 BaRanKa       135
##  9 A80211ab      134
## 10 dai26         134
## # ... with 17,865 more rows
# Posters
poster_select <- post_count %>% filter(count >= 2)
tsai_posts <- tsai_posts %>%  filter(tsai_posts$artPoster %in% poster_select$artPoster)

# Repliers
reviewer_select <- review_count %>%  filter(count >= 20)
tsai_reviews <- tsai_reviews %>%  filter(tsai_reviews$cmtPoster %in% reviewer_select$cmtPoster)
# Check the number of participants
length(unique(tsai_posts_Reviews$artPoster)) # number of posters: 432
## [1] 432
length(unique(tsai_posts_Reviews$cmtPoster)) # number of repliers: 17875
## [1] 17875
allPoster <- c(tsai_posts_Reviews$artPoster, tsai_posts_Reviews$cmtPoster) # total participants: 18043
length(unique(allPoster))
## [1] 18043
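The three counts are consistent: 432 + 17,875 − 18,043 = 264 accounts appear in both roles. A quick check:

# Accounts that both posted articles and left comments
both_roles <- intersect(unique(tsai_posts_Reviews$artPoster),
                        unique(tsai_posts_Reviews$cmtPoster))
length(both_roles)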

Label every user that has appeared

  • poster: has posted articles (with or without comments)
  • replyer: has only commented
# Users who have posted are labelled "poster"; everyone else is a "replyer"
tsai_userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%tsai_posts$artPoster, "poster", "replyer"))
head(tsai_userList,3)
##         user    type
## 1    lucifax replyer
## 2 Janice1998 replyer
## 3   go190214 replyer
# Filter on reply counts and post counts
table(tsai_posts$commentNum>=100)
## 
## FALSE  TRUE 
##   279   103

Filter the community by date

Since 5/31 saw the highest discussion volume, we pick out that day's articles and comments across the three boards and take a look.
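Before building the network, it is worth confirming how that day's posts spread across the three boards; a quick tally:

# Number of Tsai posts per board on 2021-05-31
tsai_posts %>%
  mutate(artDate = as.Date(artDate)) %>%
  filter(artDate == as.Date("2021-05-31")) %>%
  count(artCat)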

link <- tsai_posts_Reviews %>%
      group_by(cmtPoster, artUrl) %>% 
      filter(n()>3) %>% 
      filter(commentNum > 200) %>%
      # filter(artCat=="Gossiping") %>% 
      filter(artDate == as.Date('2021-05-31')) %>%
      # Column order matters: graph_from_data_frame requires (when directed) that the first column is "from" and the second is "to"; remaining columns describe the relationship
      select(cmtPoster, artPoster, artUrl) %>% 
      unique()
link
## # A tibble: 231 x 3
## # Groups:   cmtPoster, artUrl [231]
##    cmtPoster   artPoster artUrl                                                 
##    <chr>       <chr>     <chr>                                                  
##  1 chinhan1216 homru     https://www.ptt.cc/bbs/Gossiping/M.1622440004.A.F1D.ht~
##  2 violadepp   homru     https://www.ptt.cc/bbs/Gossiping/M.1622440004.A.F1D.ht~
##  3 kingstongyu homru     https://www.ptt.cc/bbs/Gossiping/M.1622440004.A.F1D.ht~
##  4 gust0985    homru     https://www.ptt.cc/bbs/Gossiping/M.1622440004.A.F1D.ht~
##  5 sam8921502  homru     https://www.ptt.cc/bbs/Gossiping/M.1622440004.A.F1D.ht~
##  6 newmp4      v929598   https://www.ptt.cc/bbs/Gossiping/M.1622444176.A.62F.ht~
##  7 pal1231     v929598   https://www.ptt.cc/bbs/Gossiping/M.1622444176.A.62F.ht~
##  8 treeman47   v929598   https://www.ptt.cc/bbs/Gossiping/M.1622444176.A.62F.ht~
##  9 other9343   ipipwrong https://www.ptt.cc/bbs/Gossiping/M.1622445770.A.A18.ht~
## 10 LaoEr       ipipwrong https://www.ptt.cc/bbs/Gossiping/M.1622445770.A.A18.ht~
## # ... with 221 more rows

Filter the users that appear in link

tsai_filtered_user <- tsai_userList %>%
          # keep only users that appear in link
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))

Are users well received?

PTT comments come in three types: pushes (推), boos (噓), and neutral arrows (→). We only need the pushes and boos, so we drop the arrows; since this leaves fewer records, we relax the filtering thresholds a little.

filter_degree = 10 # user degree threshold

# Filter comments by how commenters push/boo each poster
tsai_link <- tsai_posts_Reviews %>%
      # filter(artCat=="Gossiping") %>% 
      filter(commentNum > 500) %>%
      filter(cmtStatus!="→") %>%
      group_by(cmtPoster, artUrl) %>%
      filter( n() > 3) %>%
      ungroup() %>% 
      select(cmtPoster, artPoster, artUrl, cmtStatus) %>% 
      unique()

# Next we draw the network graph; the steps are the same as before, so we won't repeat the details

# keep only users that appear in link
tsai_filtered_user <- tsai_userList %>%
          filter(user%in%tsai_link$cmtPoster | user%in%tsai_link$artPoster) %>%
          arrange(desc(type))

# Build the network
reviewNetwork <- graph_from_data_frame(d=tsai_link, v=tsai_filtered_user, directed=F)

# Colour the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")


# Colour the edges by comment status: push (推) vs. boo (噓)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")

# Draw the social network graph
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

# Add the legend
op <- par(family = "黑體-繁 中黑")
legend("bottomright", c("發文者","回文者"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
## Warning in strwidth(legend, units = "user", cex = cex, font = text.font): font
## family not found in Windows font database
legend("topleft", c("推","噓"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=1)

par(op)

Pushes again clearly dominate this discussion.
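Instead of eyeballing the labelled vertices, degree centrality can be sorted to name the hubs directly; a minimal sketch on the `reviewNetwork` built above:

# Ten highest-degree users in the Tsai network
head(sort(degree(reviewNetwork), decreasing = TRUE), 10)

# Restrict to posters only, using the vertex attribute set above
poster_deg <- degree(reviewNetwork)[V(reviewNetwork)$type == "poster"]
head(sort(poster_deg, decreasing = TRUE), 5)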

library(networkD3)
links = tsai_link
nodes = tsai_filtered_user
nodes$id = 0:(length(nodes$user) - 1)

# Prepare the data format
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)]
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1

# Draw the interactive graph (networkD3 was already loaded above)
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source", 
             Target = "target", NodeID = "nodeID", Group = "group", 
             opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
             
             colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
             linkColour = ifelse(links$cmtStatus == "推", "palegreen","lightcoral")  # push/boo colours
             )
## Links is a tbl_df. Converting to a plain data frame.

Tsai Ing-wen's opinion leaders

# Leader 1 (4 posts in 22 days)
tsai_leader1 = tsai_posts %>% filter(artPoster=="BlueBird5566")
summary(tsai_leader1)
##    artTitle            artDate             artTime            artUrl         
##  Length:4           Min.   :2021-05-09   Length:4          Length:4          
##  Class :character   1st Qu.:2021-05-15   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-18   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-19   Mode  :numeric                      
##                     3rd Qu.:2021-05-21                                       
##                     Max.   :2021-05-31                                       
##   artPoster            artCat            commentNum         push      
##  Length:4           Length:4           Min.   :  7.0   Min.   : 4.00  
##  Class :character   Class :character   1st Qu.: 35.5   1st Qu.:21.25  
##  Mode  :character   Mode  :character   Median : 73.0   Median :43.00  
##                                        Mean   :125.2   Mean   :45.25  
##                                        3rd Qu.:162.8   3rd Qu.:67.00  
##                                        Max.   :348.0   Max.   :91.00  
##       boo          sentence        
##  Min.   : 0.00   Length:4          
##  1st Qu.: 1.50   Class :character  
##  Median : 7.00   Mode  :character  
##  Mean   :14.25                     
##  3rd Qu.:19.75                     
##  Max.   :43.00
tsai_leader1$artDate = as.Date(tsai_leader1$artDate)
tsai_leader1= tsai_leader1 %>% mutate(days = as.Date(cut(artDate, "days")))
tsai_leader1time = tsai_leader1 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="BlueBird5566" ) 
# Leader 2 (2 posts in 9 days)
tsai_leader2 = tsai_posts %>% filter(artPoster=="kivan00")
summary(tsai_leader2)
##    artTitle            artDate             artTime            artUrl         
##  Length:2           Min.   :2021-05-18   Length:2          Length:2          
##  Class :character   1st Qu.:2021-05-20   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-22   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-22   Mode  :numeric                      
##                     3rd Qu.:2021-05-24                                       
##                     Max.   :2021-05-27                                       
##   artPoster            artCat            commentNum          push       
##  Length:2           Length:2           Min.   : 296.0   Min.   : 55.00  
##  Class :character   Class :character   1st Qu.: 586.2   1st Qu.: 83.75  
##  Mode  :character   Mode  :character   Median : 876.5   Median :112.50  
##                                        Mean   : 876.5   Mean   :112.50  
##                                        3rd Qu.:1166.8   3rd Qu.:141.25  
##                                        Max.   :1457.0   Max.   :170.00  
##       boo          sentence        
##  Min.   :142.0   Length:2          
##  1st Qu.:311.5   Class :character  
##  Median :481.0   Mode  :character  
##  Mean   :481.0                     
##  3rd Qu.:650.5                     
##  Max.   :820.0
tsai_leader2$artDate = as.Date(tsai_leader2$artDate)
tsai_leader2= tsai_leader2 %>% mutate(days = as.Date(cut(artDate, "days")))
tsai_leader2time = tsai_leader2 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="kivan00" ) 
# Leader 3 (5 posts in 26 days)
tsai_leader3 = tsai_posts %>% filter(artPoster=="iammatrix")
summary(tsai_leader3)
##    artTitle            artDate             artTime            artUrl         
##  Length:5           Min.   :2021-05-05   Length:5          Length:5          
##  Class :character   1st Qu.:2021-05-18   Class1:hms        Class :character  
##  Mode  :character   Median :2021-05-20   Class2:difftime   Mode  :character  
##                     Mean   :2021-05-21   Mode  :numeric                      
##                     3rd Qu.:2021-05-31                                       
##                     Max.   :2021-05-31                                       
##   artPoster            artCat            commentNum         push      
##  Length:5           Length:5           Min.   : 43.0   Min.   : 24.0  
##  Class :character   Class :character   1st Qu.: 80.0   1st Qu.: 30.0  
##  Mode  :character   Mode  :character   Median :223.0   Median :149.0  
##                                        Mean   :240.8   Mean   :144.4  
##                                        3rd Qu.:343.0   3rd Qu.:204.0  
##                                        Max.   :515.0   Max.   :315.0  
##       boo         sentence        
##  Min.   : 1.0   Length:5          
##  1st Qu.: 2.0   Class :character  
##  Median : 8.0   Mode  :character  
##  Mean   :13.4                     
##  3rd Qu.:23.0                     
##  Max.   :33.0
tsai_leader3$artDate = as.Date(tsai_leader3$artDate)
tsai_leader3= tsai_leader3 %>% mutate(days = as.Date(cut(artDate, "days")))
tsai_leader3time = tsai_leader3 %>%group_by(days) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="iammatrix" ) 
# Combine their posting trends into one chart
tsai_leader = rbind(tsai_leader1time,tsai_leader2time,tsai_leader3time)
tsai_leader %>% ggplot(aes(x= days,y=num,fill=poster))  +geom_bar(stat = "identity")+
  facet_wrap(~poster, ncol = 2, scales = "free") 

# Leader 1
# Split into sentences (on full- or half-width exclamation marks, question marks, semicolons, and periods)
tsai_devotion_sentences <- strsplit(tsai_leader1$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of its source article and collect them into a dataframe
tsai_devotion_sentences <- data.frame(
  id = rep(tsai_leader1$artUrl,sapply(tsai_devotion_sentences, length)), 
  sentence = unlist(tsai_devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
tsai_devotion_sentences$sentence <- as.character(tsai_devotion_sentences$sentence)


# Tokenize and count how often each word appears in each article
devotion_words <- tsai_devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

[wordcloud: BlueBird5566]

# Leader 2
# Split into sentences (on full- or half-width exclamation marks, question marks, semicolons, and periods)
tsai_devotion_sentences <- strsplit(tsai_leader2$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of its source article and collect them into a dataframe
tsai_devotion_sentences <- data.frame(
  id = rep(tsai_leader2$artUrl,sapply(tsai_devotion_sentences, length)), 
  sentence = unlist(tsai_devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
tsai_devotion_sentences$sentence <- as.character(tsai_devotion_sentences$sentence)


# Tokenize and count how often each word appears in each article
devotion_words <- tsai_devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  filter(!(word %in% c("蔡英文","必泰","並在"))) %>% 
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

[wordcloud: kivan00]

# Leader 3
# Split into sentences (on full- or half-width exclamation marks, question marks, semicolons, and periods)
tsai_devotion_sentences <- strsplit(tsai_leader3$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of its source article and collect them into a dataframe
tsai_devotion_sentences <- data.frame(
  id = rep(tsai_leader3$artUrl,sapply(tsai_devotion_sentences, length)), 
  sentence = unlist(tsai_devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
tsai_devotion_sentences$sentence <- as.character(tsai_devotion_sentences$sentence)

# Tokenize and count how often each word appears in each article
devotion_words <- tsai_devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  filter(!(word %in% c("出一","不算","一起","一點","蔡英文","一堆","被表裡"))) %>% 
  count(id, word, sort = TRUE)
devotion_words %>%
  group_by(word) %>%
  summarise(sum = n())%>%
  filter(sum>1)  %>%
  arrange(desc(sum))  %>% wordcloud2()

[wordcloud: iammatrix]

BlueBird5566, kivan00, iammatrix

tsai_3 = tsai_posts %>% filter(artPoster=="BlueBird5566"|artPoster=="kivan00"|artPoster=="iammatrix")
tsai_devotion_gram <- tsai_3 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer)
tsai_devotion_gram
## # A tibble: 2,110 x 10
##    artTitle  artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##    <chr>     <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
##  2 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
##  3 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
##  4 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
##  5 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
##  6 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
##  7 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
##  8 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
##  9 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
## 10 Re:[新聞~ 2021-05-05 05:03:20 https:~ iammatrix Gossi~         43    24     2
## # ... with 2,100 more rows, and 1 more variable: word <chr>
sentiment_count = tsai_devotion_gram %>%
  select(artDate,word,artPoster) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment,artPoster) %>%
  summarise(count=n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate', 'sentiment'. You can override using the `.groups` argument.
sentiment_count %>%
  mutate(sentiment_count=ifelse(sentiment == "positive",1*count,-1*count)) %>% 
  ggplot(aes(artDate, sentiment_count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~artPoster, ncol = 1, scales = "free_y")

  # # Alternative plotting code
  # ggplot(aes(x= artDate,y=count,fill=sentiment)) +
  # scale_color_manual() +
  # geom_col(position="dodge") + 
  # scale_x_date(labels = date_format("%m/%d")) +
  # labs(title = "sentiment of ptt",color = "sentiment type") +
  # facet_wrap(~artPoster, ncol = 1, scales="free_y")  # "scales" adjusts each facet's axis range

BlueBird5566's nickname is 「唯一支持蔡英文」 ("the only one I support is Tsai Ing-wen"), so the sentiment analysis skews positive; he averages 45 pushes and 14 boos. iammatrix averages 144 pushes and 13 boos; kivan00 averages 112 pushes and 481 boos. Both mainly share news or post quoted replies, yet, surprisingly, their positive sentiment words outnumber the negative ones.
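As with the Ko leaders, these averages can be checked straight from `tsai_3` (defined above):

# Average pushes and boos per Tsai opinion leader
tsai_3 %>%
  group_by(artPoster) %>%
  summarise(posts = n(), mean_push = mean(push), mean_boo = mean(boo))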

Part 3: Conclusions

1. Partisan confrontation in Taiwan is severe: all four key figures publicly called one another out.
2. Cross-strait antagonism is severe: whether Taiwanese should accept Chinese vaccines carries a high psychological cost.
3. The discussion around Tsai Ing-wen centred mostly on topics other than the epidemic itself.