## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
packages = c("readr", "tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "igraph","knitr", "webshot", "purrr", "ramify", "RColorBrewer", "htmlwidgets", "servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Load packages
library(readr)
library(tm)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(data.table)
library(wordcloud2)
library(scales)
library(reshape2)
library(widyr)
library(ggraph)
library(wordcloud)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
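# Optional preview of the 20 interpolated palette colours (scales is attached above);
# purely a visual sanity check and safe to skip in a batch run.
scales::show_col(mycolors)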
# Article data
HongKong <- fread("articleMetaData.csv", encoding = "UTF-8")
HongKong$artDate = HongKong$artDate %>% as.Date("%Y/%m/%d") # convert artDate from chr to Date
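# Quick sanity check that the format string matches the raw "YYYY/MM/DD" values:
as.Date("2019/07/11", format = "%Y/%m/%d")
## [1] "2019-07-11"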
# Comment (review) data
HongKong_review <- fread("articleReviews.csv", encoding = "UTF-8")
# Keep only the columns we need
HongKong_review <- HongKong_review %>%
select(artUrl, cmtPoster, cmtStatus, cmtContent)
# Reload the full comment table (artPoster is needed for the per-poster filters below)
HongKong_review2 <- fread("articleReviews.csv", encoding = "UTF-8")
order <- fread("hongkong3.csv", encoding = "UTF-8")
order$artDate = order$artDate %>% as.Date("%Y/%m/%d")
order
## artTitle artDate artTime
## 1: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## 2: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## 3: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## 4: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## 5: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## ---
## 763867: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## 763868: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## 763869: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## 763870: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## 763871: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## artUrl word count
## 1: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html 寶礦力 8
## 2: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html 廣告 5
## 3: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html 水特 4
## 4: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html 完整 3
## 5: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html GNZ48 3
## ---
## 763867: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 友善 1
## 763868: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 港人 1
## 763869: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 移民 1
## 763870: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 慢走 1
## 763871: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 不送 1
# Sentiment analysis

### Use the LIWC dictionary to classify each word in the corpus as positive or negative

# Positive-word dictionary (txt file, comma-separated)
P <- read_file("positive.txt")
# Negative-word dictionary (txt file, comma-separated)
N <- read_file("negative.txt")
# Split each string on commas;
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns (word, sentiment); word holds the dictionary vector.
# stringsAsFactors = FALSE keeps word as character and avoids factor-coercion warnings in the joins below.
P = data.frame(word = P, sentiment = "positive", stringsAsFactors = FALSE)
N = data.frame(word = N, sentiment = "negative", stringsAsFactors = FALSE)
LIWC = rbind(P, N)
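# A minimal sketch of the dictionary join used below, on toy data (made-up
# entries, not the real LIWC contents): an inner join keeps only words that
# appear in the dictionary and attaches their sentiment label.
toy_counts <- data.frame(word = c("自由", "暴力", "香港"), n = c(3, 1, 5),
stringsAsFactors = FALSE)
toy_dict <- data.frame(word = c("自由", "暴力"), sentiment = c("positive", "negative"),
stringsAsFactors = FALSE)
inner_join(toy_counts, toy_dict, by = "word") # "香港" drops out: no dictionary match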
chenglap_data <- HongKong %>%
filter(artPoster == "chenglap")
chenglap_sentence <- chenglap_data %>%
select(artUrl,sentence)
chenglap_sentence <-strsplit(chenglap_sentence$sentence,"[。!;?!?;]")
# Pair each sentence with its source article URL and assemble a data frame
chenglap_sentence <- data.frame(
artUrl = rep(chenglap_data$artUrl, sapply(chenglap_sentence, length)),
sentence = unlist(chenglap_sentence)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
chenglap_sentence$sentence <- as.character(chenglap_sentence$sentence)
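# What the sentence split above does, on a toy string (both fullwidth and
# ASCII sentence-ending punctuation act as delimiters):
strsplit("第一句。第二句!第三句?剩下的", "[。!;?!?;]")
## [[1]]
## [1] "第一句" "第二句" "第三句" "剩下的"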
# Set up the jieba tokenizer with the custom lexicon and stop-word list
jieba_tokenizer = worker(user="HongKong_lexicon.txt", stop_word = "stop_words.txt", write = "NOFILE")
HongKong_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
# segment the text, then keep only tokens longer than one character
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
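# Quick sanity check of the tokenizer on a sample sentence (assumes the lexicon
# and stop-word files loaded above; single-character tokens are dropped by the
# nchar filter, and the exact segmentation depends on the custom lexicon):
HongKong_tokenizer("香港人上街遊行")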
# Tokenize and count how often each word appears in each article
chenglap_word <- chenglap_sentence %>%
unnest_tokens(word, sentence, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
chenglap_article_sent <- chenglap_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
chenglap_article_sent <-chenglap_article_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(artsentiment = positive - negative)
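# Note: spread() is superseded in current tidyr; the reshape above could
# equivalently be written with pivot_wider() (shown commented out because
# chenglap_article_sent has already been reshaped at this point):
# chenglap_article_sent %>%
#   pivot_wider(names_from = sentiment, values_from = count, values_fill = 0) %>%
#   mutate(artsentiment = positive - negative)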
chenglap_review <- HongKong_review2 %>%
filter(artPoster == "chenglap")
chenglap_review_content <- chenglap_review %>%
select(artUrl,cmtContent)
chenglap_review_content <-strsplit(chenglap_review_content$cmtContent,"[。!;?!?;]")
# Pair each comment snippet with its source article URL and assemble a data frame
chenglap_review_content <- data.frame(
artUrl = rep(chenglap_review$artUrl, sapply(chenglap_review_content, length)),
cmtContent = unlist(chenglap_review_content)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
chenglap_review_content$cmtContent <- as.character(chenglap_review_content$cmtContent)
# Tokenize and count how often each word appears in each article's comments
chenglap_review_word <- chenglap_review_content %>%
unnest_tokens(word, cmtContent, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
chenglap_review_sent <- chenglap_review_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
chenglap_review_sent <-chenglap_review_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(cmtsentiment = positive - negative)
chenglap_atr_cmt_sen <-
merge(x = chenglap_article_sent, y = chenglap_review_sent, by = "artUrl") %>%
select(artUrl,artsentiment,cmtsentiment)
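# The base-R merge() above performs an inner join; a dplyr equivalent would be:
# inner_join(chenglap_article_sent, chenglap_review_sent, by = "artUrl")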
chenglap_atr_cmt_sen <- chenglap_atr_cmt_sen %>%
gather(sentiment,n,artsentiment:cmtsentiment) %>%
mutate(sentiment = gsub("sentiment","",sentiment)) %>%
arrange(artUrl,sentiment)
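# Likewise, gather() is superseded by pivot_longer(); the reshape above could
# be written as (commented out, since the data is already long here):
# chenglap_atr_cmt_sen %>%
#   pivot_longer(artsentiment:cmtsentiment, names_to = "sentiment", values_to = "n")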
chenglap_sen_plot <- chenglap_atr_cmt_sen %>%
ggplot(aes(artUrl,n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 1, scales = "free_y") +
ggtitle("chenglap發文情緒與回覆情緒比較")
gaucher_data <- HongKong %>%
filter(artPoster == "gaucher")
gaucher_sentence <- gaucher_data %>%
select(artUrl,sentence)
gaucher_sentence <-strsplit(gaucher_sentence$sentence,"[。!;?!?;]")
# Pair each sentence with its source article URL and assemble a data frame
gaucher_sentence <- data.frame(
artUrl = rep(gaucher_data$artUrl, sapply(gaucher_sentence, length)),
sentence = unlist(gaucher_sentence)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
gaucher_sentence$sentence <- as.character(gaucher_sentence$sentence)
# Reuse the jieba tokenizer and HongKong_tokenizer() defined above
# Tokenize and count how often each word appears in each article
gaucher_word <- gaucher_sentence %>%
unnest_tokens(word, sentence, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
gaucher_article_sent <- gaucher_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
gaucher_article_sent <-gaucher_article_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(artsentiment = positive - negative)
gaucher_review <- HongKong_review2 %>%
filter(artPoster == "gaucher")
gaucher_review_content <- gaucher_review %>%
select(artUrl,cmtContent)
gaucher_review_content <-strsplit(gaucher_review_content$cmtContent,"[。!;?!?;]")
# Pair each comment snippet with its source article URL and assemble a data frame
gaucher_review_content <- data.frame(
artUrl = rep(gaucher_review$artUrl, sapply(gaucher_review_content, length)),
cmtContent = unlist(gaucher_review_content)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
gaucher_review_content$cmtContent <- as.character(gaucher_review_content$cmtContent)
# Tokenize and count how often each word appears in each article's comments
gaucher_review_word <- gaucher_review_content %>%
unnest_tokens(word, cmtContent, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
gaucher_review_sent <- gaucher_review_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
gaucher_review_sent <-gaucher_review_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(cmtsentiment = positive - negative)
gaucher_atr_cmt_sen <-
merge(x = gaucher_article_sent, y = gaucher_review_sent, by = "artUrl") %>%
select(artUrl,artsentiment,cmtsentiment)
gaucher_atr_cmt_sen <- gaucher_atr_cmt_sen %>%
gather(sentiment,n,artsentiment:cmtsentiment) %>%
mutate(sentiment = gsub("sentiment","",sentiment)) %>%
arrange(artUrl,sentiment)
gaucher_sen_plot <- gaucher_atr_cmt_sen %>%
ggplot(aes(artUrl,n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 1, scales = "free_y") +
ggtitle("gaucher發文情緒與回覆情緒比較")
windsine_data <- HongKong %>%
filter(artPoster == "windsine")
windsine_sentence <- windsine_data %>%
select(artUrl,sentence)
windsine_sentence <-strsplit(windsine_sentence$sentence,"[。!;?!?;]")
# Pair each sentence with its source article URL and assemble a data frame
windsine_sentence <- data.frame(
artUrl = rep(windsine_data$artUrl, sapply(windsine_sentence, length)),
sentence = unlist(windsine_sentence)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
windsine_sentence$sentence <- as.character(windsine_sentence$sentence)
# Reuse the jieba tokenizer and HongKong_tokenizer() defined above
# Tokenize and count how often each word appears in each article
windsine_word <- windsine_sentence %>%
unnest_tokens(word, sentence, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
windsine_article_sent <- windsine_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
windsine_article_sent <-windsine_article_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(artsentiment = positive - negative)
windsine_review <- HongKong_review2 %>%
filter(artPoster == "windsine")
windsine_review_content <- windsine_review %>%
select(artUrl,cmtContent)
windsine_review_content <-strsplit(windsine_review_content$cmtContent,"[。!;?!?;]")
# Pair each comment snippet with its source article URL and assemble a data frame
windsine_review_content <- data.frame(
artUrl = rep(windsine_review$artUrl, sapply(windsine_review_content, length)),
cmtContent = unlist(windsine_review_content)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
windsine_review_content$cmtContent <- as.character(windsine_review_content$cmtContent)
# Tokenize and count how often each word appears in each article's comments
windsine_review_word <- windsine_review_content %>%
unnest_tokens(word, cmtContent, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
windsine_review_sent <- windsine_review_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
windsine_review_sent <-windsine_review_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(cmtsentiment = positive - negative)
windsine_atr_cmt_sen <-
merge(x = windsine_article_sent, y = windsine_review_sent, by = "artUrl") %>%
select(artUrl,artsentiment,cmtsentiment)
windsine_atr_cmt_sen <- windsine_atr_cmt_sen %>%
gather(sentiment,n,artsentiment:cmtsentiment) %>%
mutate(sentiment = gsub("sentiment","",sentiment)) %>%
arrange(artUrl,sentiment)
windsine_sen_plot <- windsine_atr_cmt_sen %>%
ggplot(aes(artUrl,n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 1, scales = "free_y") +
ggtitle("windsine發文情緒與回覆情緒比較")
KAKAii_data <- HongKong %>%
filter(artPoster == "KAKAii")
KAKAii_sentence <- KAKAii_data %>%
select(artUrl,sentence)
KAKAii_sentence <-strsplit(KAKAii_sentence$sentence,"[。!;?!?;]")
# Pair each sentence with its source article URL and assemble a data frame
KAKAii_sentence <- data.frame(
artUrl = rep(KAKAii_data$artUrl, sapply(KAKAii_sentence, length)),
sentence = unlist(KAKAii_sentence)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
KAKAii_sentence$sentence <- as.character(KAKAii_sentence$sentence)
# Reuse the jieba tokenizer and HongKong_tokenizer() defined above
# Tokenize and count how often each word appears in each article
KAKAii_word <- KAKAii_sentence %>%
unnest_tokens(word, sentence, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
KAKAii_article_sent <- KAKAii_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
KAKAii_article_sent <-KAKAii_article_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(artsentiment = positive - negative)
KAKAii_review <- HongKong_review2 %>%
filter(artPoster == "KAKAii")
KAKAii_review_content <- KAKAii_review %>%
select(artUrl,cmtContent)
KAKAii_review_content <-strsplit(KAKAii_review_content$cmtContent,"[。!;?!?;]")
# Pair each comment snippet with its source article URL and assemble a data frame
KAKAii_review_content <- data.frame(
artUrl = rep(KAKAii_review$artUrl, sapply(KAKAii_review_content, length)),
cmtContent = unlist(KAKAii_review_content)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
KAKAii_review_content$cmtContent <- as.character(KAKAii_review_content$cmtContent)
# Tokenize and count how often each word appears in each article's comments
KAKAii_review_word <- KAKAii_review_content %>%
unnest_tokens(word, cmtContent, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
KAKAii_review_sent <- KAKAii_review_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
KAKAii_review_sent <-KAKAii_review_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(cmtsentiment = positive - negative)
KAKAii_atr_cmt_sen <-
merge(x = KAKAii_article_sent, y = KAKAii_review_sent, by = "artUrl") %>%
select(artUrl,artsentiment,cmtsentiment)
KAKAii_atr_cmt_sen <- KAKAii_atr_cmt_sen %>%
gather(sentiment,n,artsentiment:cmtsentiment) %>%
mutate(sentiment = gsub("sentiment","",sentiment)) %>%
arrange(artUrl,sentiment)
KAKAii_sen_plot <- KAKAii_atr_cmt_sen %>%
ggplot(aes(artUrl,n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 1, scales = "free_y") +
ggtitle("KAKAii發文情緒與回覆情緒比較")
# par(mfrow = ...) has no effect on ggplot objects, so arrange the four
# sentiment plots on a 2 x 2 grid with gridExtra instead
gridExtra::grid.arrange(chenglap_sen_plot, gaucher_sen_plot,
windsine_sen_plot, KAKAii_sen_plot,
nrow = 2, ncol = 2)
# Draw a word cloud of KAKAii's article vocabulary
KAKAii_word %>%
group_by(word) %>%
summarise(sum = sum(n)) %>% # total frequency across articles (n() would count articles instead)
filter(sum > 2) %>%
arrange(desc(sum)) %>%
wordcloud2()
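# Optional: persist the interactive widget and capture a static PNG, a sketch
# using htmlwidgets and webshot (both installed above); the file names here are
# illustrative only, so the calls are left commented out.
# wc <- KAKAii_word %>% group_by(word) %>% summarise(sum = sum(n)) %>%
#   filter(sum > 2) %>% wordcloud2()
# htmlwidgets::saveWidget(wc, "KAKAii_wordcloud.html", selfcontained = FALSE)
# webshot::webshot("KAKAii_wordcloud.html", "KAKAii_wordcloud.png", delay = 5)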