## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
packages = c("readr", "tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "igraph","knitr", "webshot", "purrr", "ramify", "RColorBrewer", "htmlwidgets", "servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Load packages
library(readr)
library(tm)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(data.table)
library(wordcloud2)
library(scales)
library(reshape2)
library(widyr)
library(ggraph)
library(wordcloud)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
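# Optional preview of the 20 interpolated palette colours (scales is attached above);
# purely a visual sanity check and safe to skip in a batch run.
scales::show_col(mycolors)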
# Article data
HongKong <- fread("articleMetaData.csv", encoding = "UTF-8")
HongKong$artDate = HongKong$artDate %>% as.Date("%Y/%m/%d") # convert artDate from chr to Date
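# Quick sanity check that the format string matches the raw "YYYY/MM/DD" values:
as.Date("2019/07/11", format = "%Y/%m/%d")
## [1] "2019-07-11"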
# Comment (review) data
HongKong_review <- fread("articleReviews.csv", encoding = "UTF-8")
# Keep only the columns we need
HongKong_review <- HongKong_review %>%
select(artUrl, cmtPoster, cmtStatus, cmtContent)
# Reload the full comment table (artPoster is needed for the per-poster filters below)
HongKong_review2 <- fread("articleReviews.csv", encoding = "UTF-8")
order <- fread("hongkong3.csv", encoding = "UTF-8")
order$artDate = order$artDate %>% as.Date("%Y/%m/%d")
order
## artTitle artDate artTime
## 1: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## 2: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## 3: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## 4: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## 5: [新聞]寶礦力挺反送中?陸偶像女團GNZ48終止合 2019-07-11 14:14:16
## ---
## 763867: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## 763868: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## 763869: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## 763870: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## 763871: Re:[新聞]香港女吐心聲「不想移民台灣」!196字淚 2020-05-28 06:46:32
## artUrl word count
## 1: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html 寶礦力 8
## 2: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html 廣告 5
## 3: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html 水特 4
## 4: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html 完整 3
## 5: https://www.ptt.cc/bbs/Gossiping/M.1562883620.A.B88.html GNZ48 3
## ---
## 763867: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 友善 1
## 763868: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 港人 1
## 763869: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 移民 1
## 763870: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 慢走 1
## 763871: https://www.ptt.cc/bbs/Gossiping/M.1590648394.A.2B5.html 不送 1
# Sentiment analysis

### Use the LIWC dictionary to classify each word in the corpus as positive or negative

# Positive-word dictionary (txt file, comma-separated)
P <- read_file("positive.txt")
# Negative-word dictionary (txt file, comma-separated)
N <- read_file("negative.txt")
# Split each string on commas;
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns (word, sentiment); word holds the dictionary vector.
# stringsAsFactors = FALSE keeps word as character and avoids factor-coercion warnings in the joins below.
P = data.frame(word = P, sentiment = "positive", stringsAsFactors = FALSE)
N = data.frame(word = N, sentiment = "negative", stringsAsFactors = FALSE)
LIWC = rbind(P, N)
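# A minimal sketch of the dictionary join used below, on toy data (made-up
# entries, not the real LIWC contents): an inner join keeps only words that
# appear in the dictionary and attaches their sentiment label.
toy_counts <- data.frame(word = c("自由", "暴力", "香港"), n = c(3, 1, 5),
stringsAsFactors = FALSE)
toy_dict <- data.frame(word = c("自由", "暴力"), sentiment = c("positive", "negative"),
stringsAsFactors = FALSE)
inner_join(toy_counts, toy_dict, by = "word") # "香港" drops out: no dictionary match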
chenglap_data <- HongKong %>%
filter(artPoster == "chenglap")
chenglap_sentence <- chenglap_data %>%
select(artUrl,sentence)
chenglap_sentence <-strsplit(chenglap_sentence$sentence,"[。!;?!?;]")
# Pair each sentence with its source article URL and assemble a data frame
chenglap_sentence <- data.frame(
artUrl = rep(chenglap_data$artUrl, sapply(chenglap_sentence, length)),
sentence = unlist(chenglap_sentence)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
chenglap_sentence$sentence <- as.character(chenglap_sentence$sentence)
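# What the sentence split above does, on a toy string (both fullwidth and
# ASCII sentence-ending punctuation act as delimiters):
strsplit("第一句。第二句!第三句?剩下的", "[。!;?!?;]")
## [[1]]
## [1] "第一句" "第二句" "第三句" "剩下的"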
# Set up the jieba tokenizer with the custom lexicon and stop-word list
jieba_tokenizer = worker(user="HongKong_lexicon.txt", stop_word = "stop_words.txt", write = "NOFILE")
HongKong_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
# segment the text, then keep only tokens longer than one character
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
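# Quick sanity check of the tokenizer on a sample sentence (assumes the lexicon
# and stop-word files loaded above; single-character tokens are dropped by the
# nchar filter, and the exact segmentation depends on the custom lexicon):
HongKong_tokenizer("香港人上街遊行")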
# Tokenize and count how often each word appears in each article
chenglap_word <- chenglap_sentence %>%
unnest_tokens(word, sentence, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
chenglap_article_sent <- chenglap_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
chenglap_article_sent <-chenglap_article_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(artsentiment = positive - negative)
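# Note: spread() is superseded in current tidyr; the reshape above could
# equivalently be written with pivot_wider() (shown commented out because
# chenglap_article_sent has already been reshaped at this point):
# chenglap_article_sent %>%
#   pivot_wider(names_from = sentiment, values_from = count, values_fill = 0) %>%
#   mutate(artsentiment = positive - negative)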
chenglap_review <- HongKong_review2 %>%
filter(artPoster == "chenglap")
chenglap_review_content <- chenglap_review %>%
select(artUrl,cmtContent)
chenglap_review_content <-strsplit(chenglap_review_content$cmtContent,"[。!;?!?;]")
# Pair each comment snippet with its source article URL and assemble a data frame
chenglap_review_content <- data.frame(
artUrl = rep(chenglap_review$artUrl, sapply(chenglap_review_content, length)),
cmtContent = unlist(chenglap_review_content)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
chenglap_review_content$cmtContent <- as.character(chenglap_review_content$cmtContent)
# Tokenize and count how often each word appears in each article's comments
chenglap_review_word <- chenglap_review_content %>%
unnest_tokens(word, cmtContent, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
chenglap_review_sent <- chenglap_review_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
chenglap_review_sent <-chenglap_review_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(cmtsentiment = positive - negative)
chenglap_atr_cmt_sen <-
merge(x = chenglap_article_sent, y = chenglap_review_sent, by = "artUrl") %>%
select(artUrl,artsentiment,cmtsentiment)
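# The base-R merge() above performs an inner join; a dplyr equivalent would be:
# inner_join(chenglap_article_sent, chenglap_review_sent, by = "artUrl")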
chenglap_atr_cmt_sen <- chenglap_atr_cmt_sen %>%
gather(sentiment,n,artsentiment:cmtsentiment) %>%
mutate(sentiment = gsub("sentiment","",sentiment)) %>%
arrange(artUrl,sentiment)
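# Likewise, gather() is superseded by pivot_longer(); the reshape above could
# be written as (commented out, since the data is already long here):
# chenglap_atr_cmt_sen %>%
#   pivot_longer(artsentiment:cmtsentiment, names_to = "sentiment", values_to = "n")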
chenglap_sen_plot <- chenglap_atr_cmt_sen %>%
ggplot(aes(artUrl,n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 1, scales = "free_y") +
ggtitle("chenglap發文情緒與回覆情緒比較")
gaucher_data <- HongKong %>%
filter(artPoster == "gaucher")
gaucher_sentence <- gaucher_data %>%
select(artUrl,sentence)
gaucher_sentence <-strsplit(gaucher_sentence$sentence,"[。!;?!?;]")
# Pair each sentence with its source article URL and assemble a data frame
gaucher_sentence <- data.frame(
artUrl = rep(gaucher_data$artUrl, sapply(gaucher_sentence, length)),
sentence = unlist(gaucher_sentence)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
gaucher_sentence$sentence <- as.character(gaucher_sentence$sentence)
# Reuse the jieba tokenizer and HongKong_tokenizer() defined above
# Tokenize and count how often each word appears in each article
gaucher_word <- gaucher_sentence %>%
unnest_tokens(word, sentence, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
gaucher_article_sent <- gaucher_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
gaucher_article_sent <-gaucher_article_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(artsentiment = positive - negative)
gaucher_review <- HongKong_review2 %>%
filter(artPoster == "gaucher")
gaucher_review_content <- gaucher_review %>%
select(artUrl,cmtContent)
gaucher_review_content <-strsplit(gaucher_review_content$cmtContent,"[。!;?!?;]")
# Pair each comment snippet with its source article URL and assemble a data frame
gaucher_review_content <- data.frame(
artUrl = rep(gaucher_review$artUrl, sapply(gaucher_review_content, length)),
cmtContent = unlist(gaucher_review_content)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
gaucher_review_content$cmtContent <- as.character(gaucher_review_content$cmtContent)
# Tokenize and count how often each word appears in each article's comments
gaucher_review_word <- gaucher_review_content %>%
unnest_tokens(word, cmtContent, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
gaucher_review_sent <- gaucher_review_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
gaucher_review_sent <-gaucher_review_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(cmtsentiment = positive - negative)
gaucher_atr_cmt_sen <-
merge(x = gaucher_article_sent, y = gaucher_review_sent, by = "artUrl") %>%
select(artUrl,artsentiment,cmtsentiment)
gaucher_atr_cmt_sen <- gaucher_atr_cmt_sen %>%
gather(sentiment,n,artsentiment:cmtsentiment) %>%
mutate(sentiment = gsub("sentiment","",sentiment)) %>%
arrange(artUrl,sentiment)
gaucher_sen_plot <- gaucher_atr_cmt_sen %>%
ggplot(aes(artUrl,n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 1, scales = "free_y") +
ggtitle("gaucher發文情緒與回覆情緒比較")
windsine_data <- HongKong %>%
filter(artPoster == "windsine")
windsine_sentence <- windsine_data %>%
select(artUrl,sentence)
windsine_sentence <-strsplit(windsine_sentence$sentence,"[。!;?!?;]")
# Pair each sentence with its source article URL and assemble a data frame
windsine_sentence <- data.frame(
artUrl = rep(windsine_data$artUrl, sapply(windsine_sentence, length)),
sentence = unlist(windsine_sentence)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
windsine_sentence$sentence <- as.character(windsine_sentence$sentence)
# Reuse the jieba tokenizer and HongKong_tokenizer() defined above
# Tokenize and count how often each word appears in each article
windsine_word <- windsine_sentence %>%
unnest_tokens(word, sentence, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
windsine_article_sent <- windsine_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
windsine_article_sent <-windsine_article_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(artsentiment = positive - negative)
windsine_review <- HongKong_review2 %>%
filter(artPoster == "windsine")
windsine_review_content <- windsine_review %>%
select(artUrl,cmtContent)
windsine_review_content <-strsplit(windsine_review_content$cmtContent,"[。!;?!?;]")
# Pair each comment snippet with its source article URL and assemble a data frame
windsine_review_content <- data.frame(
artUrl = rep(windsine_review$artUrl, sapply(windsine_review_content, length)),
cmtContent = unlist(windsine_review_content)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
windsine_review_content$cmtContent <- as.character(windsine_review_content$cmtContent)
# Tokenize and count how often each word appears in each article's comments
windsine_review_word <- windsine_review_content %>%
unnest_tokens(word, cmtContent, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
windsine_review_sent <- windsine_review_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
windsine_review_sent <-windsine_review_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(cmtsentiment = positive - negative)
windsine_atr_cmt_sen <-
merge(x = windsine_article_sent, y = windsine_review_sent, by = "artUrl") %>%
select(artUrl,artsentiment,cmtsentiment)
windsine_atr_cmt_sen <- windsine_atr_cmt_sen %>%
gather(sentiment,n,artsentiment:cmtsentiment) %>%
mutate(sentiment = gsub("sentiment","",sentiment)) %>%
arrange(artUrl,sentiment)
windsine_sen_plot <- windsine_atr_cmt_sen %>%
ggplot(aes(artUrl,n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 1, scales = "free_y") +
ggtitle("windsine發文情緒與回覆情緒比較")
KAKAii_data <- HongKong %>%
filter(artPoster == "KAKAii")
KAKAii_sentence <- KAKAii_data %>%
select(artUrl,sentence)
KAKAii_sentence <-strsplit(KAKAii_sentence$sentence,"[。!;?!?;]")
# Pair each sentence with its source article URL and assemble a data frame
KAKAii_sentence <- data.frame(
artUrl = rep(KAKAii_data$artUrl, sapply(KAKAii_sentence, length)),
sentence = unlist(KAKAii_sentence)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
KAKAii_sentence$sentence <- as.character(KAKAii_sentence$sentence)
# Reuse the jieba tokenizer and HongKong_tokenizer() defined above
# Tokenize and count how often each word appears in each article
KAKAii_word <- KAKAii_sentence %>%
unnest_tokens(word, sentence, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
KAKAii_article_sent <- KAKAii_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
KAKAii_article_sent <-KAKAii_article_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(artsentiment = positive - negative)
KAKAii_review <- HongKong_review2 %>%
filter(artPoster == "KAKAii")
KAKAii_review_content <- KAKAii_review %>%
select(artUrl,cmtContent)
KAKAii_review_content <-strsplit(KAKAii_review_content$cmtContent,"[。!;?!?;]")
# Pair each comment snippet with its source article URL and assemble a data frame
KAKAii_review_content <- data.frame(
artUrl = rep(KAKAii_review$artUrl, sapply(KAKAii_review_content, length)),
cmtContent = unlist(KAKAii_review_content)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
KAKAii_review_content$cmtContent <- as.character(KAKAii_review_content$cmtContent)
# Tokenize and count how often each word appears in each article's comments
KAKAii_review_word <- KAKAii_review_content %>%
unnest_tokens(word, cmtContent, token=HongKong_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
KAKAii_review_sent <- KAKAii_review_word %>%
inner_join(LIWC, by = "word") %>%
group_by(artUrl, sentiment) %>%
summarise(count = sum(n))
KAKAii_review_sent <-KAKAii_review_sent %>%
spread(sentiment, count, fill = 0) %>%
mutate(cmtsentiment = positive - negative)
KAKAii_atr_cmt_sen <-
merge(x = KAKAii_article_sent, y = KAKAii_review_sent, by = "artUrl") %>%
select(artUrl,artsentiment,cmtsentiment)
KAKAii_atr_cmt_sen <- KAKAii_atr_cmt_sen %>%
gather(sentiment,n,artsentiment:cmtsentiment) %>%
mutate(sentiment = gsub("sentiment","",sentiment)) %>%
arrange(artUrl,sentiment)
KAKAii_sen_plot <- KAKAii_atr_cmt_sen %>%
ggplot(aes(artUrl,n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 1, scales = "free_y") +
ggtitle("KAKAii發文情緒與回覆情緒比較")
# par(mfrow = ...) has no effect on ggplot objects, so arrange the four
# sentiment plots on a 2 x 2 grid with gridExtra instead
gridExtra::grid.arrange(chenglap_sen_plot, gaucher_sen_plot,
windsine_sen_plot, KAKAii_sen_plot,
nrow = 2, ncol = 2)
# Draw a word cloud of KAKAii's article vocabulary
KAKAii_word %>%
group_by(word) %>%
summarise(sum = sum(n)) %>% # total frequency across articles (n() would count articles instead)
filter(sum > 2) %>%
arrange(desc(sum)) %>%
wordcloud2()
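# Optional: persist the interactive widget and capture a static PNG, a sketch
# using htmlwidgets and webshot (both installed above); the file names here are
# illustrative only, so the calls are left commented out.
# wc <- KAKAii_word %>% group_by(word) %>% summarise(sum = sum(n)) %>%
#   filter(sum > 2) %>% wordcloud2()
# htmlwidgets::saveWidget(wc, "KAKAii_wordcloud.html", selfcontained = FALSE)
# webshot::webshot("KAKAii_wordcloud.html", "KAKAii_wordcloud.png", delay = 5)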