避免中文亂碼
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(data.table)
library(reshape2)
library(scales)
這次用的資料集是冰與火之歌在PTT上的討論資料,爬標題的關鍵字有:
GoT、Game of Thrones、冰與火之歌、權力遊戲 來分析看看討論板上對這部影集的觀感以及社群網絡關係。
posts: 用文本分析平台抓下來的PTT 文章資料reviews: 用文本分析平台抓下來的PTT 推噓文資料posts <- read_csv("data/got_posts.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
reviews <- read_csv("data/got_reviews.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentPoster = col_character(),
## commentStatus = col_character(),
## commentDate = col_datetime(format = ""),
## commentContent = col_character()
## )
reviews只挑選:
reviews <- select(reviews, artUrl, commentPoster, commentStatus, commentContent)
length(unique(posts$artPoster))
## [1] 122
資料中有185篇貼文,其中有122個發文者
length(unique(reviews$commentPoster))
## [1] 2224
資料中有9998則回覆,其中有2224位使用者
allUsers <- c(posts$artPoster, reviews$commentPoster)
allUsers %>%
unique() %>%
length()
## [1] 2266
總共參與者為2266位
userList <- data.frame(user = unique(allUsers)) %>%
mutate(type = ifelse(user %in% posts$artPoster, "poster", "replyer"))
head(userList)
## user type
## 1 drgraffiti poster
## 2 H23324216 poster
## 3 solomn poster
## 4 tontontonni poster
## 5 look1225 poster
## 6 ducklingwu poster
先簡單將每個user分成發文者與回覆者,有發文者歸類為poster,沒有發過文則是replyer。
got_meta_post <- posts %>%
mutate(sentence = gsub("[\n]{2, }", "。", sentence))
head(got_meta_post)
## # A tibble: 6 x 10
## artTitle artDate artTime artUrl artPoster artCat commentNum push
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl>
## 1 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 2 Re:[心得]… 2019-04-15 09:50 https… H23324216 EAser… 68 32
## 3 [請益]GOT… 2019-04-17 03:12 https… solomn EAser… 23 14
## 4 [討論]為何G… 2019-04-18 03:54 https… tontonto… EAser… 107 43
## 5 Re:[討論]… 2019-04-18 07:24 https… look1225 EAser… 52 15
## 6 [閒聊]Got… 2019-04-19 04:59 https… duckling… EAser… 28 16
## # … with 2 more variables: boo <dbl>, sentence <chr>
將讀取的GoT文章資料,依\n切成句子。
got_sentences <- strsplit(got_meta_post$sentence, "[。!;?!?;]")
got_sentences <- data.frame(
artUrl = rep(got_meta_post$artUrl, sapply(got_sentences, length)),
sentence = unlist(got_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
got_sentences$sentence <- as.character(got_sentences$sentence)
got_lexicon <- scan(file = "dict/got_lexicon.txt", what = character(), sep='\n',
encoding = 'utf-8', fileEncoding = 'utf-8')
字典自行新增的冰與火之歌一些專有名詞,例如常看到的人名、暱稱等:
囧、阿雅、無垢者、龍母等等的約24個
# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()
# 使用冰與火之歌字典重新斷詞
new_user_word(jieba_tokenizer, c(got_lexicon))
## [1] TRUE
# tokenize function
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x) > 1){
tokens <- segment(x, jieba_tokenizer)
# 去掉字串長度爲1的詞彙
tokens <- tokens[nchar(tokens) > 1 | tokens == "囧" | tokens == "冏"]
return(tokens)
}
})
}
tokens <- got_sentences %>%
unnest_tokens(word, sentence, token = chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word) %>%
rename(count=n)
head(tokens)
## # A tibble: 6 x 3
## artUrl word count
## <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 幫忙 2
## 2 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 北境 2
## 3 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 被叫 1
## 4 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 並且 1
## 5 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 不給 1
## 6 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 不過 1
reserved_word <- tokens %>%
group_by(word) %>%
count() %>%
filter(n > 3) %>%
unlist()
got_removed <- tokens %>%
filter( word %in% reserved_word)
got_dtm <- got_removed %>% cast_dtm(artUrl, word, count)
got_dtm
## <<DocumentTermMatrix (documents: 185, terms: 1192)>>
## Non-/sparse entries: 12256/208264
## Sparsity : 94%
## Maximal term length: 5
## Weighting : term frequency (tf)
整理出中文的stop words
stop_word <- c("可以","他們","沒有","真的","就是","覺得","我們","知道","因為","非常","還是","然後","所以","一個","非常","這樣","應該","如果","只是","自己","這個","可能","認為","不過","什麼","不是","完全","結果","怎麼","不會","已經","還有","現在","大家","其實","但是","想要","雖然")
got_lda <- LDA(got_dtm, k = 2, control = list(seed = 1234))
看各群的常用詞彙
tidy(got_lda, matrix = "beta") %>%
filter(!term %in% c("冰與火之歌", "權力", "遊戲"), !term %in% stop_word) %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta) %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(text = element_text(family = "黑體-繁 中黑"))
透過分成兩類主題,我們可以發現一類的討論內容比較偏向討論劇情本身,另一類則是關於影集拍攝、觀眾等戲外的討論。
got_topics <- tidy(got_lda, matrix = "gamma") %>%
# 在tidy function中使用參數"gamma"來取得 theta矩陣。
group_by(document) %>%
top_n(1, wt = gamma)
head(got_topics)
## # A tibble: 6 x 3
## # Groups: document [6]
## document topic gamma
## <chr> <int> <dbl>
## 1 https://www.ptt.cc/bbs/EAseries/M.1555340543.A.4B9.html 1 0.799
## 2 https://www.ptt.cc/bbs/EAseries/M.1555350999.A.61F.html 1 0.950
## 3 https://www.ptt.cc/bbs/EAseries/M.1555588854.A.755.html 1 0.545
## 4 https://www.ptt.cc/bbs/EAseries/M.1555601456.A.E16.html 1 0.515
## 5 https://www.ptt.cc/bbs/EAseries/M.1555679137.A.1AB.html 1 0.730
## 6 https://www.ptt.cc/bbs/EAseries/M.1555814295.A.4CB.html 1 0.597
可以看出我們用LDA幫文章分出了兩類,以及各文章的gamma,接下來我們可以依照分出的主題來畫出社群網絡圖。
posts_reviews <- merge(posts, reviews, by = "artUrl")
link <- posts_reviews %>%
select(commentPoster, artPoster, artUrl)
reviewNetwork <- graph_from_data_frame(d = link, directed = T)
plot(reviewNetwork)
先簡單將回文者與發文者的關係直接畫出圖,看不出什麼東西。
plot(reviewNetwork, vertex.size = 2, edge.arrow.size = .2, vertex.label = NA)
因為資料量太多,我們篩選其中一天的資料。選擇冰與火之歌播出之日 5/13。
link <- posts_reviews %>%
filter(artDate == "2019-05-13") %>%
select(commentPoster, artPoster, artUrl)
篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$commentPoster | user%in%link$artPoster) %>%
arrange(desc(type))
head(filtered_user)
## user type
## 1 sundazlas replyer
## 2 Ipadhotwater replyer
## 3 neverli replyer
## 4 yujimin replyer
## 5 AppleAlice replyer
## 6 real4 replyer
reviewNetwork <- graph_from_data_frame(d = link, v = filtered_user, directed=F)
plot(reviewNetwork, vertex.size = 2, edge.arrow.size = .2, vertex.label = NA)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type == "poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,
vertex.label = ifelse(degree(reviewNetwork) > 10, V(reviewNetwork)$label, NA),
vertex.label.ces = .5)
V(reviewNetwork)[degree(reviewNetwork) > 10]
## + 28/362 vertices, named, from 7a9c585:
## [1] orzisme pattda jcshie sasiru0959 MadAngel
## [6] celeris mainline tontontonni hahaha0204 ivorysoap
## [11] risingtide bloodrance attilalin smallsun10 HuangJ
## [16] s58565254 nomorefoggy dejamisvu steven655267 kwinner
## [21] mitdoh alepp123 minipig1127 sammon tinywill
## [26] a382773 XristianBale czchen
posts_reviews_topic <- merge(x = posts_reviews, y = got_topics,
by.x = "artUrl", by.y = "document")
link <- posts_reviews_topic %>%
filter(artDate=='2019-05-13') %>%
select(commentPoster, artPoster, artUrl, commentStatus, topic)
filtered_user <- userList %>%
filter(user%in%link$commentPoster | user%in%link$artPoster) %>%
arrange(desc(type))
head(filtered_user)
## user type
## 1 sundazlas replyer
## 2 Ipadhotwater replyer
## 3 neverli replyer
## 4 yujimin replyer
## 5 AppleAlice replyer
## 6 real4 replyer
# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred")
## 畫出社群網路圖
set.seed(5431)
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,
vertex.label=ifelse(degree(reviewNetwork) > 10, V(reviewNetwork)$label, NA), vertex.label.ces=.5)
## 加入標示
legend("bottomright", c("poster","replyer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=.8)
legend("topleft", c("topic 1","topic 2"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)
# ptt的回覆有三種,推文、噓文、箭頭
# 我們只要看推噓就好,因此把箭頭清掉
link <- posts_reviews_topic %>%
filter(artDate == '2019-05-13', commentStatus != "→") %>%
select(commentPoster, artPoster, artUrl, commentStatus, topic)
# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述
## 篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$commentPoster | user%in%link$artPoster) %>%
arrange(desc(type))
## 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
## 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
## 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$commentStatus == "推", "lightgreen", "palevioletred")
## 畫出社群網路圖
set.seed(488)
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,
vertex.label=ifelse(degree(reviewNetwork) > 10, V(reviewNetwork)$label, NA), vertex.label.ces=.5, family = "黑體-繁 中黑")
## 加入標示
legend("bottomright", c("poster","replyer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=.8)
legend("topleft", c("like","unlike"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)
#載入情緒辭典
positive <- read_file("dict/positive.txt") %>%
strsplit("[,]") %>%
unlist() %>%
data.frame(word = ., sentiment = "positive")
negative <- read_file("dict/negative.txt") %>%
strsplit("[,]") %>%
unlist() %>%
data.frame(word = ., sentiment = "negative")
LIWC_ch <- rbind(positive, negative)
先讀GoT留言資料,並依\n切成句子。
got_meta_review <- read_csv("data/got_reviews.csv") %>%
mutate(commentContent = gsub("[\n]{2, }", "。", commentContent))
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentPoster = col_character(),
## commentStatus = col_character(),
## commentDate = col_datetime(format = ""),
## commentContent = col_character()
## )
# 斷句處理
got_sentences2 <- strsplit(got_meta_review$commentContent, "[。!;?!?;]")
#got_sentences2
got_sentences2 <- data.frame(
artUrl = rep(got_meta_review$artUrl, sapply(got_sentences2, length)),
commentContent = unlist(got_sentences2),
commentPoster = rep(got_meta_review$commentPoster, sapply(got_sentences2, length))
) %>%
filter(!str_detect(commentContent, regex("^(\t|\n| )*$")))
got_sentences2$commentContent <- as.character(got_sentences2$commentContent)
#got_sentences2
tokens2 <- got_sentences2 %>%
unnest_tokens(word, commentContent, token = chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl,commentPoster, word) %>%
rename(count=n)
tokens2
## # A tibble: 47,454 x 4
## artUrl commentPoster word count
## <fct> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 變三殺… 1
## 2 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 波頓 1
## 3 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 成長 1
## 4 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 個人 1
## 5 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 後期 1
## 6 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 腳色 1
## 7 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 敬佩 1
## 8 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 可以 1
## 9 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 裡要 1
## 10 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 龍母 1
## # … with 47,444 more rows
## 清理斷詞結果
reserved_word2 <- tokens2 %>%
group_by(word) %>%
count() %>%
filter(n > 3) %>%
unlist()
got_removed2 <- tokens2 %>%
filter( word %in% reserved_word)
got_removed2
## # A tibble: 26,110 x 4
## artUrl commentPoster word count
## <fct> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 成長 1
## 2 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 個人 1
## 3 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 後期 1
## 4 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 腳色 1
## 5 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 可以 1
## 6 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 龍母 1
## 7 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 認為 1
## 8 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 瑟曦 1
## 9 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 珊莎 1
## 10 https://www.ptt.cc/bbs/EAseries/M.1555340543.… adie 什麼 1
## # … with 26,100 more rows
#將情緒與詞合併
got_word_sentiment <- got_removed2 %>%
inner_join(LIWC_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
head(got_word_sentiment, 10)
## # A tibble: 10 x 5
## artUrl commentPoster word count sentiment
## <fct> <fct> <chr> <int> <fct>
## 1 https://www.ptt.cc/bbs/EAseries/M.1… adie 可以 1 positive
## 2 https://www.ptt.cc/bbs/EAseries/M.1… adie 私生子… 1 negative
## 3 https://www.ptt.cc/bbs/EAseries/M.1… adm123 抱怨 1 negative
## 4 https://www.ptt.cc/bbs/EAseries/M.1… adm123 敵人 1 negative
## 5 https://www.ptt.cc/bbs/EAseries/M.1… adm123 奇怪 1 negative
## 6 https://www.ptt.cc/bbs/EAseries/M.1… adm123 願意 1 positive
## 7 https://www.ptt.cc/bbs/EAseries/M.1… adm123 重要 1 positive
## 8 https://www.ptt.cc/bbs/EAseries/M.1… aquacomfort 不好 1 negative
## 9 https://www.ptt.cc/bbs/EAseries/M.1… aquacomfort 不爽 1 negative
## 10 https://www.ptt.cc/bbs/EAseries/M.1… aquacomfort 私生子… 1 negative
got_sentiment <- got_word_sentiment %>%
left_join(posts, by="artUrl")
## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector
head(got_sentiment, 10)
## # A tibble: 10 x 14
## artUrl commentPoster word count sentiment artTitle artDate artTime
## <chr> <fct> <chr> <int> <fct> <chr> <date> <time>
## 1 https… adie 可以 1 positive [心得]GoT… 2019-04-15 06:56
## 2 https… adie 私生子… 1 negative [心得]GoT… 2019-04-15 06:56
## 3 https… adm123 抱怨 1 negative [心得]GoT… 2019-04-15 06:56
## 4 https… adm123 敵人 1 negative [心得]GoT… 2019-04-15 06:56
## 5 https… adm123 奇怪 1 negative [心得]GoT… 2019-04-15 06:56
## 6 https… adm123 願意 1 positive [心得]GoT… 2019-04-15 06:56
## 7 https… adm123 重要 1 positive [心得]GoT… 2019-04-15 06:56
## 8 https… aquacomfort 不好 1 negative [心得]GoT… 2019-04-15 06:56
## 9 https… aquacomfort 不爽 1 negative [心得]GoT… 2019-04-15 06:56
## 10 https… aquacomfort 私生子… 1 negative [心得]GoT… 2019-04-15 06:56
## # … with 6 more variables: artPoster <chr>, artCat <chr>,
## # commentNum <dbl>, push <dbl>, boo <dbl>, sentence <chr>
# ptt的回覆者情緒是正面或負面
link2 <- got_sentiment %>%
filter(artDate == '2019-05-13') %>%
select(commentPoster, artPoster, artUrl, sentiment)
# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述
## 篩選link中有出現的使用者
filtered_user2 <- userList %>%
filter(user%in%link2$commentPoster | user%in%link2$artPoster) %>%
arrange(desc(type))
## 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link2, v=filtered_user2, directed=F)
## 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type == "poster", "gold", "lightblue")
## 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$sentiment == "positive", "lightgreen", "palevioletred")
set.seed(788)
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,
vertex.label=ifelse(degree(reviewNetwork) > 10, V(reviewNetwork)$label, NA), vertex.label.ces=.5, family = "黑體-繁 中黑")
## 加入標示
legend("bottomright", c("poster","replyer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=.8)
legend("topleft", c("positive","negative"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)
all_dates <-
expand.grid(seq(as.Date(min(posts$artDate)), as.Date(max(posts$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")
got_data <- posts %>%
left_join(got_word_sentiment)
## Joining, by = "artUrl"
## Warning: Column `artUrl` joining character vector and factor, coercing into
## character vector
got_data$artDate= got_data$artDate %>% as.Date("%Y/%m/%d")
got_data
## # A tibble: 2,081 x 14
## artTitle artDate artTime artUrl artPoster artCat commentNum push
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl>
## 1 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 2 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 3 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 4 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 5 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 6 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 7 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 8 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 9 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## 10 [心得]GoT… 2019-04-15 06:56 https… drgraffi… EAser… 279 100
## # … with 2,071 more rows, and 6 more variables: boo <dbl>,
## # sentence <chr>, commentPoster <fct>, word <chr>, count <int>,
## # sentiment <fct>
plot_table <- got_data %>%
select(artDate, word, count) %>%
inner_join(LIWC_ch) %>%
group_by(artDate, sentiment) %>%
summarise(count = sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
#沒有資料的日期將count設為0
plot_table <- all_dates %>%
merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
plot_table
## artDate sentiment count
## 1 2019-04-15 positive 56
## 2 2019-04-15 negative 41
## 3 2019-04-16 positive 0
## 4 2019-04-16 negative 0
## 5 2019-04-17 positive 0
## 6 2019-04-17 negative 0
## 7 2019-04-18 positive 20
## 8 2019-04-18 negative 9
## 9 2019-04-19 positive 12
## 10 2019-04-19 negative 14
## 11 2019-04-20 positive 9
## 12 2019-04-20 negative 4
## 13 2019-04-21 positive 23
## 14 2019-04-21 negative 26
## 15 2019-04-22 positive 43
## 16 2019-04-22 negative 18
## 17 2019-04-23 positive 56
## 18 2019-04-23 negative 46
## 19 2019-04-24 positive 20
## 20 2019-04-24 negative 3
## 21 2019-04-25 positive 18
## 22 2019-04-25 negative 5
## 23 2019-04-26 positive 3
## 24 2019-04-26 negative 4
## 25 2019-04-27 positive 2
## 26 2019-04-27 negative 2
## 27 2019-04-28 positive 43
## 28 2019-04-28 negative 23
## 29 2019-04-29 positive 151
## 30 2019-04-29 negative 135
## 31 2019-04-30 positive 90
## 32 2019-04-30 negative 43
## 33 2019-05-01 positive 1
## 34 2019-05-01 negative 0
## 35 2019-05-02 positive 34
## 36 2019-05-02 negative 27
## 37 2019-05-03 positive 20
## 38 2019-05-03 negative 1
## 39 2019-05-04 positive 3
## 40 2019-05-04 negative 2
## 41 2019-05-05 positive 10
## 42 2019-05-05 negative 5
## 43 2019-05-06 positive 58
## 44 2019-05-06 negative 47
## 45 2019-05-07 positive 39
## 46 2019-05-07 negative 27
## 47 2019-05-08 positive 80
## 48 2019-05-08 negative 58
## 49 2019-05-09 positive 72
## 50 2019-05-09 negative 21
## 51 2019-05-10 positive 9
## 52 2019-05-10 negative 3
## 53 2019-05-11 positive 19
## 54 2019-05-11 negative 2
## 55 2019-05-12 positive 40
## 56 2019-05-12 negative 30
## 57 2019-05-13 positive 99
## 58 2019-05-13 negative 66
## 59 2019-05-14 positive 125
## 60 2019-05-14 negative 94
## 61 2019-05-15 positive 88
## 62 2019-05-15 negative 43
## 63 2019-05-16 positive 78
## 64 2019-05-16 negative 44
## 65 2019-05-17 positive 61
## 66 2019-05-17 negative 26
## 67 2019-05-18 positive 14
## 68 2019-05-18 negative 6
ggplot(plot_table, aes(x = artDate, y = count, colour = sentiment)) +
geom_line() +
facet_wrap(~ sentiment)
由上圖可知,因為冰與火之歌在每週一推出一集,4/22、4/29、5/6、5/13都有較高的情緒表現,正負面的情緒都很高。
got_data %>%
select(artDate, word, count) %>%
inner_join(LIWC_ch) %>%
# Count by word and sentiment
count(word, sentiment) %>%
# Group by sentiment
group_by(sentiment) %>%
# Take the top 10 words for each sentiment
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
# Set up the plot with aes()
ggplot(aes(x = word, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
theme(text = element_text(family = "黑體-繁 中黑"))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Selecting by n