library(readxl)
library(dplyr)
library(stringr)
library(lubridate)
library(tidytext)
library(jiebaR)
library(ggplot2)
library(wordcloud2)
library(reshape2)    # dcast()
library(topicmodels) # LDA()
july <- read_excel("data/07.xlsx", sheet = "工作表1")
august <- read_excel("data/08.xlsx", sheet = "工作表1")
september <- read_excel("data/09.xlsx", sheet = "工作表1")
october <- read_excel("data/10.xlsx", sheet = "工作表1")
november <- read_excel("data/11.xlsx", sheet = "工作表1")
Reference: https://www.datacamp.com/community/tutorials/r-tutorial-read-excel-into-r#readxl
# Add a month column to each data set
july$month <- "201807"
august$month <- "201808"
september$month <- "201809"
october$month <- "201810"
november$month <- "201811"
# Combine into a single data frame
df_all <- rbind(july, august, september, october, november)
colnames(df_all)[1:5] <- c("note", "dpm", "day", "mid", "id")
df_all <- df_all %>%
  # keep the politicians' own posts (rows where note is NA)
  filter(is.na(note)) %>%
  # keep only the columns we need
  select(url, month, day, content, from)
# Clean up the date format
df_all$date <- ymd(paste0(df_all$month, df_all$day))
df_all$month <- NULL
df_all$day <- NULL
df_sentences <- df_all %>%
# remove emoji and punctuation symbols
mutate(sentence = gsub("(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", "", content)) %>%
# remove remaining non-UTF-8 / stray characters
mutate(sentence = gsub("[^[:alnum:][:blank:]?&/\\-]", "", sentence),
content = NULL) %>%
# drop sentences consisting only of tabs, newlines, or spaces
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Coerce to character
df_sentences$url <- as.character(df_sentences$url)
df_sentences$sentence <- as.character(df_sentences$sentence)
# Words candidates use to refer to themselves
their_names <- scan(file = "dict/their_names.txt", what = character(),
sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")
# All candidate names
candidate_name <- scan(file = "dict/candidate_name_new.txt", what = character(),
sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")
customized_dict <- scan(file = "dict/customized.txt", what = character(),
sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")
Convert the loaded dictionaries into data frames and merge them into a single dictionary.
their_names_df <- data.frame(word=their_names)
candidate_name_df <- data.frame(word=candidate_name)
all_keys_df <- rbind(their_names_df, candidate_name_df)
jieba_tokenizer <- worker()
# Register the custom dictionary with jieba
new_user_word(jieba_tokenizer, c(customized_dict))
## [1] TRUE
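Note that the merged `all_keys_df` above is never registered with jieba; if the intent is for candidate names to survive segmentation as whole tokens, they can be added the same way (a minimal sketch; `new_user_word()` also accepts a character vector):
# Register candidate names so jieba keeps them as whole tokens (sketch)
new_user_word(jieba_tokenizer, as.character(all_keys_df$word))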
The tokenization function:
chi_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
# drop single-character tokens, except the research keyword 「我」 ('I')
tokens <- tokens[nchar(tokens) > 1 | tokens %in% '我']
return(tokens)
})
}
tokens <- df_sentences %>%
  # tokenize
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  # drop tokens containing English letters or digits
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  # convert to character
  mutate(word = as.character(word)) %>%
  # word frequency within each post
  count(url, word) %>%
  rename(count = n)
# Map each token back to its candidate and date
tokens <- tokens %>%
left_join(select(df_all, url, from, date)) %>%
filter(!is.na(word))
## Joining, by = "url"
Overall word counts: for each candidate, the number of posts in which each word appears (keeping only words that appear in more than 10).
tokens_count <- tokens %>%
group_by(word, from) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>%
arrange(from, desc(sum))
Comparing the keyword 「我」 ('I') across candidates (all posts):
tokens_count %>%
filter(word == "我") %>%
ggplot(aes(word, sum, fill = from)) +
geom_col() +
facet_grid(from ~ .) +
xlab(NULL) +
coord_flip()
Comparing the keyword 「我們」 ('we') across candidates (all posts):
tokens_count %>%
filter(word == "我們") %>%
ggplot(aes(word, sum, fill = from)) +
geom_col() +
facet_grid(from ~ .) +
xlab(NULL) +
coord_flip()
Comparing candidates' mentions of 「我」 over time:
month_term_counts <- tokens %>%
mutate(month = month(date)) %>%
group_by(word, month) %>%
mutate(month_total = sum(count))
# month_term_counts %>%
# filter(word %in% "我") %>%
# ggplot(aes(month, count / month_total)) +
# geom_point() +
# geom_smooth(na.rm = T) +
# facet_wrap(~ from, scales = "free_y") +
# scale_y_continuous(labels = scales::percent_format()) +
# ylab("% frequency of word in posts")
# Stopwords
stopwords <- scan(file = "dict/stopwords_tc.txt", what = character(),
sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")
stopwords_df <- data.frame(word=stopwords)
head(stopwords_df)
## word
## 1 啊
## 2 阿
## 3 哎
## 4 哎呀
## 5 哎喲
## 6 唉
Tidy the format: load the NTUSD sentiment dictionary and reshape it into a single data frame.
data("NTUSD")
positive <- data.frame(word = NTUSD$positive_cht, sentiment="positive")
negative <- data.frame(word = NTUSD$negative_cht, sentiment="negative")
NTUSD_ch <- rbind(positive, negative)
head(NTUSD_ch)
## word sentiment
## 1 一帆風順 positive
## 2 一帆風順的 positive
## 3 一流 positive
## 4 一致 positive
## 5 一致的 positive
## 6 了不起 positive
Counts and proportions of positive and negative words in the NTU (NTUSD) dictionary: negative sentiment words far outnumber positive ones.
NTUSD_ch %>%
group_by(sentiment) %>%
tally() %>%
ungroup() %>%
mutate(total = sum(n),
proportion = n/total)
## # A tibble: 2 x 4
## sentiment n total proportion
## <fct> <int> <int> <dbl>
## 1 positive 2810 11087 0.253
## 2 negative 8277 11087 0.747
The LIWC dictionary is incomplete, so it is not used here for now.
# p <- read_file("dict/liwc/positive.txt")
# n <- read_file("dict/liwc/negative.txt")
# positive <- strsplit(p, ",")[[1]]
# negative <- strsplit(n, ",")[[1]]
# positive <- data.frame(word = positive, sentiment = "positive", stringsAsFactors = FALSE)
# negative <- data.frame(word = negative, sentiment = "negative", stringsAsFactors = FALSE)
# LIWC_ch <- rbind(positive, negative)
word_count <- tokens %>%
# remove stopwords
anti_join(stopwords_df) %>%
select(word, count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count > 10)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# Word cloud
wordcloud2(word_count)
The 15 most common positive and negative words across all posts:
word_count_sent <- word_count %>%
# join with the sentiment dictionary
inner_join(NTUSD_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# remove duplicated rows
word_count_sent <- word_count_sent[!duplicated(word_count_sent),]
word_count_sent %>%
group_by(sentiment) %>%
top_n(15, wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to Sentiment (SUM)", x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
Counts and proportions of positive and negative sentiment words in the corpus:
word_count_sent %>%
group_by(sentiment) %>%
summarise(n = sum(count)) %>% ungroup() %>%
mutate(total = sum(n),
proportion = n/total)
## # A tibble: 2 x 4
## sentiment n total proportion
## <fct> <int> <int> <dbl>
## 1 positive 26957 33663 0.801
## 2 negative 6706 33663 0.199
So although the dictionary contains far more negative words, positive words occur far more often in the corpus.
tokens_sent <- tokens %>% inner_join(NTUSD_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
tokens_sent <- tokens_sent[!duplicated(tokens_sent),]
sentiment_weight <- tokens_sent %>%
# score positive words 1 and negative words -1
mutate(sentiment=ifelse(sentiment %in% "positive", 1, -1))
Mean sentiment score of each candidate's posts:
sentiment_weight %>%
group_by(from) %>%
summarise(mean_wt = mean(sentiment, na.rm = T)) %>%
arrange(desc(mean_wt)) %>%
ggplot(aes(x=from, y=mean_wt, fill=from)) +
geom_col(show.legend = F) +
labs(x=NULL) +
theme(axis.text.x = element_text(size=10, angle=30))
丁守中 looks rather negative here, while 鄭文燦 is the most positively radiant.
But this reading is unreliable: positive words vastly outnumber negative ones overall, so it is odd that only 丁守中 comes out negative. Let's try standardizing first and see.
Gross National Happiness (GNH)
GNH is operationally defined as the standardized difference between the use of positive and negative words:
\[GNH_d = \frac{\mu_{pd}-\mu_p}{\sigma_p}-\frac{\mu_{nd}-\mu_n}{\sigma_n}\]
That is, a given day's GNH = (that day's mean positive-word count − the overall mean positive-word count) / the overall standard deviation of positive-word counts − (that day's mean negative-word count − the overall mean negative-word count) / the overall standard deviation of negative-word counts.
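A minimal sketch of this formula as an R function (the argument names are illustrative: pos_day/neg_day are a given day's mean positive/negative word counts, and the remaining arguments are the baseline means and standard deviations):
gnh <- function(pos_day, neg_day, mu_p, sd_p, mu_n, sd_n) {
  (pos_day - mu_p) / sd_p - (neg_day - mu_n) / sd_n
}
# Using the first 丁守中粉絲團 row of the GNH_day table computed below:
gnh(pos_day = 1, neg_day = 0, mu_p = 1.21875, sd_p = 0.7815269,
    mu_n = 1.252, sd_n = 0.8241437)  # ≈ 1.239252, matching that row's GNH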
Wordiness
On a per-post basis, compute a positivity score (percent of words that were positive) and a negativity score (ibid.).
Computed as:
- Positivity = positive word count / total post word count
- Negativity = negative word count / total post word count
One ambiguity in computing `wordiness`: does the "total post word count" in the denominator include words outside the sentiment dictionary, or only the sentiment words themselves? Both readings are computed below: `wordiness` divides by sentiment words only, while `wordiness_all` divides by all words.
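A toy illustration of the difference between the two denominators, with made-up numbers (a post containing 3 positive words, 2 negative words, and 45 other words):
pos <- 3; neg <- 2; total <- 3 + 2 + 45
pos / (pos + neg)  # 0.60: share among sentiment words only (wordiness below)
pos / total        # 0.06: share of all words in the post (wordiness_all below)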
# Total positive/negative sentiment-word counts
wordiness <- tokens_sent %>%
  group_by(from, sentiment) %>%
  summarize(senti_count = sum(count)) %>%
  inner_join(
    # total sentiment-word count per candidate
    tokens_sent %>% group_by(from) %>% summarize(n = sum(count)) %>% select(from, n) %>% rename(senti_total = n)
  ) %>%
  inner_join(
    # total word count across all posts per candidate
    tokens %>% group_by(from) %>% summarize(n = sum(count)) %>% select(from, n)
  ) %>%
  mutate(wordiness = senti_count/senti_total,
         wordiness_all = senti_count/n)
## Joining, by = "from"
## Joining, by = "from"
head(wordiness)
## # A tibble: 6 x 7
## # Groups: from [3]
## from sentiment senti_count senti_total n wordiness wordiness_all
## <chr> <fct> <int> <int> <int> <dbl> <dbl>
## 1 丁守中粉絲團 positive 2330 4834 35998 0.482 0.0647
## 2 丁守中粉絲團 negative 2504 4834 35998 0.518 0.0696
## 3 侯友宜 positive 1449 2152 16614 0.673 0.0872
## 4 侯友宜 negative 703 2152 16614 0.327 0.0423
## 5 姚文智翻台北 positive 1651 2384 18575 0.693 0.0889
## 6 姚文智翻台北 negative 733 2384 18575 0.307 0.0395
wordiness %>%
ggplot(aes(from, wordiness, fill = sentiment)) +
geom_col() +
xlab(NULL) +
coord_flip() +
theme_light()
Judging from wordiness first (red = positive words, blue = negative): overall, every candidate uses more positive words, except 丁守中, whose positive and negative counts are roughly equal, with slightly more negative ones. That is why only his sentiment score came out negative.
So let's see which words 丁守中 uses that push his negative sentiment score so high!
sentiment_weight %>%
filter(from == "丁守中粉絲團" & sentiment == -1) %>%
group_by(word) %>%
summarize(count = sum(count)) %>%
filter(count > 2) %>%
wordcloud2()
In the negative word cloud, the most frequent word is 「問題」 ('problem'): 丁守中 loves talking about problems, along with 嚴重 ('serious'), 不能, 不是, 不要, 鬥爭 ('struggle'), and so on. Absurdly, 「台獨」 ('Taiwan independence') also counts as a negative word?! The NTU sentiment dictionary caught smuggling in politics. Let's trace exactly which "serious problems" 丁守中 keeps raising.
sentiment_weight %>%
  filter(from == "丁守中粉絲團" & word == "問題") %>%
  pull(url) %>%
  n_distinct()
## [1] 57
Posts that mention 問題 number 57 over these four-plus months; with 393 丁守中 posts in the data set, that is roughly 15%.
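A quick check of that proportion:
57 / 393  # ≈ 0.145, roughly 15%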
https://www.facebook.com/600540963315152/posts/1711190072250230: the first "problem" is student housing, framed as political manipulation; OK, that's negative, fair enough, the post attacks an opponent. http://www.facebook.com/600540963315152/posts/1714361781933059: this "problem" is the 社子島 issue, with no attack. And so on; given time, one could read through every problem he raises. At least so far, it looks like Taipei's future is indeed not with 丁守中.
Next, compute each candidate's daily GNH.
GNH_day <- tokens_sent %>%
# daily mean positive/negative word counts
group_by(from, date, sentiment) %>%
summarize(mu_day = mean(count)) %>%
dcast(from + date ~ sentiment, value.var = "mu_day", fill = 0) %>%
mutate(month = month(date)) %>%
inner_join(
# monthly positive/negative means
tokens_sent %>%
mutate(month=month(date)) %>%
group_by(from, month, sentiment) %>%
summarize(mu_mon = mean(count)) %>%
dcast(from + month ~ sentiment, value.var = "mu_mon") %>%
rename(mu_p = positive, mu_n = negative)
) %>%
inner_join(
# monthly positive/negative standard deviations
tokens_sent %>%
mutate(month=month(date)) %>%
group_by(from, month, sentiment) %>%
summarize(sd_mon = sd(count)) %>%
dcast(from + month ~ sentiment, value.var = "sd_mon") %>%
rename(sd_p = positive, sd_n = negative)
) %>% select(-month) %>%
mutate(GNH = (positive - mu_p)/sd_p - (negative - mu_n)/sd_n)
## Joining, by = c("from", "month")
## Joining, by = c("from", "month")
head(GNH_day)
## from date positive negative mu_p mu_n sd_p sd_n
## 1 丁守中粉絲團 2018-07-02 1.000000 0.000000 1.21875 1.252 0.7815269 0.8241437
## 2 丁守中粉絲團 2018-07-03 1.250000 1.000000 1.21875 1.252 0.7815269 0.8241437
## 3 丁守中粉絲團 2018-07-05 1.111111 1.000000 1.21875 1.252 0.7815269 0.8241437
## 4 丁守中粉絲團 2018-07-06 1.400000 1.200000 1.21875 1.252 0.7815269 0.8241437
## 5 丁守中粉絲團 2018-07-08 1.192308 1.142857 1.21875 1.252 0.7815269 0.8241437
## 6 丁守中粉絲團 2018-07-09 1.000000 1.000000 1.21875 1.252 0.7815269 0.8241437
## GNH
## 1 1.23925174
## 2 0.34575774
## 3 0.16804296
## 4 0.29501359
## 5 0.09859767
## 6 0.02587113
With everything computed, let's look at how each candidate's daily GNH changes.
GNH_day %>%
filter(from %in% c("丁守中粉絲團", "柯文哲", "姚文智翻台北")) %>%
ggplot(aes(x = date, y = GNH, color = from)) +
geom_line(alpha=0.8) +
# scale_colour_brewer(type = "div", palette = 4) +
theme_light() +
labs(title = "台北市長候選人GNH", x = "月份", color="候選人")
GNH_day %>%
filter(from %in% c("蘇貞昌", "侯友宜")) %>%
ggplot(aes(x = date, y = GNH, color = from)) +
geom_line(alpha=0.8) +
# scale_colour_brewer(type = "div", palette = 4) +
theme_light() +
labs(title = "新北市長候選人GNH", x = "月份", color="候選人")
GNH_day %>%
filter(from %in% c("鄭文燦", "陳學聖")) %>%
ggplot(aes(x = date, y = GNH, color = from)) +
geom_line(alpha=0.8) +
# scale_colour_brewer(type = "div", palette = 4) +
theme_light() +
labs(title = "桃園市長候選人GNH", x = "月份", color="候選人")
GNH_day %>%
filter(from %in% c("盧秀燕", "林佳龍")) %>%
ggplot(aes(x = date, y = GNH, color = from)) +
geom_line(alpha=0.8) +
# scale_colour_brewer(type = "div", palette = 4) +
theme_light() +
labs(title = "台中市長候選人GNH", x = "月份", color="候選人")
GNH_day %>%
filter(from %in% c("立法委員 黃偉哲", "高思博 A Po", "林義豐MarkLin")) %>%
ggplot(aes(x = date, y = GNH, color = from)) +
geom_line(alpha=0.8) +
# scale_colour_brewer(type = "div", palette = 4) +
theme_light() +
labs(title = "台南市長候選人GNH", x = "月份", color="候選人")
## Warning: Removed 2 rows containing missing values (geom_path).
Mark Lin dips sharply negative on 8/4 because of this post: http://www.facebook.com/1912587195640543/posts/2196850957214164. It turns out 追蹤 ('follow') is defined as a negative word in the NTU dictionary, and 「熱線追蹤」 appears many times, which makes the post look very negative.
By contrast, A Po's 8/4 post, http://www.facebook.com/339483189800311/posts/556060864809208, contains 「智慧」 ('wisdom') and 「完整」 ('complete') and hardly any negative words, hence its very high positive GNH.
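Such spot checks can be confirmed by pulling the sentiment words for a given candidate and day, as is done for 韓國瑜 further below; a sketch:
sentiment_weight %>%
  filter(from == "林義豐MarkLin", date == as.Date("2018-08-04")) %>%
  arrange(sentiment, desc(count))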
GNH_day %>%
filter(from %in% c("陳其邁 Chen Chi-Mai", "韓國瑜")) %>%
ggplot(aes(x = date, y = GNH, color = from)) +
geom_line(alpha=0.8) +
# scale_colour_brewer(type = "div", palette = 4) +
theme_light() +
labs(title = "高雄市長候選人GNH", x = "月份", color="候選人")
GNH_day %>%
filter(from %in% c("韓國瑜"), GNH < 0) %>%
select(date, GNH) %>%
arrange(GNH)
## date GNH
## 1 2018-07-19 -6.38707833
## 2 2018-07-06 -2.70627753
## 3 2018-07-25 -2.70627753
## 4 2018-11-02 -0.97095484
## 5 2018-09-06 -0.95547765
## 6 2018-08-18 -0.83412764
## 7 2018-09-21 -0.83169236
## 8 2018-10-22 -0.79197830
## 9 2018-10-06 -0.70583193
## 10 2018-08-26 -0.61045324
## 11 2018-08-19 -0.52801746
## 12 2018-11-15 -0.47081343
## 13 2018-09-13 -0.45151582
## 14 2018-08-23 -0.44948985
## 15 2018-08-22 -0.39504435
## 16 2018-11-18 -0.39325456
## 17 2018-11-21 -0.39325456
## 18 2018-11-13 -0.38242561
## 19 2018-08-31 -0.25829515
## 20 2018-11-20 -0.21022988
## 21 2018-07-10 -0.19927614
## 22 2018-07-30 -0.09137144
## 23 2018-09-01 -0.07102411
## 24 2018-09-04 -0.07102411
## 25 2018-09-05 -0.07102411
## 26 2018-09-07 -0.07102411
## 27 2018-09-11 -0.07102411
## 28 2018-09-12 -0.07102411
## 29 2018-09-16 -0.07102411
## 30 2018-07-07 -0.04061455
## 31 2018-07-13 -0.04061455
## 32 2018-07-14 -0.04061455
## 33 2018-07-15 -0.04061455
## 34 2018-07-23 -0.04061455
## 35 2018-07-26 -0.04061455
## 36 2018-10-02 -0.01135355
## 37 2018-10-11 -0.01135355
## 38 2018-10-31 -0.01135355
sentiment_weight %>% filter(from == "韓國瑜", date == "2018-07-19")
## # A tibble: 3 x 6
## url word count from date sentiment
## <chr> <chr> <int> <chr> <date> <dbl>
## 1 http://www.facebook.com/18630235239348… 幫助 1 韓國瑜… 2018-07-19 1
## 2 http://www.facebook.com/18630235239348… 成功 1 韓國瑜… 2018-07-19 1
## 3 http://www.facebook.com/18630235239348… 挑戰 5 韓國瑜… 2018-07-19 -1
tokens_sent %>%
mutate(month = month(date)) %>%
group_by(from, month) %>%
summarise(total_posts = n_distinct(url)) %>%
  # assign an electoral district to each candidate
  mutate(area = case_when(
    from %in% c("丁守中粉絲團", "姚文智翻台北", "柯文哲") ~ "台北",
    from %in% c("侯友宜", "蘇貞昌") ~ "新北",
    from %in% c("鄭文燦", "陳學聖") ~ "桃園",
    from %in% c("盧秀燕", "林佳龍") ~ "台中",
    from %in% c("立法委員 黃偉哲", "林義豐MarkLin", "高思博 A Po") ~ "台南",
    from %in% c("韓國瑜", "陳其邁 Chen Chi-Mai") ~ "高雄",
    TRUE ~ "None"
  )) %>%
  # plot
ggplot(aes(x=month, y=total_posts, color=from)) +
geom_line() +
geom_point() +
facet_wrap(~ area, nrow = 3) +
labs(title = "各選區候選人每月貼文數量",
x = "月份", y = "總貼文數量", color="候選人")
Since we have monthly data here, I define the "all days" baseline per candidate and per month when totaling positive and negative words.
# Monthly positive/negative word totals per candidate
tokens_sent %>%
mutate(month = month(date)) %>%
group_by(from, month, sentiment) %>%
summarise(total_words = sum(count)) %>%
inner_join(
# monthly total post counts per candidate
tokens_sent %>%
mutate(month = month(date)) %>%
group_by(from, month) %>%
summarise(total_posts = n_distinct(url))
)
## Joining, by = c("from", "month")
## # A tibble: 140 x 5
## # Groups: from, month [70]
## from month sentiment total_words total_posts
## <chr> <dbl> <fct> <int> <int>
## 1 丁守中粉絲團 7 positive 312 48
## 2 丁守中粉絲團 7 negative 313 48
## 3 丁守中粉絲團 8 positive 698 77
## 4 丁守中粉絲團 8 negative 769 77
## 5 丁守中粉絲團 9 positive 475 83
## 6 丁守中粉絲團 9 negative 490 83
## 7 丁守中粉絲團 10 positive 615 99
## 8 丁守中粉絲團 10 negative 635 99
## 9 丁守中粉絲團 11 positive 230 27
## 10 丁守中粉絲團 11 negative 297 27
## # … with 130 more rows
# Daily sentiment scores
tokens_sent_score <- tokens_sent %>%
# signed daily score: +count for positive words, -count for negative
mutate(swt = count * ifelse(sentiment %in% "positive", 1, -1)) %>%
group_by(from, date) %>%
summarize(swt = sum(swt))
for (person in c("丁守中粉絲團", "柯文哲", "姚文智翻台北")) {
p <- GNH_day %>% filter(from==person) %>%
left_join(tokens_sent_score %>%
filter(from==person) %>%
select(from, date, swt)) %>%
ggplot(aes(x=date)) +
geom_line(aes(y = swt, color = "Weight")) +
geom_line(aes(y = GNH*10, color = "GNH")) +
scale_y_continuous(sec.axis = sec_axis(~., name = "GNH*10")) +
scale_color_manual(values = c("purple", "#E69F00")) +
labs(title = paste0(person, "貼文情緒分數總分&GNH"), x = "月份")
print(p)
}
## Joining, by = c("from", "date")
## Joining, by = c("from", "date")
## Joining, by = c("from", "date")
GNH does make the score swings look smaller, though some of them may not reflect the actual situation.
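A rough way to quantify that impression, as a sketch using the objects above (it compares the spread of the two series exactly as plotted: the raw daily score against GNH scaled by 10):
GNH_day %>%
  inner_join(tokens_sent_score) %>%
  group_by(from) %>%
  summarize(sd_weight = sd(swt, na.rm = TRUE),
            sd_GNH10  = sd(GNH * 10, na.rm = TRUE))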
for (person in c("侯友宜", "蘇貞昌")) {
p <- GNH_day %>% filter(from==person) %>%
left_join(tokens_sent_score %>%
filter(from==person) %>%
select(from, date, swt)) %>%
ggplot(aes(x=date)) +
geom_line(aes(y = swt, color = "Weight")) +
geom_line(aes(y = GNH*10, color = "GNH")) +
scale_y_continuous(sec.axis = sec_axis(~., name = "GNH*10")) +
scale_color_manual(values = c("purple", "#E69F00")) +
labs(title = paste0(person, "貼文情緒分數總分&GNH"), x = "月份")
print(p)
}
## Joining, by = c("from", "date")
## Joining, by = c("from", "date")
GNH_day %>%
group_by(from) %>%
summarize(GNH = mean(GNH, na.rm = TRUE)) %>%
arrange(desc(GNH)) %>%
mutate(from = factor(from, levels = rev(unique(from)))) %>%
ggplot(aes(from, GNH))+
geom_col() +
coord_flip() +
xlab(NULL)
#### Relationship between candidates' GNH and sentiment scores
cmp_wst_gnh <- sentiment_weight %>%
bind_tf_idf(word, url, count) %>%
arrange(desc(tf_idf), word) %>%
group_by(from, word) %>%
summarize(sentiment = mean(sentiment),
tf = mean(tf),
tf_idf = mean(tf_idf)) %>%
mutate(wst = tf_idf * sentiment) %>%
group_by(from) %>%
summarise(wst=mean(wst)) %>%
inner_join(
GNH_day %>%
group_by(from) %>%
summarize(GNH = mean(GNH, na.rm = TRUE))
)
## Joining, by = "from"
cmp_wst_gnh %>%
ggplot(aes(wst, GNH)) +
geom_point(aes(color = from))
sentiment_weight %>%
bind_tf_idf(word, url, count) %>%
filter(!duplicated(.)) %>%
group_by(from) %>%
top_n(15) %>% ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
select(from, word, tf_idf) %>%
ggplot(aes(word, tf_idf, fill = from)) +
geom_col(show.legend = FALSE) +
facet_wrap(~from, scales = "free") +
labs(y = "TF-IDF", x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Selecting by tf_idf
sentiment_weight %>%
bind_tf_idf(word, url, count) %>%
arrange(desc(tf_idf), word) %>%
group_by(from, word) %>%
summarize(sentiment = mean(sentiment),
tf = mean(tf),
tf_idf = mean(tf_idf)) %>%
mutate(wst = tf_idf * sentiment) %>%
group_by(from) %>%
summarise(wst=mean(wst)) %>%
arrange(desc(wst)) %>%
mutate(from = factor(from, levels = rev(unique(from)))) %>%
ggplot(aes(from, wst))+
geom_col() +
coord_flip() +
xlab(NULL)
After weighting the sentiment scores by tf-idf, the posts of 柯文哲, 丁守中, 高思博, and 陳學聖 come out comparatively negative.
df_dtm <- tokens %>%
cast_dtm(url, word, count)
df_dtm
## <<DocumentTermMatrix (documents: 4364, terms: 37148)>>
## Non-/sparse entries: 268539/161845333
## Sparsity : 100%
## Maximal term length: 9
## Weighting : term frequency (tf)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
df_dtm <- removeSparseTerms(df_dtm, sparse=.99)
# Find the sum of words in each Document
row_totals <- apply(df_dtm , 1, sum)
# remove all docs without words
df_dtm <- df_dtm[row_totals > 0, ]
df_dtm
## <<DocumentTermMatrix (documents: 4353, terms: 1049)>>
## Non-/sparse entries: 139103/4427194
## Sparsity : 97%
## Maximal term length: 4
## Weighting : term frequency (tf)
df_lda <- LDA(df_dtm, k = 8, control = list(seed = 1234))
dtm_topic <- tidy(df_lda, matrix = "beta") %>%
distinct(topic, term, beta) %>%
filter(!term %in% stopwords)
dtm_topic <- dtm_topic[!duplicated(dtm_topic),]
dtm_topic %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(text = element_text(family = "黑體-繁 中黑"))
library(text2vec)
##
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
##
## perplexity
library(lda)
article_words <- lapply(as.character(df_sentences$sentence),
function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[!str_detect(tokens, regex("[0-9a-zA-Z]"))]
# drop single-character tokens
tokens <- tokens[nchar(tokens) > 1]
tokens <- tokens[!(tokens %in% stopwords)]
return(tokens)
})
it <- itoken(article_words)
v <- create_vocabulary(it)
v <- prune_vocabulary(v, term_count_min = 10, doc_proportion_max = 0.2)
vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it, vectorizer, type = "dgTMatrix")
lda_model <- LDA$new(n_topics = 4,
doc_topic_prior = 0.1,
topic_word_prior = 0.01)
doc_topic_distr <- lda_model$fit_transform(x = dtm, n_iter = 1000,
convergence_tol = 0.001, n_check_convergence = 25,
progressbar = FALSE)
## INFO [2020-03-02 07:54:20] iter 25 loglikelihood = -1734162.907
## INFO [2020-03-02 07:54:20] iter 50 loglikelihood = -1708880.586
## INFO [2020-03-02 07:54:21] iter 75 loglikelihood = -1698748.711
## INFO [2020-03-02 07:54:21] iter 100 loglikelihood = -1695153.285
## INFO [2020-03-02 07:54:22] iter 125 loglikelihood = -1692619.740
## INFO [2020-03-02 07:54:23] iter 150 loglikelihood = -1691371.136
## INFO [2020-03-02 07:54:23] early stopping at 150 iteration
barplot(doc_topic_distr[1, ], xlab = "topic",
ylab = "proportion", ylim = c(0, 1),
names.arg = 1:ncol(doc_topic_distr))
lda_model$plot()
## Loading required namespace: servr