This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
packages = c("readr","tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(readr)
## Loading required package: readr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(tidytext)
## Loading required package: tidytext
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
require(tidyr)
## Loading required package: tidyr
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
## Loading required package: wordcloud2
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
load("data.rda")
df = df %>% select(id,Message)
colnames(df) = c("artUrl","Message")
sample_sentences <- strsplit(df$Message,"[。!;?!?;]")
sample_sentences<- data.frame(
artUrl = rep(df$artUrl, sapply(sample_sentences, length)),
sentence = unlist(sample_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
sample_sentences$sentence <- as.character(sample_sentences$sentence)
lexicon <- scan(file = "./data/news_lexicon.txt", what=character(),sep="\n",
encoding='UTF-8')
lexicon
## [1] "<U+FEFF>又老又窮\n賴清德\n林佳龍\n韓國瑜\n陳其邁\n蔡英文\n柯文哲\n丁守中\n姚文智\n蘇貞昌\n朱立倫\n陳學聖\n鄭文燦\n侯友宜\n選戰"
## [2] "抹黑"
## [3] "高雄發大財\n發大財\n盧秀燕\n貨出得去"
# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker(user = "./data/news_lexicon.txt",write = "NOFILE",stop_word = "stop_words.txt")
# 使用還願字典重新斷詞
# new_user_word(jieba_tokenizer, c(lexicon))
# tokenize function
chi_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
tokens <- sample_sentences %>%
unnest_tokens(word, sentence,token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word) %>%
rename(count=n)
tokens
news <- tokens %>%
mutate(artId = group_indices(., artUrl))
news
news_dtm <-news %>% cast_dtm(artId, word, count)
news_dtm
## <<DocumentTermMatrix (documents: 26856, terms: 39744)>>
## Non-/sparse entries: 405114/1066959750
## Sparsity : 100%
## Maximal term length: 9
## Weighting : term frequency (tf)
news_lda <- LDA(news_dtm, k = 6, control = list(seed = 1234))
news_topics <- tidy(news_lda, matrix = "beta")
remove_words <- c("選戰", "新聞","頭殼","新頭殼","選舉","生活圈","市長","下載","直播","候選人","戰情","大家","點到")
news_top_terms <- news_topics %>%
filter(! term %in% remove_words) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic,-beta)
news_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
>從圖表可得知,這六個topic大部分都是與各直轄市長選舉有關。但是有四個topic都有提到韓國瑜,代表他在新聞媒體討論的熱度依舊很高。
beta_spread <- news_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic4 > .0004 | topic6 > .0004) %>%
mutate(log_ratio = log2(topic6 / topic4))
針對topic 4, topic6 進行分析,分析哪些詞彙 很常出現在topic 4,但很少出現在topic6的詞彙。
news_topic_ratio <- rbind(beta_spread %>% top_n(10,wt = log_ratio), beta_spread %>% top_n(-10, log_ratio)) %>%
arrange(log_ratio)
news_topic_ratio %>%
ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
geom_bar(stat="identity") +
xlab("Word")+
coord_flip()
> 左下角為topic4常出現的詞,右上角為top6常出現的字,可以發現topic4主要討論焦點為北農總經理的事情。
reserved_word <- news %>%
group_by(word) %>%
count() %>%
filter(n > 3) %>%
unlist()
news_removed <- news %>%
filter( word %in% reserved_word)
news_dtm_removed <- news_removed %>% cast_dtm(artId, word, count)
news_lda_removed <- LDA(news_dtm_removed, k = 6, control = list(seed = 1234))
news_topics_removed <- tidy(news_lda_removed, matrix = "beta")
移除所有出現在三篇文章以下的詞彙
news_topics_removed %>%
filter(! term %in% remove_words) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta) %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
>我們可看圖表發現,topic1、topic5出現的政治人物只有柯文哲,與其他topic不同,其他的topic明顯都是各直轄市市長選舉,所以我把topic1及topic5各字詞的beta值做log_ratio的比較。
news_topics_removed <- tidy(news_lda_removed, matrix = "beta")
beta_spread_removed <- news_topics_removed %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .0004 | topic5 > .0004) %>%
mutate(log_ratio = log2(topic5 / topic1))
news_removed_topic_ratio <- rbind(beta_spread_removed %>% top_n(10,wt = log_ratio), beta_spread_removed %>% top_n(-10, log_ratio)) %>%
arrange(log_ratio)
news_removed_topic_ratio %>%
ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
geom_bar(stat="identity") +
coord_flip()
> 左下角的詞為topic1常出現但是topic5少出現的詞,右上角反之,我們仔細看看左下角的字詞,可發現topic1主要就是在討論柯文哲當年的器官案。
news_documents_removed <- tidy(news_lda_removed, matrix="gamma") # 在tidy function中使用參數"gamma"來取得 theta矩陣。
news_documents_removed
gamma值代表的是這篇文章中有多少比例的詞是出自於特定topic
news_documents_removed$document<- news_documents_removed$document %>% as.integer()
news_documents_removed %>%
group_by(document) %>%
top_n(1,gamma) %>%
arrange(topic) %>%
inner_join(news %>% distinct(artUrl,artId), by=c("document" = "artId")) %>%
inner_join(df, by="artUrl") %>%
select(topic, Message)
## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector
## Adding missing grouping variables: `document`
使用比率較高的topic作為代表topic,觀察不同Topic的本文
news_documents_removed$document <- news_documents_removed$document %>% as.integer()
news_documents_removed %>%
group_by(topic) %>%
top_n(10, wt=gamma) %>%
inner_join(news, by = c("document" = "artId")) %>%
distinct(artUrl) %>%
inner_join(df, by = "artUrl") %>%
select(topic, Message)
## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector