This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

安裝需要的packages

packages = c("readr","tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

require(readr)

## Loading required package: readr

require(tm)

## Loading required package: tm

## Loading required package: NLP

require(data.table)

## Loading required package: data.table

require(dplyr)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(stringr)

## Loading required package: stringr

require(jiebaR)

## Loading required package: jiebaR

## Loading required package: jiebaRD

require(tidytext)

## Loading required package: tidytext

require(ggplot2)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

require(tidyr)

## Loading required package: tidyr

require(topicmodels)

## Loading required package: topicmodels

require(LDAvis)

## Loading required package: LDAvis

require(wordcloud2)

## Loading required package: wordcloud2

require(webshot)

## Loading required package: webshot

require(htmlwidgets)

## Loading required package: htmlwidgets

require(servr)

## Loading required package: servr

load("data.rda")
df = df %>% select(id,Message)
colnames(df) = c("artUrl","Message")

斷句

sample_sentences <- strsplit(df$Message,"[。！；？!?;]")
sample_sentences<- data.frame(
                        artUrl = rep(df$artUrl, sapply(sample_sentences, length)), 
                        sentence = unlist(sample_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
sample_sentences$sentence <- as.character(sample_sentences$sentence)

載入各種字典

 lexicon <- scan(file = "./data/news_lexicon.txt", what=character(),sep="\n", 
                    encoding='UTF-8')
lexicon

## [1] "<U+FEFF>又老又窮\n賴清德\n林佳龍\n韓國瑜\n陳其邁\n蔡英文\n柯文哲\n丁守中\n姚文智\n蘇貞昌\n朱立倫\n陳學聖\n鄭文燦\n侯友宜\n選戰"
## [2] "抹黑"                                                                                                                  
## [3] "高雄發大財\n發大財\n盧秀燕\n貨出得去"

Tokenization

# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker(user = "./data/news_lexicon.txt",write  = "NOFILE",stop_word = "stop_words.txt")
# 使用還願字典重新斷詞
# new_user_word(jieba_tokenizer, c(lexicon))
# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
     tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}
tokens <- sample_sentences %>%
  unnest_tokens(word, sentence,token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens

加入識別文章的id,將資料轉換為Document Term Matrix (DTM)

news <- tokens %>%
  mutate(artId = group_indices(., artUrl))
news

news_dtm <-news %>% cast_dtm(artId, word, count)
news_dtm

## <<DocumentTermMatrix (documents: 26856, terms: 39744)>>
## Non-/sparse entries: 405114/1066959750
## Sparsity           : 100%
## Maximal term length: 9
## Weighting          : term frequency (tf)

建立LDA模型

\(\phi\) Matrix

news_lda <- LDA(news_dtm, k = 6, control = list(seed = 1234))
news_topics <- tidy(news_lda, matrix = "beta")

尋找Topic的代表字

remove_words <- c("選戰", "新聞","頭殼","新頭殼","選舉","生活圈","市長","下載","直播","候選人","戰情","大家","點到")
news_top_terms <- news_topics %>%
  filter(! term %in% remove_words) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic,-beta)


news_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

>從圖表可得知，這六個topic大部分都是與各直轄市長選舉有關。但是有四個topic都有提到韓國瑜，代表他在新聞媒體討論的熱度依舊很高。

查看組別間差異最大的詞

beta_spread <- news_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic4 > .0004 | topic6 > .0004) %>%
  mutate(log_ratio = log2(topic6 / topic4))

針對topic 4, topic6 進行分析，分析哪些詞彙 很常出現在topic 4，但很少出現在topic6的詞彙。

news_topic_ratio <- rbind(beta_spread %>% top_n(10,wt = log_ratio), beta_spread %>% top_n(-10, log_ratio)) %>%
  arrange(log_ratio)

news_topic_ratio %>% 
  ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
  geom_bar(stat="identity") + 
  xlab("Word")+
  coord_flip()

> 左下角為topic4常出現的詞，右上角為top6常出現的字，可以發現topic4主要討論焦點為北農總經理的事情。

reserved_word <- news %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 3) %>% 
  unlist()

news_removed <- news %>% 
  filter( word %in% reserved_word)
news_dtm_removed <- news_removed %>% cast_dtm(artId, word, count)
news_lda_removed <- LDA(news_dtm_removed, k = 6, control = list(seed = 1234))
news_topics_removed <- tidy(news_lda_removed, matrix = "beta")

移除所有出現在三篇文章以下的詞彙

news_topics_removed %>%
  filter(! term %in% remove_words) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

>我們可看圖表發現，topic1、topic5出現的政治人物只有柯文哲，與其他topic不同，其他的topic明顯都是各直轄市市長選舉，所以我把topic1及topic5各字詞的beta值做log_ratio的比較。

news_topics_removed <- tidy(news_lda_removed, matrix = "beta")

beta_spread_removed <- news_topics_removed %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .0004 | topic5 > .0004) %>%
  mutate(log_ratio = log2(topic5 / topic1))

news_removed_topic_ratio <- rbind(beta_spread_removed %>% top_n(10,wt = log_ratio), beta_spread_removed %>% top_n(-10, log_ratio)) %>% 
  arrange(log_ratio)

news_removed_topic_ratio %>% 
  ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
  geom_bar(stat="identity") + 
  coord_flip()

> 左下角的詞為topic1常出現但是topic5少出現的詞，右上角反之，我們仔細看看左下角的字詞，可發現topic1主要就是在討論柯文哲當年的器官案。

\(\theta\) matrix (document * topic)

查看\(\theta\) matrix

news_documents_removed <- tidy(news_lda_removed, matrix="gamma") # 在tidy function中使用參數"gamma"來取得 theta矩陣。
news_documents_removed

gamma值代表的是這篇文章中有多少比例的詞是出自於特定topic

簡單的主題分類

news_documents_removed$document<- news_documents_removed$document %>% as.integer()
news_documents_removed %>% 
  group_by(document) %>% 
  top_n(1,gamma) %>% 
  arrange(topic) %>% 
  inner_join(news %>% distinct(artUrl,artId), by=c("document" = "artId")) %>% 
   inner_join(df, by="artUrl") %>% 
  select(topic, Message)

## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector

## Adding missing grouping variables: `document`

使用比率較高的topic作為代表topic，觀察不同Topic的本文

news_documents_removed$document <- news_documents_removed$document %>% as.integer()
news_documents_removed %>% 
  group_by(topic) %>% 
  top_n(10, wt=gamma) %>% 
  inner_join(news, by = c("document" = "artId")) %>% 
  distinct(artUrl) %>%
  inner_join(df, by = "artUrl") %>% 
  select(topic, Message)

## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector

R Notebook