Table of Contents

  • Load packages
packages = c("knitr","kableExtra","lubridate","ggplot2","readr","dplyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(knitr)
library(kableExtra)
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)

• Articles: Data Overview

  • Load the data
non_comment_Kaohsiung <- read_csv("non_comment_Kaohsiung.csv")

non_comment_Kaohsiung %>% head()
## # A tibble: 6 x 12
##   url   text  ip    author title time                board_content
##   <chr> <chr> <chr> <chr>  <chr> <dttm>              <chr>        
## 1 http~ "fro~ 180.~ scott~ [閒聊]~ 2014-02-25 05:47:15 Kaohsiung    
## 2 http~ "網誌好~ 36.2~ wildp~ [遊記]~ 2014-03-04 10:13:59 Kaohsiung    
## 3 http~ "網頁好~ 36.2~ wildp~ [遊記]~ 2014-03-12 09:18:42 Kaohsiung    
## 4 http~ "網頁好~ <NA>  wildp~ [遊記]~ 2014-04-11 09:49:36 Kaohsiung    
## 5 http~ "網頁好~ 36.2~ wildp~ [遊記]~ 2014-04-13 14:08:50 Kaohsiung    
## 6 http~ "網頁好~ 36.2~ wildp~ [遊記]~ 2014-04-15 09:03:15 Kaohsiung    
## # ... with 5 more variables: find_word <chr>, board_content_type <chr>,
## #   comment_num <dbl>, imp_word_count <dbl>, route_imp_word_count <dbl>
  • Articles per month
# Add a month variable by flooring each timestamp to the start of its month
month_and_url <- non_comment_Kaohsiung %>%
  select(find_word, url, time) %>%
  mutate(month = floor_date(time, unit = "month")) %>%
  group_by(month, find_word) %>% 
  summarise(counts = n())

month_and_url %>% 
  ggplot()+
  geom_bar(aes(x=month,y=counts,
               fill=factor(find_word)),
           stat = "identity")+
  ggtitle("討論文章數") +
  xlab("日期") + # x-axis label
  ylab("數量")   # y-axis label

+ Share of discussion for each public transport mode

pie_data <- month_and_url %>%
  group_by(find_word) %>%
  summarise(total=sum(counts)) 
pie_data <- pie_data %>% mutate(freq = total / sum(total))
pie_data %>% head()
## # A tibble: 4 x 3
##   find_word total   freq
##   <chr>     <int>  <dbl>
## 1 bike         30 0.0295
## 2 公車        261 0.257 
## 3 捷運        304 0.299 
## 4 輕軌        421 0.414
label <- paste0(round(pie_data$freq * 100, 1), "%") # convert proportions to percent strings
# Draw the pie chart

pie <- ggplot(pie_data , aes(x="", y=freq, fill=find_word)) +
  geom_bar(width = 1, stat ="identity") +
  coord_polar("y", start=0) + # wrap the stacked bar around the y axis to form a pie
  geom_text(aes(x=1, y = c(0.99,0.8,0.6,0.25), label=label)) # label positions set by hand along the angle
pie
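If you would rather not round the percentages by hand, scales::percent() formats proportions directly; a minimal sketch (the scales package is installed in a later section, so load it first):

library(scales)
# e.g. 0.0295 becomes roughly "2.9%"; accuracy controls the rounding step
label <- percent(pie_data$freq, accuracy = 0.1)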

• Sentence Splitting and Word Segmentation

  • Load packages
packages = c("stringr","jiebaR","tidytext","tidyr","NLP","ggraph","igraph","scales","reshape2","widyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(dplyr)
library(stringr)
library(jiebaR)
library(tidytext)
library(tidyr)
library(NLP)
library(ggplot2)
library(ggraph)
library(igraph)
library(scales)
library(reshape2)
library(widyr)

+ Dictionaries, stop words, …

# Sentiment lexicon: Valence (1~9) runs from negative to positive; Arousal (1~9) measures how intense a word is
sentiment <- read_csv("Sentiment.csv")
## Parsed with column specification:
## cols(
##   No. = col_double(),
##   Word = col_character(),
##   Valence_Mean = col_double(),
##   Valence_SD = col_double(),
##   Arousal_Mean = col_double(),
##   Arousal_SD = col_double(),
##   Frequency = col_double()
## )
# Center both scales at 0 (on the raw 1~9 scales, 5 is neutral)
sentiment$Valence_Mean <- sentiment$Valence_Mean - 5
sentiment$Arousal_Mean <- sentiment$Arousal_Mean - 5
sentiment <- sentiment[, c("Word", "Valence_Mean", "Arousal_Mean")] # keep only these columns
sentiment %>% summary
##      Word            Valence_Mean      Arousal_Mean     
##  Length:3552        Min.   :-4.0000   Min.   :-3.60000  
##  Class :character   1st Qu.:-1.6000   1st Qu.:-1.00000  
##  Mode  :character   Median :-0.5000   Median : 0.00000  
##                     Mean   :-0.3136   Mean   : 0.09721  
##                     3rd Qu.: 1.2000   3rd Qu.: 1.00000  
##                     Max.   : 3.6000   Max.   : 3.80000
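After subtracting 5, both 1~9 scales are centered at 0: a word with raw valence 6.2 becomes +1.2 (mildly positive), and one rated 3.4 becomes -1.6 (negative). The summary above confirms the centered ranges (valence -4 to 3.6, arousal -3.6 to 3.8).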
# Needs fixing: the stop-word file does not read cleanly (see the warning), so fall back to a short hard-coded list
stop_words <- scan(file = "stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "stop_words.txt", what = character(), sep = "\n", :
## invalid input found on input connection 'stop_words.txt'
stop_words <- c("人民","末","啊","我","我們","腳踏車","公車","輕軌","捷運")

negation_words <- scan(file = "negation_words.txt",what=character(),sep=',',
                       encoding='utf-8',fileEncoding='utf-8')

# Needs fixing: user_dict.txt fails to read for the same reason, so fall back to a hard-coded list
user_dict <- scan(file = "user_dict.txt", what=character(),sep='\n',
                  encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "user_dict.txt", what = character(), sep = "\n", :
## invalid input found on input connection 'user_dict.txt'
user_dict <- c("韓國瑜","陳菊","高雄市政府","高市府","大順路","試營運")

negation_words %>% head()
## [1] "不"   "未"   "未必" "毫不" "決不" "沒有"
  • Sentence splitting
# Split on full- or half-width exclamation marks, question marks, semicolons, and the full-width period
sentences <- strsplit(non_comment_Kaohsiung$text,"[。!;?!?;]")
# Alternatively, split on newlines; this version overwrites the one above and is the one used downstream.
# The result is a list of vectors, one vector of sentences per article
sentences <- strsplit(non_comment_Kaohsiung$text,"\n")

## Pair each sentence with its article URL
# Number of sentences per article
sentences_num <- lengths(sentences,use.names = F)
# rep() repeats each URL once per sentence; unlist() flattens the list of vectors into one long vector
Url_sentences <- data.frame(artUrl = rep(non_comment_Kaohsiung$url, sentences_num), sentences = unlist(sentences))
Url_sentences %>% head()
##                                                     artUrl
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
##                         sentences
## 1         from 一卡通 ipass粉絲團
## 2                                
## 3                                
## 4 一卡通免費搭市區公車延長到6月底
## 5                                
## 6
  • Word segmentation with jieba
# Initialize a tokenizer with default settings
jieba_tokenizer = worker()

# Re-create the worker with the user dictionary and stop-word files
jieba_tokenizer <- worker(user="user_dict.txt",stop_word = "stop_words.txt")

# Wrapper so unnest_tokens() can run jieba on Chinese text without errors
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1] # drop single-character tokens
    return(tokens)
  })
}
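# A quick illustration of the wrapper (the exact segmentation depends on the
# loaded dictionaries, so the output below is only indicative):
# chi_tokenizer("高雄輕軌大順路段試營運")
# might return list(c("高雄", "輕軌", "大順", "路段", "試營運"))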
Url_sentences$sentences <-  as.character(Url_sentences$sentences)

# Tokenize, drop tokens containing Latin letters, and count each word per article
Url_words <- Url_sentences %>%
  unnest_tokens(word, sentences, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
Url_words %>% head()
## # A tibble: 6 x 3
##   artUrl                                                   word      n
##   <fct>                                                    <chr> <int>
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1476951380.A.CAE.html 北捷     73
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1479785177.A.AD5.html 輕軌     48
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1424971519.A.7C3.html 公車     40
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1493116619.A.BA3.html 捷運     38
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1487346948.A.A0C.html 捷運     36
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1529239455.A.27E.html 輕軌     36
  • Compute tf-idf
Url_words_tf_idf <- Url_words %>%
  bind_tf_idf(word, artUrl, n)
Url_words_tf_idf %>% head()
## # A tibble: 6 x 6
##   artUrl                                    word      n     tf   idf tf_idf
##   <fct>                                     <chr> <int>  <dbl> <dbl>  <dbl>
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.14769~ 北捷     73 0.142  3.59  0.510 
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.14797~ 輕軌     48 0.0458 0.688 0.0315
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.14249~ 公車     40 0.0939 1.10  0.103 
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.14931~ 捷運     38 0.0365 0.570 0.0208
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.14873~ 捷運     36 0.0297 0.570 0.0169
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.15292~ 輕軌     36 0.0310 0.688 0.0213
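Here tf is the share of an article's tokens accounted for by the term, idf = ln(N / number of articles containing the term), and tf_idf = tf × idf; in the first row above, 0.142 × 3.59 ≈ 0.510.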
# For each article, take the ten terms with the highest tf-idf,
# then count how often each term is selected
Url_words_tf_idf%>% 
  group_by(artUrl) %>%
  top_n(10,wt=tf_idf) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort=TRUE)
## # A tibble: 8,580 x 2
##    word      n
##    <chr> <int>
##  1 文章     54
##  2 公車     39
##  3 黃線     32
##  4 司機     29
##  5 列車     22
##  6 大順     20
##  7 測試     19
##  8 輕軌     18
##  9 分鐘     17
## 10 刷卡     17
## # ... with 8,570 more rows

• Articles: Sentiment Analysis

  • bigram
# Build a fresh tokenizer (stop words are filtered later, after the bigrams are built)
jieba_tokenizer = worker()
# Register the user dictionary and the negation words,
# so negations are kept as single tokens when re-segmenting
new_user_word(jieba_tokenizer, c(user_dict,negation_words))
## [1] TRUE
# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
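# What NLP::ngrams() does, on a tiny illustrative vector:
# ngrams(c("一卡通", "免費", "搭"), 2) yields list(c("一卡通","免費"), c("免費","搭")),
# and paste(..., collapse = " ") then turns each pair into "一卡通 免費", "免費 搭"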
# Run the bigram tokenizer
Url_sentences_bigram <- Url_sentences %>%
  unnest_tokens(bigram, sentences, token = jieba_bigram)
Url_sentences_bigram %>% head()
##                                                       artUrl       bigram
## 1   https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html  from 一卡通
## 1.1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 一卡通 ipass
## 1.2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html ipass 粉絲團
## 4   https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html  一卡通 免費
## 4.1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html      免費 搭
## 4.2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html      搭 市區
# Drop bigrams that contain Latin letters or digits,
# then split each bigram into word1 and word2
bigrams_separated <- Url_sentences_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only bigrams whose word2 is a sentiment word
Url_sentences_sentiment_bigrams <- bigrams_separated %>%
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  filter(word2 %in% sentiment$Word)

# Add new_score: flip the valence when word1 is a negation word, then tag each bigram as positive or negative
non_comment_sentiment_bigrams <- Url_sentences_sentiment_bigrams %>% inner_join(sentiment,by=c(word2="Word")) %>%
  mutate(new_score=ifelse(word1 %in% negation_words,-1*Valence_Mean,Valence_Mean),
         sentiment_tag=ifelse(new_score>=0, "positive", "negative"))
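A minimal sketch of the negation flip, using made-up bigrams (the words and valence values here are illustrative, not taken from the lexicon):

demo <- dplyr::tibble(word1 = c("很", "不"), word2 = c("方便", "方便"),
                      Valence_Mean = c(1.8, 1.8))
demo %>%
  mutate(new_score = ifelse(word1 %in% negation_words, -1 * Valence_Mean, Valence_Mean))
# "不" is in negation_words, so its row flips to new_score = -1.8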

Sentiment trend over time (valence)

# Attach each article's month to non_comment_sentiment_bigrams
test <- non_comment_Kaohsiung %>%
  select(url, time, find_word) %>%
  mutate(month = floor_date(time, unit = "month")) %>%
  select(-time)

non_comment_sentiment_bigrams <- non_comment_sentiment_bigrams %>% left_join(test,by=c("artUrl"="url"))
non_comment_sentiment_bigrams %>% head()
##                                                     artUrl  word1 word2
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 一卡通  免費
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html     非  心情
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html     板  無關
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html   網誌    好
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html   有人  認真
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html   認真  分享
##   Valence_Mean Arousal_Mean new_score sentiment_tag find_word      month
## 1          1.2         -0.2       1.2      positive      公車 2014-02-01
## 2          0.0         -1.4       0.0      positive      公車 2014-02-01
## 3         -0.4         -1.4      -0.4      negative      公車 2014-02-01
## 4          1.8          0.2       1.8      positive      公車 2014-03-01
## 5          2.0          0.0       2.0      positive      公車 2014-03-01
## 6          1.8         -0.4       1.8      positive      公車 2014-03-01
# Sum scores per article, then average the article sums per month and transport mode
non_comment_sentiment_bigrams <- non_comment_sentiment_bigrams %>%
  group_by(artUrl) %>% 
  summarise(Sum_new=sum(new_score),Sum=sum(Valence_Mean)) %>% 
  ungroup() %>%left_join(test,by=c("artUrl"="url")) %>% 
  group_by(month,find_word) %>% 
  summarise(month_mean=mean(Sum),month_mean_new=mean(Sum_new))

non_comment_sentiment_bigrams %>% 
  ggplot() + 
  geom_line(aes(month, month_mean_new), color = "red",size=1) + # negation-adjusted (red)
  geom_line(aes(month, month_mean),size=1) + # unadjusted (black)
  labs(x="月份", y="情緒分數")+
  facet_wrap(~find_word, ncol = 1, scales = "free_y")

Sentiment trend over time (arousal)

# Same pipeline as above, but scoring arousal (intensity) instead of valence

# Add new_score: flip the arousal when word1 is a negation word
non_comment_Arousal_bigrams <- Url_sentences_sentiment_bigrams %>%
  inner_join(sentiment, by = c(word2 = "Word")) %>%
  mutate(new_score = ifelse(word1 %in% negation_words, -1 * Arousal_Mean, Arousal_Mean))

# Attach each article's month to non_comment_Arousal_bigrams
test <- non_comment_Kaohsiung %>%
  select(url, time, find_word) %>%
  mutate(month = floor_date(time, unit = "month")) %>%
  select(-time)

non_comment_Arousal_bigrams <- non_comment_Arousal_bigrams %>% left_join(test,by=c("artUrl"="url"))
non_comment_Arousal_bigrams %>% head()
##                                                     artUrl  word1 word2
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 一卡通  免費
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html     非  心情
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html     板  無關
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html   網誌    好
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html   有人  認真
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html   認真  分享
##   Valence_Mean Arousal_Mean new_score find_word      month
## 1          1.2         -0.2      -0.2      公車 2014-02-01
## 2          0.0         -1.4       1.4      公車 2014-02-01
## 3         -0.4         -1.4      -1.4      公車 2014-02-01
## 4          1.8          0.2       0.2      公車 2014-03-01
## 5          2.0          0.0       0.0      公車 2014-03-01
## 6          1.8         -0.4      -0.4      公車 2014-03-01
# Sum scores per article, then average the article sums per month and transport mode
non_comment_Arousal_bigrams <- non_comment_Arousal_bigrams %>%
  group_by(artUrl) %>% 
  summarise(Sum_new=sum(new_score),Sum=sum(Arousal_Mean)) %>% 
  ungroup() %>%left_join(test,by=c("artUrl"="url")) %>% 
  group_by(month,find_word) %>% 
  summarise(month_mean=mean(Sum),month_mean_new=mean(Sum_new))

non_comment_Arousal_bigrams %>% 
  ggplot() + 
  geom_line(aes(month, month_mean_new), color = "red",size=1) + # negation-adjusted (red)
  geom_line(aes(month, month_mean),size=1) + # unadjusted (black)
  labs(x="月份", y="情緒分數")+
  facet_wrap(~find_word, ncol = 1, scales = "free_y")

• Articles: Topic Analysis

  • Install required packages
packages = c("servr", "tm", "data.table","stringr", "ggplot2", "topicmodels", "LDAvis", "webshot", "htmlwidgets")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
  • Load packages
library(servr)
library(tm)
library(data.table)
library(stringr)
library(ggplot2)
library(topicmodels)
library(LDAvis)
library(webshot)
library(htmlwidgets)
Url_words %>% head()
## # A tibble: 6 x 3
##   artUrl                                                   word      n
##   <fct>                                                    <chr> <int>
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1476951380.A.CAE.html 北捷     73
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1479785177.A.AD5.html 輕軌     48
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1424971519.A.7C3.html 公車     40
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1493116619.A.BA3.html 捷運     38
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1487346948.A.A0C.html 捷運     36
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1529239455.A.27E.html 輕軌     36
# Assign a numeric id to each article based on its URL
Url_words_2 <- Url_words %>%
  mutate(artId = group_indices(., artUrl))
Url_words_2 %>% head()
## # A tibble: 6 x 4
##   artUrl                                                  word      n artId
##   <fct>                                                   <chr> <int> <int>
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1476951380.A.CAE.ht~ 北捷     73   378
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1479785177.A.AD5.ht~ 輕軌     48   395
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1424971519.A.7C3.ht~ 公車     40    95
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1493116619.A.BA3.ht~ 捷運     38   498
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1487346948.A.A0C.ht~ 捷運     36   455
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1529239455.A.27E.ht~ 輕軌     36   879
  • Convert the data to a Document Term Matrix (DTM)
Url_words_dtm <-Url_words_2 %>% cast_dtm(artId, word, n)
inspect(Url_words_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 51/49
## Sparsity           : 49%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## Sample             :
##      Terms
## Docs  00 30 大順 公車 北捷 高捷 高雄 捷運 黃線 輕軌
##   378  0  2    0    0   73   23    2    3    0    0
##   395  0  1    2    2    3    1   16   11    0   48
##   455  0  0    0   18    0    0   27   36    0   12
##   498  0  0    0    2    0    0   13   38   14    0
##   633  1  0    0    1    0    1   12   29    2    4
##   765  0  2    0   31    0    0    7   17    0    6
##   879  2  0    6    0    2    0   16   22   14   36
##   915 36 24    0    0    0    0   11    5    0    0
##   95   0  0    0   40    0    0    0    8    0    0
##   982  0  0    2    8    0    0    2    4    0   32
  • Fit the LDA model
Url_words_lda <- LDA(Url_words_dtm, k = 2, control = list(seed = 1234)) # two topics; fixed seed for reproducibility
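k = 2 is a modelling choice, not a given. One hedged way to compare candidate values is to refit for several k and inspect the log-likelihood (this sketch assumes refitting on this DTM is affordable; higher, i.e. less negative, is better, all else equal):

ks <- c(2, 3, 4, 5)
# Refit the model for each candidate k and record the log-likelihood
lls <- sapply(ks, function(k) {
  as.numeric(logLik(LDA(Url_words_dtm, k = k, control = list(seed = 1234))))
})
data.frame(k = ks, logLik = lls)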

ϕ matrix (topic * term)

# Each row gives the probability that a given topic generates a given term
Url_words_topics <- tidy(Url_words_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
Url_words_topics
## # A tibble: 40,744 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     1 北捷  0.000305
##  2     2 北捷  0.00165 
##  3     1 輕軌  0.0300  
##  4     2 輕軌  0.0110  
##  5     1 公車  0.00240 
##  6     2 公車  0.0200  
##  7     1 捷運  0.0140  
##  8     2 捷運  0.0179  
##  9     1 00    0.000969
## 10     2 00    0.000389
## # ... with 40,734 more rows
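As a sanity check, each topic's beta values form a probability distribution over the vocabulary and should sum to 1:

Url_words_topics %>%
  group_by(topic) %>%
  summarise(total_beta = sum(beta)) # both rows should show (approximately) 1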
  • Find representative terms for each topic
Url_words_top_terms <- Url_words_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Plot the ten terms with the highest generation probability within each topic
Url_words_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

+ Remove frequent terms that are shared across topics.

remove_words <- c("公車", "捷運","輕軌","高雄","高雄市","政府")
Url_words_top_terms <- Url_words_topics %>%
  filter(! term %in% remove_words) %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


Url_words_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

  • Terms that differ most between the two topics
# Find terms that appear often in one topic but rarely in the other,
# keeping terms with beta above 0.0004 in at least one topic
beta_spread <- Url_words_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .0004 | topic2 > .0004 ) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread
## # A tibble: 619 x 4
##    term    topic1   topic2 log_ratio
##    <chr>    <dbl>    <dbl>     <dbl>
##  1 00    9.69e- 4 0.000389    -1.32 
##  2 07    4.09e- 4 0.000239    -0.773
##  3 1-2   5.79e- 5 0.000840     3.86 
##  4 1.    8.14e- 4 0.000447    -0.864
##  5 10    1.50e- 3 0.00359      1.26 
##  6 100   1.79e- 4 0.00105      2.56 
##  7 104   1.61e-11 0.000537    25.0  
##  8 105   1.61e- 4 0.000891     2.47 
##  9 106   4.00e- 5 0.000408     3.35 
## 10 11    8.01e- 4 0.00139      0.792
## # ... with 609 more rows
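Since log_ratio = log2(topic2 / topic1), a value of +1 means a term is twice as likely under topic 2, and -1 twice as likely under topic 1; for the term 10 above, log2(0.00359 / 0.0015) ≈ 1.26.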
# Take the ten terms with the highest and the ten with the lowest log_ratio
Url_words_topic_ratio <- rbind(beta_spread %>% top_n(10,wt = log_ratio),
                              beta_spread %>% top_n(-10, log_ratio)) %>%
  arrange(log_ratio)
Url_words_topic_ratio
## # A tibble: 20 x 4
##    term     topic1   topic2 log_ratio
##    <chr>     <dbl>    <dbl>     <dbl>
##  1 住戶   4.42e- 4 2.46e-47    -144. 
##  2 統包   6.18e- 4 9.01e-22     -59.3
##  3 二期   7.65e- 4 1.13e-20     -55.9
##  4 廠商   5.30e- 4 2.95e-20     -54.0
##  5 黑暗   4.12e- 4 1.51e-18     -48.0
##  6 大順   3.93e- 3 5.03e-15     -39.5
##  7 停工   6.18e- 4 1.08e-15     -39.1
##  8 車流   6.48e- 4 1.18e-15     -39.0
##  9 西班牙 4.42e- 4 4.61e-14     -33.2
## 10 抗議   8.54e- 4 2.69e-13     -31.6
## 11 車次   1.07e-17 5.54e- 4      45.6
## 12 候車   4.12e-20 6.38e- 4      53.8
## 13 電子   1.94e-21 6.21e- 4      58.1
## 14 票證   1.00e-21 5.71e- 4      59.0
## 15 報名   8.02e-22 5.54e- 4      59.3
## 16 里程   3.47e-27 4.20e- 4      76.7
## 17 悠遊   1.19e-28 8.73e- 4      82.6
## 18 車資   3.65e-35 4.20e- 4     103. 
## 19 享有   1.68e-47 4.70e- 4     144. 
## 20 套票   6.12e-84 4.36e- 4     265.
Url_words_topic_ratio  %>%
  ggplot(aes(x = reorder(term, log_ratio),
             y = log_ratio)) +
  geom_bar(stat="identity") +
  xlab("Word")+
  coord_flip()

θ matrix (document * topic)

# θ matrix (document * topic)
Url_words_documents <- tidy(Url_words_lda, matrix="gamma") # use matrix = "gamma" in tidy() to extract the theta matrix
Url_words_documents
## # A tibble: 2,030 x 3
##    document topic     gamma
##    <chr>    <int>     <dbl>
##  1 378          1 0.000182 
##  2 395          1 1.000    
##  3 95           1 0.000220 
##  4 498          1 1.000    
##  5 455          1 0.0000774
##  6 879          1 1.000    
##  7 915          1 1.000    
##  8 982          1 0.927    
##  9 765          1 0.513    
## 10 633          1 0.504    
## # ... with 2,020 more rows
# gamma is the share of a document's words generated by each topic
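Analogously to beta, each document's gamma values should sum to 1; a quick check:

Url_words_documents %>%
  group_by(document) %>%
  summarise(total_gamma = sum(gamma)) %>%
  head() # every total should be (approximately) 1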

A simple topic classification

Url_words_2$artId <- as.character(Url_words_2$artId)

Url_words_documents%>%
  group_by(document) %>%
  top_n(1,gamma) %>%
  arrange(topic) %>%
  inner_join(Url_words_2 %>% distinct(artUrl,artId), by=c("document" = "artId")) %>%
  inner_join(non_comment_Kaohsiung, by=c("artUrl"= "url")) %>%
  select(topic,text)
## # A tibble: 1,015 x 3
## # Groups:   document [1,015]
##    document topic text                                                     
##    <chr>    <int> <chr>                                                    
##  1 395          1 "身為高雄人,也因為跟這位叔叔有點淵源\n\n\n他做了這麼多事情而離世了,我相信他最希望的也不是R.I.P這句~
##  2 498          1 "基本上捷運在台北和其他縣市觀感可能差很大.\n\n\n所以經歷過台北捷運黑暗期的歷史.\n那時候黑暗期,本來非~
##  3 879          1 "\n高雄交通體檢】輕軌二階段爭議大 捷運黃線待籌6百億\n出版時間:2018/06/17 11:00\n\n\~
##  4 915          1 "\n標題: [食記] 高雄市 捷運巨蛋站美食餐廳懶人包\n時間: Wed Aug  8 11:01:04 20~
##  5 982          1 "\n鐵路地下化 輕軌地上跑\n但輕軌與火車本質上是很不一樣的東西\n\n平交道遮斷路口常常長達兩三分鐘以上\n~
##  6 765          1 "看到公車我就來氣\n我是不知道其他路線是怎麼樣\n應該也好不到哪裡去\n\n通常一出捷運站走到公車站牌\n要等~
##  7 633          1 "之前有一位公眾人物因為在美麗島站\n拍了一張照片引起了很多意見\n的確造成這樣別人這樣的印像也是因為高雄現在捷~
##  8 808          1 "\n高雄輕軌入市中心衝擊交通?捷運局:依序、間隔施工\n \n\n高雄輕軌第一階段完工通車,第二階段目前已施工~
##  9 992          1 "黃2C路線及時刻表資訊:漢程客運→\n  \n高雄市公車動態資訊→黃1  \n\n高雄捷運黃線路線圖:高雄市政~
## 10 831          1 "這幾天都在肝FGO\n不過還是說一下我的拙見吧\n在下也不太喜歡平面輕軌\n應該說 不管是高架還是平面 在下都~
## # ... with 1,005 more rows

Url_words_documents%>%
  group_by(topic) %>%
  top_n(10, wt=gamma) %>%
  inner_join(Url_words_2, by = c("document" = "artId")) %>%
  distinct(artUrl) %>%
  inner_join(non_comment_Kaohsiung, by =c("artUrl"="url")) %>%
  select(topic, title)
## Warning: Column `artUrl`/`url` joining factor and character vector,
## coercing into character vector
## # A tibble: 20 x 2
## # Groups:   topic [2]
##    topic title                                           
##    <int> <chr>                                           
##  1     1 Re: [新聞] 高雄輕軌頭號粉絲辭世 兒子:人文輕軌是
##  2     1 Re: [新聞] 高雄捷運黃線拉抬 這3區住戶如中樂透  
##  3     1 [新聞] 輕軌二階段爭議大  捷運黃線待籌6百億      
##  4     1 Fw: [食記] 高雄市 捷運巨蛋站美食餐廳懶人包      
##  5     1 Re: [新聞] 民眾抗議輕軌衝擊生活交通 高市捷運局加
##  6     1 Fw: [新聞] 李克聰:「不得已」的輕軌 該喊停了    
##  7     1 Re: [新聞] 砸千億!高雄終於要有第3條捷運「黃線」
##  8     1 [遊記] 高雄捷運黃線先導公車-黃2C路線考察        
##  9     1 [新聞] 民眾抗議輕軌衝擊生活交通 高市捷運局加    
## 10     1 Re: [新聞] 民眾抗議輕軌衝擊生活交通 高市捷運局加
## 11     2 [轉錄] 有沒有第二強和最弱的捷運站的八卦?        
## 12     2 [新聞] 3月1日起「捷運、公車雙向轉乘」公車半     
## 13     2 Fw: [分享] 高雄捷運+輕軌+公車心得               
## 14     2 Fw: [情報] 黃1捷運先導公車12月1日上路           
## 15     2 Re: [新聞] 高雄輕軌營運年花上億 每天載不到3千人
## 16     2 [遊記] 高雄輕軌 試乘活動心得,報名方式及注意    
## 17     2 [閒聊] 捷運真的比公車舒適便利嗎?                
## 18     2 Fw: [討論] 高雄打算規畫捷運公車吃到飽套票       
## 19     2 Fw: [情報] 107年4月北高捷運各站進出運量         
## 20     2 [遊記]  高雄輕軌 免費試乘心得 & 預約說明
  • Split the articles between the two topics by the θ matrix's gamma values
Url_words_documents_spread <- Url_words_documents %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, gamma)

half_num = round(nrow(Url_words %>% distinct(artUrl))/2) # half the number of articles

topic1_id <- Url_words_documents_spread %>%   # the half_num articles with the highest topic1 share
  top_n(half_num, topic1) %>%
  select(document) %>%
  unlist()

topic2_id <- Url_words_documents_spread$document %>%
  setdiff(topic1_id)
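Because topic2_id is defined as the set difference, the two id sets partition the articles; a one-line check:

length(topic1_id) + length(topic2_id) == nrow(Url_words_documents_spread) # should be TRUE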
  • Topic 1
#install.packages("wordcloud2")
require(wordcloud2)
## Loading required package: wordcloud2
word_cloud_topic_1 <- Url_words_2 %>%
  filter(artId %in% topic1_id) %>%
  group_by(word) %>%
  summarise(sum = sum(n)) %>%
  arrange(desc(sum)) %>%
  wordcloud2()
word_cloud_topic_1 
# saveWidget(word_cloud_topic_1, "word_cloud_topic_1.html", selfcontained = F)
# img <- webshot("word_cloud_topic_1.html", "wc1.png", delay=5)
  • Topic 2
word_cloud_topic_2 <- Url_words_2 %>%
  filter(artId %in% topic2_id  ) %>%
  group_by(word) %>%
  summarise(sum = sum(n)) %>%
  arrange(desc(sum)) %>%
  wordcloud2()
word_cloud_topic_2
# saveWidget(word_cloud_topic_2, "word_cloud_topic_2.html", selfcontained = F)
# img <- webshot("word_cloud_topic_2.html", "wc2.png", delay=5)

• Comments: Data Overview

  • Load the data
# Comment data
comment_Kaohsiung <- read_csv("comment_Kaohsiung.csv")

# Show the first few rows
kable(comment_Kaohsiung %>% head()) %>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "300px")
| url | commet_num | text | comment_type | ip | author | title | time | board_content | board_content_type | comment_author | comment_ip | comment_time |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77853 | 這樣不會狂虧損嗎? 雖然開心免費這件事 | | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | Alex1103 | NA | 2014-02-25 08:03:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77854 | 到底是要免費多久啊=口= | | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | blankhole | NA | 2014-02-25 07:13:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77855 | 推阿~~希望公車能再進步些 | | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | cy2013 | NA | 2014-02-25 12:19:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77856 | 讚! | | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | daisukidayo | NA | 2014-02-26 10:59:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77857 | 最近看到越來越多人拿一卡通道OK小額消費了好現象 | | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | dreamcoast | NA | 2014-02-25 15:47:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77858 | 大推! 繼續搭公車去旅行~ | | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | familia | NA | 2014-03-01 01:55:00 |
  • Comments: Sentiment Analysis
# Sentiment analysis based on comment_type: 推 (upvote) scores +1, anything else -1
comment_type_url <- comment_Kaohsiung %>%
  select(url, comment_type, time) %>%
  mutate(month = floor_date(time, unit = "month"),
         score = ifelse(comment_type == "推", 1, -1)) %>%
  select(-time)
# Keep only 推 and 噓 (downvotes); drop neutral → replies
comment_type_url <- comment_type_url %>% filter(comment_type!="→")
# Pull find_word over from non_comment_Kaohsiung
test <- non_comment_Kaohsiung %>% select(url,find_word)
comment_type_url <- comment_type_url %>% left_join(test,by="url")

# Sum the scores within each article, then average the article sums per month
comment_type_url_2 <- comment_type_url %>%
  group_by(url,find_word) %>% 
  summarise(Sum=sum(score)) %>% 
  ungroup() %>%
  left_join(comment_type_url %>% select(url, month), by = "url") %>% 
  group_by(month,find_word) %>% 
  summarise(month_mean=mean(Sum))
comment_type_url_2%>% head(15)
## # A tibble: 15 x 3
## # Groups:   month [9]
##    month               find_word month_mean
##    <dttm>              <chr>          <dbl>
##  1 2014-02-01 00:00:00 公車          25    
##  2 2014-03-01 00:00:00 公車           4    
##  3 2014-04-01 00:00:00 公車           2.75 
##  4 2014-05-01 00:00:00 公車           2    
##  5 2014-06-01 00:00:00 公車           1    
##  6 2014-08-01 00:00:00 公車           8    
##  7 2014-09-01 00:00:00 公車          20.7  
##  8 2014-09-01 00:00:00 捷運          10    
##  9 2014-09-01 00:00:00 輕軌          10    
## 10 2014-10-01 00:00:00 公車          -0.143
## 11 2014-10-01 00:00:00 捷運         -46.6  
## 12 2014-10-01 00:00:00 輕軌          10.4  
## 13 2014-11-01 00:00:00 公車           7.70 
## 14 2014-11-01 00:00:00 捷運          11.7  
## 15 2014-11-01 00:00:00 輕軌          12.1
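For example, an article with 30 推 and 10 噓 comments gets Sum = 30 - 10 = 20; month_mean then averages these per-article sums within each month and transport mode.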
# Plot the monthly means
comment_type_url_2 %>% 
  ggplot() + 
  geom_line(aes(month, month_mean),size=1) + 
  labs(x="月份", y="情緒分數")+
  facet_wrap(~find_word, ncol = 1, scales = "free_y")