packages = c("kableExtra","lubridate","ggplot2","ggplot2","readr","dplyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(knitr)
library(kableExtra)
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
non_comment_Kaohsiung <- read_csv("non_comment_Kaohsiung.csv")
non_comment_Kaohsiung %>% head()
## # A tibble: 6 x 12
## url text ip author title time board_content
## <chr> <chr> <chr> <chr> <chr> <dttm> <chr>
## 1 http~ "fro~ 180.~ scott~ [閒聊]~ 2014-02-25 05:47:15 Kaohsiung
## 2 http~ "網誌好~ 36.2~ wildp~ [遊記]~ 2014-03-04 10:13:59 Kaohsiung
## 3 http~ "網頁好~ 36.2~ wildp~ [遊記]~ 2014-03-12 09:18:42 Kaohsiung
## 4 http~ "網頁好~ <NA> wildp~ [遊記]~ 2014-04-11 09:49:36 Kaohsiung
## 5 http~ "網頁好~ 36.2~ wildp~ [遊記]~ 2014-04-13 14:08:50 Kaohsiung
## 6 http~ "網頁好~ 36.2~ wildp~ [遊記]~ 2014-04-15 09:03:15 Kaohsiung
## # ... with 5 more variables: find_word <chr>, board_content_type <chr>,
## # comment_num <dbl>, imp_word_count <dbl>, route_imp_word_count <dbl>
# Add a month variable
month_and_url <- non_comment_Kaohsiung %>%
  select(find_word, url, time) %>%
  mutate(month = floor_date(time, unit = "month")) %>%
  group_by(month, find_word) %>%
  summarise(counts = n())
month_and_url %>%
  ggplot() +
  geom_bar(aes(x = month, y = counts,
               fill = factor(find_word)),
           stat = "identity") +
  ggtitle("討論文章數") +
  xlab("日期") + # x-axis label
  ylab("數量")   # y-axis label
+ Share of discussion for each public transit mode
pie_data <- month_and_url %>%
  group_by(find_word) %>%
  summarise(total = sum(counts))
pie_data <- pie_data %>% mutate(freq = total / sum(total))
pie_data %>% head()
## # A tibble: 4 x 3
## find_word total freq
## <chr> <int> <dbl>
## 1 bike 30 0.0295
## 2 公車 261 0.257
## 3 捷運 304 0.299
## 4 輕軌 421 0.414
label <- paste0(round(pie_data$freq * 100, digits = 1), "%")
# Draw the pie chart
pie <- ggplot(pie_data, aes(x = "", y = freq, fill = find_word)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) + # rotate the stacked bar around y into a pie
  geom_text(aes(x = 1, y = c(0.99, 0.8, 0.6, 0.25), label = label))
# y maps to the angle; the label positions are hard-coded to the slice centers
pie
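# A minimal alternative sketch (assuming the rows of pie_data are ordered the
# same way the slices are stacked): derive the label positions from the
# cumulative frequencies instead of hard-coding them.
label_pos <- 1 - (cumsum(pie_data$freq) - pie_data$freq / 2)
ggplot(pie_data, aes(x = "", y = freq, fill = find_word)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  geom_text(aes(x = 1, y = label_pos, label = label))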
##• Sentence splitting and word segmentation
packages = c("stringr","jiebaR","tidytext","NLP","ggraph","igraph","scales","reshape2","widyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
library(stringr)
library(jiebaR)
library(tidytext)
library(tidyr)
library(NLP)
library(ggplot2)
library(ggraph)
library(igraph)
library(scales)
library(reshape2)
library(widyr)
+ Dictionaries, stop words, …
# Sentiment lexicon: Valence (1-9) runs from negative to positive; Arousal (1-9) measures how intense the word is
sentiment <- read_csv("Sentiment.csv")
## Parsed with column specification:
## cols(
## No. = col_double(),
## Word = col_character(),
## Valence_Mean = col_double(),
## Valence_SD = col_double(),
## Arousal_Mean = col_double(),
## Arousal_SD = col_double(),
## Frequency = col_double()
## )
# Center both 1-9 scales at 0, so the range becomes -4 to +4
sentiment$Valence_Mean <- sentiment$Valence_Mean - 5
sentiment$Arousal_Mean <- sentiment$Arousal_Mean - 5
# Keep only Word, Valence_Mean and Arousal_Mean
sentiment <- sentiment[, c(2, 3, 5)]
sentiment %>% summary
## Word Valence_Mean Arousal_Mean
## Length:3552 Min. :-4.0000 Min. :-3.60000
## Class :character 1st Qu.:-1.6000 1st Qu.:-1.00000
## Mode :character Median :-0.5000 Median : 0.00000
## Mean :-0.3136 Mean : 0.09721
## 3rd Qu.: 1.2000 3rd Qu.: 1.00000
## Max. : 3.6000 Max. : 3.80000
# Reading the stop-word file fails with an encoding warning (still to be fixed),
# so the stop words are defined manually below
stop_words <- scan(file = "stop_words.txt", what = character(), sep = '\n',
                   encoding = 'utf-8', fileEncoding = 'utf-8')
## Warning in scan(file = "stop_words.txt", what = character(), sep = "\n", :
## invalid input found on input connection 'stop_words.txt'
stop_words <- c("人民","末","啊","我","我們","腳踏車","公車","輕軌","捷運")
negation_words <- scan(file = "negation_words.txt", what = character(), sep = ',',
                       encoding = 'utf-8', fileEncoding = 'utf-8')
# user_dict fails to load the same way (still to be fixed), so it is defined manually below
user_dict <- scan(file = "user_dict.txt", what = character(), sep = '\n',
                  encoding = 'utf-8', fileEncoding = 'utf-8')
## Warning in scan(file = "user_dict.txt", what = character(), sep = "\n", :
## invalid input found on input connection 'user_dict.txt'
user_dict <- c("韓國瑜","陳菊","高雄市政府","高市府","大順路","試營運")
negation_words %>% head()
## [1] "不" "未" "未必" "毫不" "決不" "沒有"
# Split sentences on full- or half-width exclamation marks, question marks and
# semicolons, plus the full-width period
sentences <- strsplit(non_comment_Kaohsiung$text, "[。!;?!?;]")
# Split on \n instead; note this overwrites the punctuation-based split above,
# so only the newline split is used downstream.
# The result is a list of vectors, one vector of sentences per article.
sentences <- strsplit(non_comment_Kaohsiung$text, "\n")
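# Toy illustration of the punctuation-based split (not from the data):
strsplit("班次很少。等很久!真的假的?", "[。!;?!?;]")
# returns list(c("班次很少", "等很久", "真的假的"))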
## Pair each sentence with its article url
# Number of sentences per article
sentences_num <- lengths(sentences, use.names = F)
# unlist() flattens the list of vectors into a single vector;
# rep() repeats each url once per sentence so the two columns line up
Url_sentences <- data.frame(artUrl = rep(non_comment_Kaohsiung$url, sentences_num),
                            sentences = unlist(sentences))
Url_sentences %>% head()
## artUrl
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html
## sentences
## 1 from 一卡通 ipass粉絲團
## 2
## 3
## 4 一卡通免費搭市區公車延長到6月底
## 5
## 6
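# Toy check of the rep()/unlist() alignment used above (hypothetical urls):
toy <- list(c("s1", "s2"), "s3")
data.frame(artUrl = rep(c("u1", "u2"), lengths(toy)), sentences = unlist(toy))
# u1 appears twice and u2 once, pairing each url with its own sentences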
# Initialize the tokenizer
jieba_tokenizer = worker()
# Re-initialize it with the dictionary files
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")
# Wrapper so unnest_tokens can tokenize Chinese text with jieba without errors
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # drop single-character tokens
    tokens <- tokens[nchar(tokens) > 1]
    return(tokens)
  })
}
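# Quick sanity check of the wrapper (output depends on the loaded dictionaries):
chi_tokenizer("高雄輕軌試營運")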
Url_sentences$sentences <- as.character(Url_sentences$sentences)
# Tokenize, then count how often each word appears in each article;
# tokens containing Latin letters are dropped
Url_words <- Url_sentences %>%
  unnest_tokens(word, sentences, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
Url_words %>% head()
## # A tibble: 6 x 3
## artUrl word n
## <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1476951380.A.CAE.html 北捷 73
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1479785177.A.AD5.html 輕軌 48
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1424971519.A.7C3.html 公車 40
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1493116619.A.BA3.html 捷運 38
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1487346948.A.A0C.html 捷運 36
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1529239455.A.27E.html 輕軌 36
Url_words_tf_idf <- Url_words %>%
bind_tf_idf(word, artUrl, n)
Url_words_tf_idf %>% head()
## # A tibble: 6 x 6
## artUrl word n tf idf tf_idf
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.14769~ 北捷 73 0.142 3.59 0.510
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.14797~ 輕軌 48 0.0458 0.688 0.0315
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.14249~ 公車 40 0.0939 1.10 0.103
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.14931~ 捷運 38 0.0365 0.570 0.0208
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.14873~ 捷運 36 0.0297 0.570 0.0169
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.15292~ 輕軌 36 0.0310 0.688 0.0213
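# Recap of the columns above: tf = n / total tokens in the article; idf =
# ln(number of articles / number of articles containing the word); tf_idf =
# tf * idf, e.g. row 1: 0.142 * 3.59 ≈ 0.510.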
# For each article, take the ten words with the highest tf-idf,
# then count how often each word is selected
Url_words_tf_idf %>%
  group_by(artUrl) %>%
  top_n(10, wt = tf_idf) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort = TRUE)
## # A tibble: 8,580 x 2
## word n
## <chr> <int>
## 1 文章 54
## 2 公車 39
## 3 黃線 32
## 4 司機 29
## 5 列車 22
## 6 大順 20
## 7 測試 19
## 8 輕軌 18
## 9 分鐘 17
## 10 刷卡 17
## # ... with 8,570 more rows
# Re-initialize the tokenizer without the stop-word list,
# since the negation words must be kept for the bigram analysis
jieba_tokenizer = worker()
# Re-segment with the new dictionary,
# adding the negation words to it as well
new_user_word(jieba_tokenizer, c(user_dict,negation_words))
## [1] TRUE
# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    # strings of length <= 1 yield NULL and are dropped by unnest_tokens
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      bigram <- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
# Run the bigram tokenizer
Url_sentences_bigram <- Url_sentences %>%
unnest_tokens(bigram, sentences, token = jieba_bigram)
Url_sentences_bigram %>% head()
## artUrl bigram
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html from 一卡通
## 1.1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 一卡通 ipass
## 1.2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html ipass 粉絲團
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 一卡通 免費
## 4.1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 免費 搭
## 4.2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 搭 市區
# Split each bigram into word1 and word2,
# dropping bigrams that contain Latin letters or digits
bigrams_separated <- Url_sentences_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Drop stop words and keep only bigrams whose word2 is a sentiment word
Url_sentences_sentiment_bigrams <- bigrams_separated %>%
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  filter(word2 %in% sentiment$Word)
# Add new_score: flip the valence when word1 is a negation word
non_comment_sentiment_bigrams <- Url_sentences_sentiment_bigrams %>%
  inner_join(sentiment, by = c(word2 = "Word")) %>%
  mutate(new_score = ifelse(word1 %in% negation_words, -1 * Valence_Mean, Valence_Mean),
         sentiment_tag = ifelse(new_score >= 0, "positive", "negative"))
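# Worked example of the flip: in a bigram like ("不", "方便"), word1 is a
# negation word, so new_score = -Valence_Mean of "方便" (assuming "方便" is in
# the lexicon); a positive word preceded by a negation thus counts as negative.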
# Attach the posting month to non_comment_sentiment_bigrams
test <- non_comment_Kaohsiung %>%
  select(url, time, find_word) %>%
  mutate(month = floor_date(time, unit = "month")) %>%
  select(-time)
non_comment_sentiment_bigrams <- non_comment_sentiment_bigrams %>%
  left_join(test, by = c("artUrl" = "url"))
non_comment_sentiment_bigrams %>% head()
## artUrl word1 word2
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 一卡通 免費
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 非 心情
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 板 無關
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html 網誌 好
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html 有人 認真
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html 認真 分享
## Valence_Mean Arousal_Mean new_score sentiment_tag find_word month
## 1 1.2 -0.2 1.2 positive 公車 2014-02-01
## 2 0.0 -1.4 0.0 positive 公車 2014-02-01
## 3 -0.4 -1.4 -0.4 negative 公車 2014-02-01
## 4 1.8 0.2 1.8 positive 公車 2014-03-01
## 5 2.0 0.0 2.0 positive 公車 2014-03-01
## 6 1.8 -0.4 1.8 positive 公車 2014-03-01
non_comment_sentiment_bigrams <- non_comment_sentiment_bigrams %>%
  group_by(artUrl) %>%
  summarise(Sum_new = sum(new_score), Sum = sum(Valence_Mean)) %>%
  ungroup() %>%
  left_join(test, by = c("artUrl" = "url")) %>%
  group_by(month, find_word) %>%
  summarise(month_mean = mean(Sum), month_mean_new = mean(Sum_new))
non_comment_sentiment_bigrams %>%
  ggplot() +
  geom_line(aes(month, month_mean_new), color = "red", size = 1) +
  geom_line(aes(month, month_mean), size = 1) +
  labs(x = "月份", y = "情緒分數") +
  facet_wrap(~find_word, ncol = 1, scales = "free_y")
# Repeat the analysis with Arousal instead of Valence
# Add new_score: flip the arousal when word1 is a negation word
non_comment_Arousal_bigrams <- Url_sentences_sentiment_bigrams %>%
  inner_join(sentiment, by = c(word2 = "Word")) %>%
  mutate(new_score = ifelse(word1 %in% negation_words, -1 * Arousal_Mean, Arousal_Mean))
# Attach the posting month to non_comment_Arousal_bigrams
test <- non_comment_Kaohsiung %>%
  select(url, time, find_word) %>%
  mutate(month = floor_date(time, unit = "month")) %>%
  select(-time)
non_comment_Arousal_bigrams <- non_comment_Arousal_bigrams %>%
  left_join(test, by = c("artUrl" = "url"))
non_comment_Arousal_bigrams %>% head()
## artUrl word1 word2
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 一卡通 免費
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 非 心情
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html 板 無關
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html 網誌 好
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html 有人 認真
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1393928042.A.3B5.html 認真 分享
## Valence_Mean Arousal_Mean new_score find_word month
## 1 1.2 -0.2 -0.2 公車 2014-02-01
## 2 0.0 -1.4 1.4 公車 2014-02-01
## 3 -0.4 -1.4 -1.4 公車 2014-02-01
## 4 1.8 0.2 0.2 公車 2014-03-01
## 5 2.0 0.0 0.0 公車 2014-03-01
## 6 1.8 -0.4 -0.4 公車 2014-03-01
non_comment_Arousal_bigrams <- non_comment_Arousal_bigrams %>%
  group_by(artUrl) %>%
  summarise(Sum_new = sum(new_score), Sum = sum(Arousal_Mean)) %>%
  ungroup() %>%
  left_join(test, by = c("artUrl" = "url")) %>%
  group_by(month, find_word) %>%
  summarise(month_mean = mean(Sum), month_mean_new = mean(Sum_new))
non_comment_Arousal_bigrams %>%
  ggplot() +
  geom_line(aes(month, month_mean_new), color = "red", size = 1) +
  geom_line(aes(month, month_mean), size = 1) +
  labs(x = "月份", y = "情緒分數") +
  facet_wrap(~find_word, ncol = 1, scales = "free_y")
packages = c("tm", "data.table","stringr", "ggplot2", "topicmodels", "LDAvis", "webshot", "htmlwidgets")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(servr)
library(tm)
library(data.table)
library(stringr)
library(ggplot2)
library(topicmodels)
library(LDAvis)
library(webshot)
library(htmlwidgets)
Url_words %>% head()
## # A tibble: 6 x 3
## artUrl word n
## <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1476951380.A.CAE.html 北捷 73
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1479785177.A.AD5.html 輕軌 48
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1424971519.A.7C3.html 公車 40
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1493116619.A.BA3.html 捷運 38
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1487346948.A.A0C.html 捷運 36
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1529239455.A.27E.html 輕軌 36
# Assign an id to each article based on its url
Url_words_2 <- Url_words %>%
mutate(artId = group_indices(., artUrl))
Url_words_2 %>% head()
## # A tibble: 6 x 4
## artUrl word n artId
## <fct> <chr> <int> <int>
## 1 https://www.ptt.cc/bbs/Kaohsiung/M.1476951380.A.CAE.ht~ 北捷 73 378
## 2 https://www.ptt.cc/bbs/Kaohsiung/M.1479785177.A.AD5.ht~ 輕軌 48 395
## 3 https://www.ptt.cc/bbs/Kaohsiung/M.1424971519.A.7C3.ht~ 公車 40 95
## 4 https://www.ptt.cc/bbs/Kaohsiung/M.1493116619.A.BA3.ht~ 捷運 38 498
## 5 https://www.ptt.cc/bbs/Kaohsiung/M.1487346948.A.A0C.ht~ 捷運 36 455
## 6 https://www.ptt.cc/bbs/Kaohsiung/M.1529239455.A.27E.ht~ 輕軌 36 879
Url_words_dtm <- Url_words_2 %>% cast_dtm(artId, word, n)
inspect(Url_words_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 51/49
## Sparsity : 49%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 00 30 大順 公車 北捷 高捷 高雄 捷運 黃線 輕軌
## 378 0 2 0 0 73 23 2 3 0 0
## 395 0 1 2 2 3 1 16 11 0 48
## 455 0 0 0 18 0 0 27 36 0 12
## 498 0 0 0 2 0 0 13 38 14 0
## 633 1 0 0 1 0 1 12 29 2 4
## 765 0 2 0 31 0 0 7 17 0 6
## 879 2 0 6 0 2 0 16 22 14 36
## 915 36 24 0 0 0 0 11 5 0 0
## 95 0 0 0 40 0 0 0 8 0 0
## 982 0 0 2 8 0 0 2 4 0 32
Url_words_lda <- LDA(Url_words_dtm, k = 2, control = list(seed = 1234))
# beta gives the probability that a given topic generates a given word
Url_words_topics <- tidy(Url_words_lda, matrix = "beta") # use "beta" in tidy() to extract the phi matrix
Url_words_topics
## # A tibble: 40,744 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 北捷 0.000305
## 2 2 北捷 0.00165
## 3 1 輕軌 0.0300
## 4 2 輕軌 0.0110
## 5 1 公車 0.00240
## 6 2 公車 0.0200
## 7 1 捷運 0.0140
## 8 2 捷運 0.0179
## 9 1 00 0.000969
## 10 2 00 0.000389
## # ... with 40,734 more rows
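# Sanity check: beta is a per-topic distribution over words, so it should sum
# to 1 within each topic.
Url_words_topics %>%
  group_by(topic) %>%
  summarise(total = sum(beta))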
Url_words_top_terms <- Url_words_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
# Plot the ten words with the highest generation probability in each topic
Url_words_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
+ Remove frequent words that are shared across topics.
remove_words <- c("公車", "捷運","輕軌","高雄","高雄市","政府")
Url_words_top_terms <- Url_words_topics %>%
filter(! term %in% remove_words) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
Url_words_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
# Compare topic 1 and topic 2: find words that appear often in one topic but rarely in the other
beta_spread <- Url_words_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .0004 | topic2 > .0004 ) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 619 x 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 00 9.69e- 4 0.000389 -1.32
## 2 07 4.09e- 4 0.000239 -0.773
## 3 1-2 5.79e- 5 0.000840 3.86
## 4 1. 8.14e- 4 0.000447 -0.864
## 5 10 1.50e- 3 0.00359 1.26
## 6 100 1.79e- 4 0.00105 2.56
## 7 104 1.61e-11 0.000537 25.0
## 8 105 1.61e- 4 0.000891 2.47
## 9 106 4.00e- 5 0.000408 3.35
## 10 11 8.01e- 4 0.00139 0.792
## # ... with 609 more rows
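# Reading log_ratio = log2(topic2 / topic1): +1 means the term is twice as
# likely under topic 2, -1 twice as likely under topic 1, 0 equally likely.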
# Take the ten terms with the highest and the ten with the lowest log_ratio
Url_words_topic_ratio <- rbind(beta_spread %>% top_n(10,wt = log_ratio),
beta_spread %>% top_n(-10, log_ratio)) %>%
arrange(log_ratio)
Url_words_topic_ratio
## # A tibble: 20 x 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 住戶 4.42e- 4 2.46e-47 -144.
## 2 統包 6.18e- 4 9.01e-22 -59.3
## 3 二期 7.65e- 4 1.13e-20 -55.9
## 4 廠商 5.30e- 4 2.95e-20 -54.0
## 5 黑暗 4.12e- 4 1.51e-18 -48.0
## 6 大順 3.93e- 3 5.03e-15 -39.5
## 7 停工 6.18e- 4 1.08e-15 -39.1
## 8 車流 6.48e- 4 1.18e-15 -39.0
## 9 西班牙 4.42e- 4 4.61e-14 -33.2
## 10 抗議 8.54e- 4 2.69e-13 -31.6
## 11 車次 1.07e-17 5.54e- 4 45.6
## 12 候車 4.12e-20 6.38e- 4 53.8
## 13 電子 1.94e-21 6.21e- 4 58.1
## 14 票證 1.00e-21 5.71e- 4 59.0
## 15 報名 8.02e-22 5.54e- 4 59.3
## 16 里程 3.47e-27 4.20e- 4 76.7
## 17 悠遊 1.19e-28 8.73e- 4 82.6
## 18 車資 3.65e-35 4.20e- 4 103.
## 19 享有 1.68e-47 4.70e- 4 144.
## 20 套票 6.12e-84 4.36e- 4 265.
Url_words_topic_ratio %>%
ggplot(aes(x = reorder(term, log_ratio),
y = log_ratio)) +
geom_bar(stat="identity") +
xlab("Word")+
coord_flip()
# theta (θ) matrix (document × topic)
Url_words_documents <- tidy(Url_words_lda, matrix = "gamma") # use "gamma" in tidy() to extract the theta matrix
Url_words_documents
## # A tibble: 2,030 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 378 1 0.000182
## 2 395 1 1.000
## 3 95 1 0.000220
## 4 498 1 1.000
## 5 455 1 0.0000774
## 6 879 1 1.000
## 7 915 1 1.000
## 8 982 1 0.927
## 9 765 1 0.513
## 10 633 1 0.504
## # ... with 2,020 more rows
# gamma is the proportion of words in a document that come from the given topic
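# Sanity check: each document's gamma values should sum to 1 across topics.
Url_words_documents %>%
  group_by(document) %>%
  summarise(total = sum(gamma))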
Url_words_2$artId <- as.character(Url_words_2$artId)
Url_words_documents %>%
group_by(document) %>%
top_n(1,gamma) %>%
arrange(topic) %>%
inner_join(Url_words_2 %>% distinct(artUrl,artId), by=c("document" = "artId")) %>%
inner_join(non_comment_Kaohsiung, by=c("artUrl"= "url")) %>%
select(topic,text)
## # A tibble: 1,015 x 3
## # Groups: document [1,015]
## document topic text
## <chr> <int> <chr>
## 1 395 1 "身為高雄人,也因為跟這位叔叔有點淵源\n\n\n他做了這麼多事情而離世了,我相信他最希望的也不是R.I.P這句~
## 2 498 1 "基本上捷運在台北和其他縣市觀感可能差很大.\n\n\n所以經歷過台北捷運黑暗期的歷史.\n那時候黑暗期,本來非~
## 3 879 1 "\n高雄交通體檢】輕軌二階段爭議大 捷運黃線待籌6百億\n出版時間:2018/06/17 11:00\n\n\~
## 4 915 1 "\n標題: [食記] 高雄市 捷運巨蛋站美食餐廳懶人包\n時間: Wed Aug 8 11:01:04 20~
## 5 982 1 "\n鐵路地下化 輕軌地上跑\n但輕軌與火車本質上是很不一樣的東西\n\n平交道遮斷路口常常長達兩三分鐘以上\n~
## 6 765 1 "看到公車我就來氣\n我是不知道其他路線是怎麼樣\n應該也好不到哪裡去\n\n通常一出捷運站走到公車站牌\n要等~
## 7 633 1 "之前有一位公眾人物因為在美麗島站\n拍了一張照片引起了很多意見\n的確造成這樣別人這樣的印像也是因為高雄現在捷~
## 8 808 1 "\n高雄輕軌入市中心衝擊交通?捷運局:依序、間隔施工\n \n\n高雄輕軌第一階段完工通車,第二階段目前已施工~
## 9 992 1 "黃2C路線及時刻表資訊:漢程客運→\n \n高雄市公車動態資訊→黃1 \n\n高雄捷運黃線路線圖:高雄市政~
## 10 831 1 "這幾天都在肝FGO\n不過還是說一下我的拙見吧\n在下也不太喜歡平面輕軌\n應該說 不管是高架還是平面 在下都~
## # ... with 1,005 more rows
Url_words_documents %>%
group_by(topic) %>%
top_n(10, wt=gamma) %>%
inner_join(Url_words_2, by = c("document" = "artId")) %>%
distinct(artUrl) %>%
inner_join(non_comment_Kaohsiung, by =c("artUrl"="url")) %>%
select(topic, title)
## Warning: Column `artUrl`/`url` joining factor and character vector,
## coercing into character vector
## # A tibble: 20 x 2
## # Groups: topic [2]
## topic title
## <int> <chr>
## 1 1 Re: [新聞] 高雄輕軌頭號粉絲辭世 兒子:人文輕軌是
## 2 1 Re: [新聞] 高雄捷運黃線拉抬 這3區住戶如中樂透
## 3 1 [新聞] 輕軌二階段爭議大 捷運黃線待籌6百億
## 4 1 Fw: [食記] 高雄市 捷運巨蛋站美食餐廳懶人包
## 5 1 Re: [新聞] 民眾抗議輕軌衝擊生活交通 高市捷運局加
## 6 1 Fw: [新聞] 李克聰:「不得已」的輕軌 該喊停了
## 7 1 Re: [新聞] 砸千億!高雄終於要有第3條捷運「黃線」
## 8 1 [遊記] 高雄捷運黃線先導公車-黃2C路線考察
## 9 1 [新聞] 民眾抗議輕軌衝擊生活交通 高市捷運局加
## 10 1 Re: [新聞] 民眾抗議輕軌衝擊生活交通 高市捷運局加
## 11 2 [轉錄] 有沒有第二強和最弱的捷運站的八卦?
## 12 2 [新聞] 3月1日起「捷運、公車雙向轉乘」公車半
## 13 2 Fw: [分享] 高雄捷運+輕軌+公車心得
## 14 2 Fw: [情報] 黃1捷運先導公車12月1日上路
## 15 2 Re: [新聞] 高雄輕軌營運年花上億 每天載不到3千人
## 16 2 [遊記] 高雄輕軌 試乘活動心得,報名方式及注意
## 17 2 [閒聊] 捷運真的比公車舒適便利嗎?
## 18 2 Fw: [討論] 高雄打算規畫捷運公車吃到飽套票
## 19 2 Fw: [情報] 107年4月北高捷運各站進出運量
## 20 2 [遊記] 高雄輕軌 免費試乘心得 & 預約說明
Url_words_documents_spread <- Url_words_documents %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, gamma)
half_num = round(nrow(Url_words %>% distinct(artUrl)) / 2) # half the number of articles
topic1_id <- Url_words_documents_spread %>% # the half_num documents with the highest topic1 gamma
top_n(half_num, topic1) %>%
select(document) %>%
unlist()
topic2_id <- Url_words_documents_spread$document %>%
setdiff(topic1_id)
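# An alternative sketch (hard assignment rather than a half/half split): label
# each document with its highest-gamma topic.
doc_topic <- Url_words_documents %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup()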
#install.packages("wordcloud2")
require(wordcloud2)
## Loading required package: wordcloud2
word_cloud_topic_1 <- Url_words_2 %>%
filter(artId %in% topic1_id) %>%
group_by(word) %>%
summarise(sum = sum(n)) %>%
arrange(desc(sum)) %>%
wordcloud2()
word_cloud_topic_1
# saveWidget(word_cloud_topic_1, "word_cloud_topic_1.html", selfcontained = F)
# img <- webshot("word_cloud_topic_1.html", "wc1.png", delay=5)
word_cloud_topic_2 <- Url_words_2 %>%
filter(artId %in% topic2_id ) %>%
group_by(word) %>%
summarise(sum = sum(n)) %>%
arrange(desc(sum)) %>%
wordcloud2()
word_cloud_topic_2
# saveWidget(word_cloud_topic_2, "word_cloud_topic_2.html", selfcontained = F)
# img <- webshot("word_cloud_topic_2.html", "wc2.png", delay=5)
# Comment (reply) data
comment_Kaohsiung <- read_csv("comment_Kaohsiung.csv")
# Show the first rows
kable(comment_Kaohsiung %>% head()) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
| url | commet_num | text | comment_type | ip | author | title | time | board_content | board_content_type | comment_author | comment_ip | comment_time |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77853 | 這樣不會狂虧損嗎? 雖然開心免費這件事 | 推 | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | Alex1103 | NA | 2014-02-25 08:03:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77854 | 到底是要免費多久啊=口= | → | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | blankhole | NA | 2014-02-25 07:13:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77855 | 推阿~~希望公車能再進步些 | 推 | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | cy2013 | NA | 2014-02-25 12:19:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77856 | 讚! | 推 | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | daisukidayo | NA | 2014-02-26 10:59:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77857 | 最近看到越來越多人拿一卡通道OK小額消費了好現象 | 推 | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | dreamcoast | NA | 2014-02-25 15:47:00 |
| https://www.ptt.cc/bbs/Kaohsiung/M.1393307238.A.EF8.html | 77858 | 大推! 繼續搭公車去旅行~ | 推 | 180.218.80.252 | scottlu28 (D) | [閒聊] 一卡通免費搭公車延長到9月底 | 2014-02-25 05:47:15 | Kaohsiung | 閒聊 | familia | NA | 2014-03-01 01:55:00 |
# Sentiment analysis based on comment_type (推 = upvote, 噓 = downvote)
comment_type_url <- comment_Kaohsiung %>%
  select(url, comment_type, time) %>%
  mutate(month = floor_date(time, unit = "month"),
         score = ifelse(comment_type == "推", 1, -1)) %>%
  select(-time)
# Keep only 推 and 噓; drop the neutral "→" comments
comment_type_url <- comment_type_url %>% filter(comment_type != "→")
# Pull find_word from non_comment_Kaohsiung
test <- non_comment_Kaohsiung %>% select(url, find_word)
comment_type_url <- comment_type_url %>% left_join(test, by = "url")
# Sum the scores per article, then average per month
comment_type_url_2 <- comment_type_url %>%
  group_by(url, find_word) %>%
  summarise(Sum = sum(score)) %>%
  ungroup() %>%
  left_join(comment_type_url %>% select(url, month), by = "url") %>%
  group_by(month, find_word) %>%
  summarise(month_mean = mean(Sum))
comment_type_url_2 %>% head(15)
## # A tibble: 15 x 3
## # Groups: month [9]
## month find_word month_mean
## <dttm> <chr> <dbl>
## 1 2014-02-01 00:00:00 公車 25
## 2 2014-03-01 00:00:00 公車 4
## 3 2014-04-01 00:00:00 公車 2.75
## 4 2014-05-01 00:00:00 公車 2
## 5 2014-06-01 00:00:00 公車 1
## 6 2014-08-01 00:00:00 公車 8
## 7 2014-09-01 00:00:00 公車 20.7
## 8 2014-09-01 00:00:00 捷運 10
## 9 2014-09-01 00:00:00 輕軌 10
## 10 2014-10-01 00:00:00 公車 -0.143
## 11 2014-10-01 00:00:00 捷運 -46.6
## 12 2014-10-01 00:00:00 輕軌 10.4
## 13 2014-11-01 00:00:00 公車 7.70
## 14 2014-11-01 00:00:00 捷運 11.7
## 15 2014-11-01 00:00:00 輕軌 12.1
# Plot
comment_type_url_2 %>%
  ggplot() +
  geom_line(aes(month, month_mean), size = 1) +
  labs(x = "月份", y = "情緒分數") +
  facet_wrap(~find_word, ncol = 1, scales = "free_y")