Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "Rtsne", "randomcoloR", "wordcloud", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
require(ggplot2)
## Loading required package: ggplot2
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
require(widyr)
## Loading required package: widyr
require(readr)
## Loading required package: readr
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
require(NLP)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
require(ggraph)
## Loading required package: ggraph
require(igraph)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
require(tm)
## Loading required package: tm
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(quanteda)
## Loading required package: quanteda
## Package version: 2.0.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:igraph':
##
## as.igraph
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
## The following object is masked from 'package:utils':
##
## View
require(Matrix)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
require(slam)
## Loading required package: slam
##
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
##
## rollup
require(Rtsne)
## Loading required package: Rtsne
require(randomcoloR)
## Loading required package: randomcoloR
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
資料載入:本資料為2018/06/01 ~ 2020/04/20 PTT Movie、ChinaDrama、KoreaDrama、EAseries、TaiwanDrama Gossiping、Womentalk 之資料,透過文字分析平台檢索「Netflix」、「網飛」兩個關鍵字,共搜尋到49855篇文章。
netflix = fread('/Users/bonniechen/Desktop/midterm/netflixRecommend_articleMetaData.csv',encoding = 'UTF-8')
netflix$artDate = netflix$artDate %>% as.Date("%Y/%m/%d")
head(netflix)
## artTitle artDate artTime
## 1: [請益]Netflix電影 2018-06-30 10:22:08
## 2: [請益]Netflix電影 2018-06-30 10:22:08
## 3: [請益]Netflix電影 2018-06-30 10:22:08
## 4: [請益]Netflix電影 2018-06-30 10:22:08
## 5: [請益]Netflix電影 2018-06-30 10:22:08
## 6: [請益]Netflix電影 2018-06-30 10:22:08
## artUrl artPoster artCat
## 1: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 2: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 3: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 4: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 5: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 6: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## commentPoster commentStatus commentDate
## 1: ievolnds 推 2018-06-30 18:30:00+00:00
## 2: shamanlin → 2018-06-30 18:32:00+00:00
## 3: shamanlin → 2018-06-30 18:32:00+00:00
## 4: egg781 推 2018-06-30 18:56:00+00:00
## 5: SE4NLN415 → 2018-06-30 19:06:00+00:00
## 6: SE4NLN415 → 2018-06-30 19:06:00+00:00
## commentContent
## 1: :網飛'自製'電影真的十部有九部爛
## 2: :幾乎都爛,很少看到好的
## 3: :有想看的美劇上架臨時買一個月就好了
## 4: :另一部能看的就幽靈空間了
## 5: :一直都很雷頂多strangerthings紙牌屋很多人捧
## 6: :但內容沒多到我想花錢倒是朋友給我用他帳號還不錯
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
n_tokens <- netflix %>% unnest_tokens(word, commentContent, token=tokenizer)
str(n_tokens)
## Classes 'data.table' and 'data.frame': 188153 obs. of 10 variables:
## $ artTitle : chr "[請益]Netflix電影" "[請益]Netflix電影" "[請益]Netflix電影" "[請益]Netflix電影" ...
## $ artDate : Date, format: "2018-06-30" "2018-06-30" ...
## $ artTime : chr "10:22:08" "10:22:08" "10:22:08" "10:22:08" ...
## $ artUrl : chr "https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html" "https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html" "https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html" "https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html" ...
## $ artPoster : chr "black58gigi" "black58gigi" "black58gigi" "black58gigi" ...
## $ artCat : chr "movie" "movie" "movie" "movie" ...
## $ commentPoster: chr "ievolnds" "ievolnds" "ievolnds" "ievolnds" ...
## $ commentStatus: chr "推" "推" "推" "推" ...
## $ commentDate : chr "2018-06-30 18:30:00+00:00" "2018-06-30 18:30:00+00:00" "2018-06-30 18:30:00+00:00" "2018-06-30 18:30:00+00:00" ...
## $ word : chr "網飛" "電影" "真的" "十部" ...
## - attr(*, ".internal.selfref")=<externalptr>
head(n_tokens, 20)
## artTitle artDate artTime
## 1: [請益]Netflix電影 2018-06-30 10:22:08
## 2: [請益]Netflix電影 2018-06-30 10:22:08
## 3: [請益]Netflix電影 2018-06-30 10:22:08
## 4: [請益]Netflix電影 2018-06-30 10:22:08
## 5: [請益]Netflix電影 2018-06-30 10:22:08
## 6: [請益]Netflix電影 2018-06-30 10:22:08
## 7: [請益]Netflix電影 2018-06-30 10:22:08
## 8: [請益]Netflix電影 2018-06-30 10:22:08
## 9: [請益]Netflix電影 2018-06-30 10:22:08
## 10: [請益]Netflix電影 2018-06-30 10:22:08
## 11: [請益]Netflix電影 2018-06-30 10:22:08
## 12: [請益]Netflix電影 2018-06-30 10:22:08
## 13: [請益]Netflix電影 2018-06-30 10:22:08
## 14: [請益]Netflix電影 2018-06-30 10:22:08
## 15: [請益]Netflix電影 2018-06-30 10:22:08
## 16: [請益]Netflix電影 2018-06-30 10:22:08
## 17: [請益]Netflix電影 2018-06-30 10:22:08
## 18: [請益]Netflix電影 2018-06-30 10:22:08
## 19: [請益]Netflix電影 2018-06-30 10:22:08
## 20: [請益]Netflix電影 2018-06-30 10:22:08
## artUrl artPoster artCat
## 1: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 2: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 3: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 4: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 5: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 6: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 7: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 8: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 9: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 10: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 11: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 12: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 13: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 14: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 15: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 16: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 17: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 18: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 19: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 20: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## commentPoster commentStatus commentDate word
## 1: ievolnds 推 2018-06-30 18:30:00+00:00 網飛
## 2: ievolnds 推 2018-06-30 18:30:00+00:00 電影
## 3: ievolnds 推 2018-06-30 18:30:00+00:00 真的
## 4: ievolnds 推 2018-06-30 18:30:00+00:00 十部
## 5: ievolnds 推 2018-06-30 18:30:00+00:00 九部
## 6: shamanlin → 2018-06-30 18:32:00+00:00 都爛
## 7: shamanlin → 2018-06-30 18:32:00+00:00 看到
## 8: shamanlin → 2018-06-30 18:32:00+00:00 美劇
## 9: shamanlin → 2018-06-30 18:32:00+00:00 上架
## 10: shamanlin → 2018-06-30 18:32:00+00:00 臨時
## 11: shamanlin → 2018-06-30 18:32:00+00:00 一個月
## 12: egg781 推 2018-06-30 18:56:00+00:00 一部
## 13: egg781 推 2018-06-30 18:56:00+00:00 幽靈
## 14: egg781 推 2018-06-30 18:56:00+00:00 空間
## 15: SE4NLN415 → 2018-06-30 19:06:00+00:00 一直
## 16: SE4NLN415 → 2018-06-30 19:06:00+00:00 很雷
## 17: SE4NLN415 → 2018-06-30 19:06:00+00:00 strangerthings
## 18: SE4NLN415 → 2018-06-30 19:06:00+00:00 紙牌
## 19: SE4NLN415 → 2018-06-30 19:06:00+00:00 內容
## 20: SE4NLN415 → 2018-06-30 19:06:00+00:00 沒多到
word_count <- n_tokens %>%
#select(word) %>%
group_by(word) %>%
summarise(count = n()) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
head(word_count, 100)
## # A tibble: 100 x 2
## word count
## <chr> <int>
## 1 netflix 2162
## 2 真的 1973
## 3 好看 1550
## 4 電影 1482
## 5 覺得 1447
## 6 xd 1355
## 7 台灣 1064
## 8 網飛 974
## 9 nf 892
## 10 應該 839
## # … with 90 more rows
討論度較高、較常出現的影集名稱為「毒梟」、「黑鏡」、「怪奇物語」、「罪夢者」等
word_count %>%
filter(word !="netflix") %>%
filter(word !="網飛") %>%
filter(word !="nf") %>%
filter(count>200) %>%
wordcloud2()
可大致上看出Netflix在ptt版上常和「電影」、「影集」、「韓劇」等關鍵字一同出現
plot_merge <- word_count %>%
filter(word !="netflix") %>%
filter(word !="網飛") %>%
filter(word !="nf") %>%
#group_by(type) %>%
top_n(30, count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(x=word, y=count)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y="詞頻") +
#facet_wrap(~type, ncol = 1, scales="free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
plot_merge
我們好奇排行榜中前面沒有出現韓劇名稱,因此另外從全部資料中,抓出KoreaDrama版的資料
korea <- n_tokens %>% filter(artCat == "KoreaDrama")
head(korea)
## artTitle artDate artTime
## 1 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 2 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 3 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 4 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 5 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 6 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 2 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 3 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 4 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 5 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 6 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## artCat commentPoster commentStatus commentDate word
## 1 KoreaDrama s55272004 推 2018-06-17 08:34:00 分手
## 2 KoreaDrama s55272004 推 2018-06-17 08:34:00 第二天
## 3 KoreaDrama hannaminion → 2018-06-17 08:34:00 分手
## 4 KoreaDrama hannaminion → 2018-06-17 08:34:00 第二天
## 5 KoreaDrama bohemianismR 推 2018-06-17 08:40:00 這部
## 6 KoreaDrama bohemianismR 推 2018-06-17 08:40:00 明洙
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
korea_tokens_count <- korea %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
head(korea_tokens_count, 1000)
## # A tibble: 88 x 2
## word sum
## <chr> <int>
## 1 netflix 196
## 2 期待 132
## 3 真的 100
## 4 xd 100
## 5 愛奇藝 81
## 6 韓劇 72
## 7 覺得 69
## 8 好看 54
## 9 網飛 52
## 10 這部 47
## # … with 78 more rows
# korea_tokens_count %>%
# filter(word !="netflix") %>%
# filter(word !="網飛") %>%
# wordcloud2()
可看出常出現的韓劇名稱有「hyena」、「愛的迫降」、「機智醫生」等
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count_by_date <- n_tokens %>%
#filter(nchar(.$word)>1) %>%
group_by(word, artDate) %>%
#group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>3) %>%
arrange(desc(sum))
head(tokens_count_by_date)
## # A tibble: 6 x 3
## # Groups: word [5]
## word artDate sum
## <chr> <date> <int>
## 1 nf 2019-03-04 163
## 2 電影 2019-03-04 158
## 3 網飛 2019-03-04 137
## 4 99 2020-01-03 136
## 5 戲院 2019-03-04 135
## 6 電影 2019-03-03 116
# tokens_count_by_date <- n_tokens %>%
# group_by(artDate) %>%
# summarise(count = n())
# tokens_count_by_date
以LIWC字典判斷文集中的word屬於正面字還是負面字
# 正向字典txt檔
# 以,將字分隔
P <- read_file("dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
#字典txt檔讀進來是一個字串
typeof(N)
## [1] "character"
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
文集中的字出現在LIWC字典中是屬於positive還是negative
tokens_count_by_date %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 502 x 4
## # Groups: word [131]
## word artDate sum sentiment
## <chr> <date> <int> <fct>
## 1 值得 2019-03-31 77 positive
## 2 問題 2019-03-04 62 negative
## 3 可憐 2020-04-03 35 negative
## 4 遊戲 2019-12-25 34 positive
## 5 難看 2020-04-03 27 negative
## 6 願意 2019-03-04 25 positive
## 7 難看 2020-04-11 17 negative
## 8 問題 2019-01-27 17 negative
## 9 問題 2019-11-01 17 negative
## 10 降低 2020-03-31 16 negative
## # … with 492 more rows
# n_tokens %>%
# select(word) %>%
# inner_join(LIWC)
# n_tokens_liwc <- n_tokens %>%
# select(word) %>%
# inner_join(LIWC)
# #merge(x = n_tokens, y = LIWC, by = "word", all.y = TRUE)
# n_tokens_liwc <- n_tokens_liwc %>%
# left_join(n_tokens)
# n_tokens %>%
# select(word) %>%
# inner_join(LIWC)
sentiment_count = tokens_count_by_date %>%
select(artDate,word,sum) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(sum))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%y/%m/%d"))
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%y/%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2019-03-31'))
[1]])),colour = "red", size = 0.5) +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2019-03-04'))
[1]])),colour = "blue", size = 0.5) +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020-04-03'))
[1]])),colour = "blue", size = 0.5)
netflix %>% filter(artDate == as.Date('2019-03-31')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 [請益]Netflix的魔戒電影突然沒辦法看了!? 2019-03-31 17:51:12
## 2 [問卦]Netflix值得買嗎? 2019-03-31 03:19:35
## 3 Re:[問卦]Netflix值得買嗎? 2019-03-31 03:38:45
## 4 [問卦]Netflix觀眾評分到底有三小意義? 2019-03-31 03:39:09
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/movie/M.1554083834.A.DCB.html hifone112
## 2 https://www.ptt.cc/bbs/Gossiping/M.1554031537.A.6D8.html simonbear
## 3 https://www.ptt.cc/bbs/Gossiping/M.1554032687.A.2FE.html pradeclick
## 4 https://www.ptt.cc/bbs/Gossiping/M.1554032712.A.676.html peter080808
## artCat commentPoster commentStatus commentDate
## 1 movie SupCat 推 2019-04-01 01:56:00+00:00
## 2 Gossiping ilovelol → 2019-03-31 11:19:00+00:00
## 3 Gossiping Win7 噓 2019-03-31 11:41:00+00:00
## 4 Gossiping widec → 2019-03-31 11:40:00+00:00
## commentContent
## 1 :上架要登頭版昭告天下下架惦惦不說話
## 2 :值得
## 3 :沒有一個月試用?你從何得知的訊息?一張信用卡試用一個月
## 4 :Netflix介面這麼爛你不靠北反而靠北評分?
# tokens_count_by_date %>%
# filter(artDate == as.Date('2019-03-31')) %>%
# select(word,sum) %>%
# group_by(word) %>%
# summarise(count = sum(sum)) %>%
# #filter(count>20) %>% # 過濾出現太少次的字
# wordcloud2()
哪篇文章的正面情緒最多?正面情緒的字是?
n_tokens %>%
filter(artDate == as.Date('2019-03-31')) %>%
inner_join(LIWC) %>%
filter(sentiment == "positive") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 4 x 4
## # Groups: artUrl [4]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M… positive [問卦]Netflix值得買嗎? 113
## 2 https://www.ptt.cc/bbs/Gossiping/M… positive [問卦]Netflix觀眾評分到底有三小意義?… 7
## 3 https://www.ptt.cc/bbs/Gossiping/M… positive Re:[問卦]Netflix值得買嗎? 6
## 4 https://www.ptt.cc/bbs/movie/M.155… positive [請益]Netflix的魔戒電影突然沒辦法看了!?… 6
n_tokens %>%
filter(artDate == as.Date('2019-03-31')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
前幾篇內容在討論「Netflix值得買嗎?」,因此出現許多正面相關詞彙「值得」、「不錯」等,因此在這篇文章中推論出大家對於Netflix是否值得購買是較為肯定的
netflix %>% filter(artDate == as.Date('2019-03-04')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 [好雷]絲絨電鋸VelvetBuzzsaw(netflix) 2019-03-04 00:08:53
## 2 [新聞]史匹柏修法阻止參加奧斯卡Netflix回應了 2019-03-04 04:50:57
## 3 Re:[新聞]史匹柏修法阻止參加奧斯卡Netflix回應了 2019-03-04 07:18:47
## 4 [請益]Netflix發生未預期的錯誤 2019-03-04 17:18:35
## 5 [新聞]史蒂芬史匹柏提案修法 拒絕Netflix電影 2019-03-04 07:37:05
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/movie/M.1551687296.A.D54.html zzauber
## 2 https://www.ptt.cc/bbs/movie/M.1551704219.A.B3A.html haehae311444
## 3 https://www.ptt.cc/bbs/movie/M.1551713091.A.1C2.html ezdoesit
## 4 https://www.ptt.cc/bbs/EAseries/M.1551749079.A.C40.html timbrake
## 5 https://www.ptt.cc/bbs/Gossiping/M.1551714190.A.A55.html ejrq5785
## artCat commentPoster commentStatus commentDate
## 1 movie sfzerox 推 2019-03-04 08:58:00+00:00
## 2 movie purplebfly 推 2019-03-04 12:52:00+00:00
## 3 movie angellll → 2019-03-04 15:21:00+00:00
## 4 EAseries NotLuo 推 2019-03-05 03:19:00+00:00
## 5 Gossiping North4use 推 2019-03-04 15:37:00+00:00
## commentContent
## 1 :狠婊藝術炒作只是不覺得男主有死的必要他沒犯錯吧
## 2 :你就找一間電影院,播個口碑場1場或1天,不就好了
## 3 :就跟手機一樣摟糞game排除優質遊戲
## 4 :我的可以播耶
## 5 :好等下去Netflix看搶救雷恩大兵
# tokens_count_by_date %>%
# filter(artDate == as.Date('2019-03-04')) %>%
# select(word,sum) %>%
# group_by(word) %>%
# summarise(count = sum(sum)) %>%
# filter(count>20) %>% # 過濾出現太少次的字
# wordcloud2()
哪篇文章的負面情緒最多?負面情緒的字是?
n_tokens %>%
filter(artDate == as.Date('2019-03-04')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 5 x 4
## # Groups: artUrl [5]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/movie/M.1… negative Re:[新聞]史匹柏修法阻止參加奧斯卡Netflix回… 153
## 2 https://www.ptt.cc/bbs/movie/M.1… negative [新聞]史匹柏修法阻止參加奧斯卡Netflix回應了… 74
## 3 https://www.ptt.cc/bbs/EAseries/… negative [請益]Netflix發生未預期的錯誤… 2
## 4 https://www.ptt.cc/bbs/Gossiping… negative [新聞]史蒂芬史匹柏提案修法 拒絕Netflix電影… 2
## 5 https://www.ptt.cc/bbs/movie/M.1… negative [好雷]絲絨電鋸VelvetBuzzsaw(netfl… 1
n_tokens %>%
filter(artDate == as.Date('2019-03-04')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
前幾篇內容在討論「史蒂芬史匹柏提案修法 拒絕Netflix電影報名奧斯卡」,因此出現許多負面詞彙「問題」、「侵犯」、「放棄」等,看出大家對於此議題情緒較為負面
netflix %>% filter(artDate == as.Date('2020-04-03')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 Fw:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 07:02:42
## 2 [新聞]丁海寅將出演Netflix新劇《D.P狗的日子》 2020-04-03 02:58:35
## 3 [新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 06:53:48
## 4 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 07:12:42
## 5 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 09:31:31
## 6 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 10:02:04
## 7 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 10:06:18
## 8 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 10:10:56
## 9 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 22:30:17
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/China-Drama/M.1585897363.A.0A7.html YOPOYOPO
## 2 https://www.ptt.cc/bbs/KoreaDrama/M.1585882718.A.941.html sosoing
## 3 https://www.ptt.cc/bbs/Gossiping/M.1585896839.A.852.html yu1164
## 4 https://www.ptt.cc/bbs/Gossiping/M.1585897966.A.044.html zyc5566
## 5 https://www.ptt.cc/bbs/Gossiping/M.1585906294.A.69A.html bear26
## 6 https://www.ptt.cc/bbs/Gossiping/M.1585908126.A.957.html nikubou5566
## 7 https://www.ptt.cc/bbs/Gossiping/M.1585908380.A.555.html HyperPoro
## 8 https://www.ptt.cc/bbs/Gossiping/M.1585908658.A.350.html qq204
## 9 https://www.ptt.cc/bbs/Gossiping/M.1585953019.A.603.html ryaninscu
## artCat commentPoster commentStatus commentDate
## 1 China_Drama cuteme5566 推 2020-04-03 14:54:00
## 2 KoreaDrama tj2061 推 2020-04-03 11:24:00
## 3 Gossiping cuteme5566 推 2020-04-03 14:54:00
## 4 Gossiping BrianTN17 推 2020-04-03 15:13:00
## 5 Gossiping milkyway168 → 2020-04-03 17:35:00
## 6 Gossiping T3T 噓 2020-04-03 18:03:00
## 7 Gossiping bamama56 → 2020-04-03 18:06:00
## 8 Gossiping babyMclaren → 2020-04-03 18:12:00
## 9 Gossiping suckpopo 推 2020-04-04 06:37:00
## commentContent
## 1 :五樓只看甲片
## 2 :丁海寅狂接欸
## 3 :五樓只看甲片
## 4 :大頭貼也來一段
## 5 :廣告費砸下去啊!
## 6 :日本片無法演了因為被抓去關
## 7 :我猜他老婆愛看
## 8 :韓劇很多人看但很難看
## 9 :你沒研究一個失去才華的人如何力爭上游,想東山再起嗎?
# tokens_count_by_date %>%
# filter(artDate == as.Date('2020-04-03')) %>%
# select(word,sum) %>%
# group_by(word) %>%
# summarise(count = sum(sum)) %>%
# filter(count>8) %>% # 過濾出現太少次的字
# wordcloud2()
哪篇文章的負面情緒最多?負面情緒的字是?
n_tokens %>%
filter(artDate == as.Date('2020-04-03')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 6 x 4
## # Groups: artUrl [6]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/… negative [新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」… 95
## 2 https://www.ptt.cc/bbs/China-Dram… negative Fw:[新聞]周杰倫爆氣轟Netflix「都看你在推… 19
## 3 https://www.ptt.cc/bbs/Gossiping/… negative Re:[新聞]周杰倫爆氣轟Netflix「都看你在推… 8
## 4 https://www.ptt.cc/bbs/Gossiping/… negative Re:[新聞]周杰倫爆氣轟Netflix「都看你在推… 8
## 5 https://www.ptt.cc/bbs/Gossiping/… negative Re:[新聞]周杰倫爆氣轟Netflix「都看你在推… 1
## 6 https://www.ptt.cc/bbs/Gossiping/… negative Re:[新聞]周杰倫爆氣轟Netflix「都看你在推… 1
n_tokens %>%
filter(artDate == as.Date('2020-04-03')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
前幾篇內容在討論「周杰倫爆氣轟Netflix「都看你在推韓劇」,不滿《周遊記》能見度低」,引起網友不滿,出現「可憐」、「難看」、「垃圾」等負面詞彙。