動機與分析目的:近期政府大動作下架盜版影片網站,也因為疫情關係大家長時間待在家追劇,因此我們想看看熱門線上影音平台Netflix、愛奇藝、KKTV的比較,但由於ptt上有關愛奇藝和KKTV的資料量不多,因此我們選擇Netflix來分析在ptt上的討論度。此外,也分析出ptt上較推薦的影集供大家參考。
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "Rtsne", "randomcoloR", "wordcloud", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
require(ggplot2)
## Loading required package: ggplot2
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
require(widyr)
## Loading required package: widyr
require(readr)
## Loading required package: readr
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
require(NLP)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
require(ggraph)
## Loading required package: ggraph
require(igraph)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
require(tm)
## Loading required package: tm
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(quanteda)
## Loading required package: quanteda
## Package version: 2.0.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:igraph':
##
## as.igraph
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
## The following object is masked from 'package:utils':
##
## View
require(Matrix)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
require(slam)
## Loading required package: slam
##
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
##
## rollup
require(Rtsne)
## Loading required package: Rtsne
require(randomcoloR)
## Loading required package: randomcoloR
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
資料載入:本資料為2018/06/01 ~ 2020/04/20 PTT Movie、ChinaDrama、KoreaDrama、EAseries、TaiwanDrama Gossiping、Womentalk 之資料,透過文字分析平台檢索「Netflix」、「網飛」兩個關鍵字,共搜尋到49855篇文章。
netflix = fread('/Users/bonniechen/Desktop/midterm/netflixRecommend_articleMetaData.csv',encoding = 'UTF-8')
netflix$artDate = netflix$artDate %>% as.Date("%Y/%m/%d")
head(netflix)
## artTitle artDate artTime
## 1: [請益]Netflix電影 2018-06-30 10:22:08
## 2: [請益]Netflix電影 2018-06-30 10:22:08
## 3: [請益]Netflix電影 2018-06-30 10:22:08
## 4: [請益]Netflix電影 2018-06-30 10:22:08
## 5: [請益]Netflix電影 2018-06-30 10:22:08
## 6: [請益]Netflix電影 2018-06-30 10:22:08
## artUrl artPoster artCat
## 1: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 2: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 3: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 4: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 5: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 6: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## commentPoster commentStatus commentDate
## 1: ievolnds 推 2018-06-30 18:30:00+00:00
## 2: shamanlin → 2018-06-30 18:32:00+00:00
## 3: shamanlin → 2018-06-30 18:32:00+00:00
## 4: egg781 推 2018-06-30 18:56:00+00:00
## 5: SE4NLN415 → 2018-06-30 19:06:00+00:00
## 6: SE4NLN415 → 2018-06-30 19:06:00+00:00
## commentContent
## 1: :網飛'自製'電影真的十部有九部爛
## 2: :幾乎都爛,很少看到好的
## 3: :有想看的美劇上架臨時買一個月就好了
## 4: :另一部能看的就幽靈空間了
## 5: :一直都很雷頂多strangerthings紙牌屋很多人捧
## 6: :但內容沒多到我想花錢倒是朋友給我用他帳號還不錯
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
n_tokens <- netflix %>% unnest_tokens(word, commentContent, token=tokenizer)
str(n_tokens)
## Classes 'data.table' and 'data.frame': 188144 obs. of 10 variables:
## $ artTitle : chr "[請益]Netflix電影" "[請益]Netflix電影" "[請益]Netflix電影" "[請益]Netflix電影" ...
## $ artDate : Date, format: "2018-06-30" "2018-06-30" ...
## $ artTime : chr "10:22:08" "10:22:08" "10:22:08" "10:22:08" ...
## $ artUrl : chr "https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html" "https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html" "https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html" "https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html" ...
## $ artPoster : chr "black58gigi" "black58gigi" "black58gigi" "black58gigi" ...
## $ artCat : chr "movie" "movie" "movie" "movie" ...
## $ commentPoster: chr "ievolnds" "ievolnds" "ievolnds" "ievolnds" ...
## $ commentStatus: chr "推" "推" "推" "推" ...
## $ commentDate : chr "2018-06-30 18:30:00+00:00" "2018-06-30 18:30:00+00:00" "2018-06-30 18:30:00+00:00" "2018-06-30 18:30:00+00:00" ...
## $ word : chr "網飛" "電影" "真的" "十部" ...
## - attr(*, ".internal.selfref")=<externalptr>
head(n_tokens, 20)
## artTitle artDate artTime
## 1: [請益]Netflix電影 2018-06-30 10:22:08
## 2: [請益]Netflix電影 2018-06-30 10:22:08
## 3: [請益]Netflix電影 2018-06-30 10:22:08
## 4: [請益]Netflix電影 2018-06-30 10:22:08
## 5: [請益]Netflix電影 2018-06-30 10:22:08
## 6: [請益]Netflix電影 2018-06-30 10:22:08
## 7: [請益]Netflix電影 2018-06-30 10:22:08
## 8: [請益]Netflix電影 2018-06-30 10:22:08
## 9: [請益]Netflix電影 2018-06-30 10:22:08
## 10: [請益]Netflix電影 2018-06-30 10:22:08
## 11: [請益]Netflix電影 2018-06-30 10:22:08
## 12: [請益]Netflix電影 2018-06-30 10:22:08
## 13: [請益]Netflix電影 2018-06-30 10:22:08
## 14: [請益]Netflix電影 2018-06-30 10:22:08
## 15: [請益]Netflix電影 2018-06-30 10:22:08
## 16: [請益]Netflix電影 2018-06-30 10:22:08
## 17: [請益]Netflix電影 2018-06-30 10:22:08
## 18: [請益]Netflix電影 2018-06-30 10:22:08
## 19: [請益]Netflix電影 2018-06-30 10:22:08
## 20: [請益]Netflix電影 2018-06-30 10:22:08
## artUrl artPoster artCat
## 1: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 2: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 3: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 4: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 5: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 6: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 7: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 8: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 9: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 10: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 11: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 12: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 13: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 14: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 15: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 16: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 17: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 18: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 19: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## 20: https://www.ptt.cc/bbs/movie/M.1530383291.A.2EF.html black58gigi movie
## commentPoster commentStatus commentDate word
## 1: ievolnds 推 2018-06-30 18:30:00+00:00 網飛
## 2: ievolnds 推 2018-06-30 18:30:00+00:00 電影
## 3: ievolnds 推 2018-06-30 18:30:00+00:00 真的
## 4: ievolnds 推 2018-06-30 18:30:00+00:00 十部
## 5: ievolnds 推 2018-06-30 18:30:00+00:00 九部
## 6: shamanlin → 2018-06-30 18:32:00+00:00 都爛
## 7: shamanlin → 2018-06-30 18:32:00+00:00 看到
## 8: shamanlin → 2018-06-30 18:32:00+00:00 美劇
## 9: shamanlin → 2018-06-30 18:32:00+00:00 上架
## 10: shamanlin → 2018-06-30 18:32:00+00:00 臨時
## 11: shamanlin → 2018-06-30 18:32:00+00:00 一個月
## 12: egg781 推 2018-06-30 18:56:00+00:00 一部
## 13: egg781 推 2018-06-30 18:56:00+00:00 幽靈
## 14: egg781 推 2018-06-30 18:56:00+00:00 空間
## 15: SE4NLN415 → 2018-06-30 19:06:00+00:00 一直
## 16: SE4NLN415 → 2018-06-30 19:06:00+00:00 很雷
## 17: SE4NLN415 → 2018-06-30 19:06:00+00:00 strangerthings
## 18: SE4NLN415 → 2018-06-30 19:06:00+00:00 紙牌
## 19: SE4NLN415 → 2018-06-30 19:06:00+00:00 內容
## 20: SE4NLN415 → 2018-06-30 19:06:00+00:00 沒多到
word_count <- n_tokens %>%
#select(word) %>%
group_by(word) %>%
summarise(count = n()) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
head(word_count, 100)
## # A tibble: 100 x 2
## word count
## <chr> <int>
## 1 netflix 2162
## 2 真的 1973
## 3 好看 1550
## 4 電影 1482
## 5 覺得 1447
## 6 xd 1355
## 7 台灣 1064
## 8 網飛 974
## 9 nf 892
## 10 應該 839
## # … with 90 more rows
討論度較高、較常出現的影集名稱為「毒梟」、「黑鏡」、「怪奇物語」、「罪夢者」等
word_count %>%
filter(word !="netflix") %>%
filter(word !="網飛") %>%
filter(word !="nf") %>%
filter(count>200) %>%
wordcloud2()
可大致上看出Netflix在ptt版上常和「電影」、「影集」、「韓劇」等關鍵字一同出現
plot_merge <- word_count %>%
filter(word !="netflix") %>%
filter(word !="網飛") %>%
filter(word !="nf") %>%
#group_by(type) %>%
top_n(30, count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(x=word, y=count)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y="詞頻") +
#facet_wrap(~type, ncol = 1, scales="free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
plot_merge
我們好奇排行榜中前面沒有出現韓劇名稱,因此另外從全部資料中,抓出KoreaDrama版的資料
korea <- n_tokens %>% filter(artCat == "KoreaDrama")
head(korea)
## artTitle artDate artTime
## 1 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 2 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 3 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 4 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 5 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## 6 [問題]問一部Netflix男主角睡著就會回溯的韓劇 2018-06-17 00:24:17
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 2 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 3 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 4 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 5 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## 6 https://www.ptt.cc/bbs/KoreaDrama/M.1529195060.A.08C.html pondy
## artCat commentPoster commentStatus commentDate word
## 1 KoreaDrama s55272004 推 2018-06-17 08:34:00 分手
## 2 KoreaDrama s55272004 推 2018-06-17 08:34:00 第二天
## 3 KoreaDrama hannaminion → 2018-06-17 08:34:00 分手
## 4 KoreaDrama hannaminion → 2018-06-17 08:34:00 第二天
## 5 KoreaDrama bohemianismR 推 2018-06-17 08:40:00 這部
## 6 KoreaDrama bohemianismR 推 2018-06-17 08:40:00 明洙
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
korea_tokens_count <- korea %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
head(korea_tokens_count, 1000)
## # A tibble: 88 x 2
## word sum
## <chr> <int>
## 1 netflix 196
## 2 期待 132
## 3 真的 100
## 4 xd 100
## 5 愛奇藝 81
## 6 韓劇 72
## 7 覺得 69
## 8 好看 54
## 9 網飛 52
## 10 這部 47
## # … with 78 more rows
# korea_tokens_count %>%
# filter(word !="netflix") %>%
# filter(word !="網飛") %>%
# wordcloud2()
可看出常出現的韓劇名稱有「hyena」、「愛的迫降」、「機智醫生」等
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count_by_date <- n_tokens %>%
#filter(nchar(.$word)>1) %>%
group_by(word, artDate) %>%
#group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>3) %>%
arrange(desc(sum))
head(tokens_count_by_date)
## # A tibble: 6 x 3
## # Groups: word [5]
## word artDate sum
## <chr> <date> <int>
## 1 nf 2019-03-04 163
## 2 電影 2019-03-04 158
## 3 網飛 2019-03-04 137
## 4 99 2020-01-03 136
## 5 戲院 2019-03-04 135
## 6 電影 2019-03-03 116
# tokens_count_by_date <- n_tokens %>%
# group_by(artDate) %>%
# summarise(count = n())
# tokens_count_by_date
以LIWC字典判斷文集中的word屬於正面字還是負面字
# 正向字典txt檔
# 以,將字分隔
P <- read_file("dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
#字典txt檔讀進來是一個字串
typeof(N)
## [1] "character"
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
文集中的字出現在LIWC字典中是屬於positive還是negative
tokens_count_by_date %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 502 x 4
## # Groups: word [131]
## word artDate sum sentiment
## <chr> <date> <int> <fct>
## 1 值得 2019-03-31 77 positive
## 2 問題 2019-03-04 62 negative
## 3 可憐 2020-04-03 35 negative
## 4 遊戲 2019-12-25 34 positive
## 5 難看 2020-04-03 27 negative
## 6 願意 2019-03-04 25 positive
## 7 難看 2020-04-11 17 negative
## 8 問題 2019-01-27 17 negative
## 9 問題 2019-11-01 17 negative
## 10 降低 2020-03-31 16 negative
## # … with 492 more rows
# n_tokens %>%
# select(word) %>%
# inner_join(LIWC)
# n_tokens_liwc <- n_tokens %>%
# select(word) %>%
# inner_join(LIWC)
# #merge(x = n_tokens, y = LIWC, by = "word", all.y = TRUE)
# n_tokens_liwc <- n_tokens_liwc %>%
# left_join(n_tokens)
# n_tokens %>%
# select(word) %>%
# inner_join(LIWC)
sentiment_count = tokens_count_by_date %>%
select(artDate,word,sum) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(sum))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%y/%m/%d"))
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%y/%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2019-03-31'))
[1]])),colour = "red", size = 0.5) +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2019-03-04'))
[1]])),colour = "blue", size = 0.5) +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020-04-03'))
[1]])),colour = "blue", size = 0.5)
netflix %>% filter(artDate == as.Date('2019-03-31')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 [請益]Netflix的魔戒電影突然沒辦法看了!? 2019-03-31 17:51:12
## 2 [問卦]Netflix值得買嗎? 2019-03-31 03:19:35
## 3 Re:[問卦]Netflix值得買嗎? 2019-03-31 03:38:45
## 4 [問卦]Netflix觀眾評分到底有三小意義? 2019-03-31 03:39:09
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/movie/M.1554083834.A.DCB.html hifone112
## 2 https://www.ptt.cc/bbs/Gossiping/M.1554031537.A.6D8.html simonbear
## 3 https://www.ptt.cc/bbs/Gossiping/M.1554032687.A.2FE.html pradeclick
## 4 https://www.ptt.cc/bbs/Gossiping/M.1554032712.A.676.html peter080808
## artCat commentPoster commentStatus commentDate
## 1 movie SupCat 推 2019-04-01 01:56:00+00:00
## 2 Gossiping ilovelol → 2019-03-31 11:19:00+00:00
## 3 Gossiping Win7 噓 2019-03-31 11:41:00+00:00
## 4 Gossiping widec → 2019-03-31 11:40:00+00:00
## commentContent
## 1 :上架要登頭版昭告天下下架惦惦不說話
## 2 :值得
## 3 :沒有一個月試用?你從何得知的訊息?一張信用卡試用一個月
## 4 :Netflix介面這麼爛你不靠北反而靠北評分?
# tokens_count_by_date %>%
# filter(artDate == as.Date('2019-03-31')) %>%
# select(word,sum) %>%
# group_by(word) %>%
# summarise(count = sum(sum)) %>%
# #filter(count>20) %>% # 過濾出現太少次的字
# wordcloud2()
哪篇文章的正面情緒最多?正面情緒的字是?
n_tokens %>%
filter(artDate == as.Date('2019-03-31')) %>%
inner_join(LIWC) %>%
filter(sentiment == "positive") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 4 x 4
## # Groups: artUrl [4]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M… positive [問卦]Netflix值得買嗎? 113
## 2 https://www.ptt.cc/bbs/Gossiping/M… positive [問卦]Netflix觀眾評分到底有三小意義?… 7
## 3 https://www.ptt.cc/bbs/Gossiping/M… positive Re:[問卦]Netflix值得買嗎? 6
## 4 https://www.ptt.cc/bbs/movie/M.155… positive [請益]Netflix的魔戒電影突然沒辦法看了!?… 6
n_tokens %>%
filter(artDate == as.Date('2019-03-31')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
前幾篇內容在討論「Netflix值得買嗎?」,因此出現許多正面相關詞彙「值得」、「不錯」等,因此在這篇文章中推論出大家對於Netflix是否值得購買是較為肯定的
netflix %>% filter(artDate == as.Date('2019-03-04')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 [好雷]絲絨電鋸VelvetBuzzsaw(netflix) 2019-03-04 00:08:53
## 2 [新聞]史匹柏修法阻止參加奧斯卡Netflix回應了 2019-03-04 04:50:57
## 3 Re:[新聞]史匹柏修法阻止參加奧斯卡Netflix回應了 2019-03-04 07:18:47
## 4 [請益]Netflix發生未預期的錯誤 2019-03-04 17:18:35
## 5 [新聞]史蒂芬史匹柏提案修法 拒絕Netflix電影 2019-03-04 07:37:05
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/movie/M.1551687296.A.D54.html zzauber
## 2 https://www.ptt.cc/bbs/movie/M.1551704219.A.B3A.html haehae311444
## 3 https://www.ptt.cc/bbs/movie/M.1551713091.A.1C2.html ezdoesit
## 4 https://www.ptt.cc/bbs/EAseries/M.1551749079.A.C40.html timbrake
## 5 https://www.ptt.cc/bbs/Gossiping/M.1551714190.A.A55.html ejrq5785
## artCat commentPoster commentStatus commentDate
## 1 movie sfzerox 推 2019-03-04 08:58:00+00:00
## 2 movie purplebfly 推 2019-03-04 12:52:00+00:00
## 3 movie angellll → 2019-03-04 15:21:00+00:00
## 4 EAseries NotLuo 推 2019-03-05 03:19:00+00:00
## 5 Gossiping North4use 推 2019-03-04 15:37:00+00:00
## commentContent
## 1 :狠婊藝術炒作只是不覺得男主有死的必要他沒犯錯吧
## 2 :你就找一間電影院,播個口碑場1場或1天,不就好了
## 3 :就跟手機一樣摟糞game排除優質遊戲
## 4 :我的可以播耶
## 5 :好等下去Netflix看搶救雷恩大兵
# tokens_count_by_date %>%
# filter(artDate == as.Date('2019-03-04')) %>%
# select(word,sum) %>%
# group_by(word) %>%
# summarise(count = sum(sum)) %>%
# filter(count>20) %>% # 過濾出現太少次的字
# wordcloud2()
哪篇文章的負面情緒最多?負面情緒的字是?
n_tokens %>%
filter(artDate == as.Date('2019-03-04')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 5 x 4
## # Groups: artUrl [5]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/movie/M.1… negative Re:[新聞]史匹柏修法阻止參加奧斯卡Netflix回… 153
## 2 https://www.ptt.cc/bbs/movie/M.1… negative [新聞]史匹柏修法阻止參加奧斯卡Netflix回應了… 74
## 3 https://www.ptt.cc/bbs/EAseries/… negative [請益]Netflix發生未預期的錯誤… 2
## 4 https://www.ptt.cc/bbs/Gossiping… negative [新聞]史蒂芬史匹柏提案修法 拒絕Netflix電影… 2
## 5 https://www.ptt.cc/bbs/movie/M.1… negative [好雷]絲絨電鋸VelvetBuzzsaw(netfl… 1
n_tokens %>%
filter(artDate == as.Date('2019-03-04')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
前幾篇內容在討論「史蒂芬史匹柏提案修法 拒絕Netflix電影報名奧斯卡」,因此出現許多負面詞彙「問題」、「侵犯」、「放棄」等,看出大家對於此議題情緒較為負面
netflix %>% filter(artDate == as.Date('2020-04-03')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 Fw:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 07:02:42
## 2 [新聞]丁海寅將出演Netflix新劇《D.P狗的日子》 2020-04-03 02:58:35
## 3 [新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 06:53:48
## 4 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 07:12:42
## 5 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 09:31:31
## 6 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 10:02:04
## 7 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 10:06:18
## 8 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 10:10:56
## 9 Re:[新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」 2020-04-03 22:30:17
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/China-Drama/M.1585897363.A.0A7.html YOPOYOPO
## 2 https://www.ptt.cc/bbs/KoreaDrama/M.1585882718.A.941.html sosoing
## 3 https://www.ptt.cc/bbs/Gossiping/M.1585896839.A.852.html yu1164
## 4 https://www.ptt.cc/bbs/Gossiping/M.1585897966.A.044.html zyc5566
## 5 https://www.ptt.cc/bbs/Gossiping/M.1585906294.A.69A.html bear26
## 6 https://www.ptt.cc/bbs/Gossiping/M.1585908126.A.957.html nikubou5566
## 7 https://www.ptt.cc/bbs/Gossiping/M.1585908380.A.555.html HyperPoro
## 8 https://www.ptt.cc/bbs/Gossiping/M.1585908658.A.350.html qq204
## 9 https://www.ptt.cc/bbs/Gossiping/M.1585953019.A.603.html ryaninscu
## artCat commentPoster commentStatus commentDate
## 1 China_Drama cuteme5566 推 2020-04-03 14:54:00
## 2 KoreaDrama tj2061 推 2020-04-03 11:24:00
## 3 Gossiping cuteme5566 推 2020-04-03 14:54:00
## 4 Gossiping BrianTN17 推 2020-04-03 15:13:00
## 5 Gossiping milkyway168 → 2020-04-03 17:35:00
## 6 Gossiping T3T 噓 2020-04-03 18:03:00
## 7 Gossiping bamama56 → 2020-04-03 18:06:00
## 8 Gossiping babyMclaren → 2020-04-03 18:12:00
## 9 Gossiping suckpopo 推 2020-04-04 06:37:00
## commentContent
## 1 :五樓只看甲片
## 2 :丁海寅狂接欸
## 3 :五樓只看甲片
## 4 :大頭貼也來一段
## 5 :廣告費砸下去啊!
## 6 :日本片無法演了因為被抓去關
## 7 :我猜他老婆愛看
## 8 :韓劇很多人看但很難看
## 9 :你沒研究一個失去才華的人如何力爭上游,想東山再起嗎?
# tokens_count_by_date %>%
# filter(artDate == as.Date('2020-04-03')) %>%
# select(word,sum) %>%
# group_by(word) %>%
# summarise(count = sum(sum)) %>%
# filter(count>8) %>% # 過濾出現太少次的字
# wordcloud2()
哪篇文章的負面情緒最多?負面情緒的字是?
n_tokens %>%
filter(artDate == as.Date('2020-04-03')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 6 x 4
## # Groups: artUrl [6]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/… negative [新聞]周杰倫爆氣轟Netflix「都看你在推韓劇」… 95
## 2 https://www.ptt.cc/bbs/China-Dram… negative Fw:[新聞]周杰倫爆氣轟Netflix「都看你在推… 19
## 3 https://www.ptt.cc/bbs/Gossiping/… negative Re:[新聞]周杰倫爆氣轟Netflix「都看你在推… 8
## 4 https://www.ptt.cc/bbs/Gossiping/… negative Re:[新聞]周杰倫爆氣轟Netflix「都看你在推… 8
## 5 https://www.ptt.cc/bbs/Gossiping/… negative Re:[新聞]周杰倫爆氣轟Netflix「都看你在推… 1
## 6 https://www.ptt.cc/bbs/Gossiping/… negative Re:[新聞]周杰倫爆氣轟Netflix「都看你在推… 1
n_tokens %>%
filter(artDate == as.Date('2020-04-03')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
前幾篇內容在討論「周杰倫爆氣轟Netflix「都看你在推韓劇」,不滿《周遊記》能見度低」,引起網友不滿,出現「可憐」、「難看」、「垃圾」等負面詞彙。
資料載入
#黑鏡
bm_comment = fread('mirror/Netflix_comment_mirror_articleMetaData.csv',encoding = 'UTF-8')
bm_post = fread('mirror/Netflix_comment_mirror_artWordFreq.csv',encoding = 'UTF-8')
bm_comment$artDate = bm_comment$artDate %>% as.Date("%Y/%m/%d")
bm_post$artDate = bm_post$artDate %>% as.Date("%Y/%m/%d")
#怪奇物語
s_comment = fread('strange_things/Netflix_comment_strange_articleMetaData.csv',encoding = 'UTF-8')
s_post = fread('strange_things/Netflix_comment_strange_artWordFreq.csv',encoding = 'UTF-8')
s_comment$artDate = s_comment$artDate %>% as.Date("%Y/%m/%d")
s_post$artDate = s_post$artDate %>% as.Date("%Y/%m/%d")
#李屍朝鮮
k_comment = fread('lee/Netflix_comment_kingdom_articleMetaData.csv',encoding = 'UTF-8')
k_post = fread('lee/Netflix_comment_kingdom_artWordFreq.csv',encoding = 'UTF-8')
k_comment$artDate = k_comment$artDate %>% as.Date("%Y/%m/%d")
k_post$artDate = k_post$artDate %>% as.Date("%Y/%m/%d")
#愛的迫將
l_comment = fread('landinginUrgentforLove/Netflix_comment_love_articleMetaData.csv',encoding = 'UTF-8')
l_post = fread('landinginUrgentforLove/Netflix_comment_love_artWordFreq.csv',encoding = 'UTF-8')
l_comment$artDate = l_comment$artDate %>% as.Date("%Y/%m/%d")
l_post$artDate = l_post$artDate %>% as.Date("%Y/%m/%d")
#罪夢者
nm_comment = fread('nowhere/Netflix_comment_nowhere_articleMetaData.csv',encoding = 'UTF-8')
nm_post = fread('nowhere/Netflix_comment_nowhere_artWordFreq.csv',encoding = 'UTF-8')
nm_comment$artDate = nm_comment$artDate %>% as.Date("%Y/%m/%d")
nm_post$artDate = nm_post$artDate %>% as.Date("%Y/%m/%d")
對各個劇的留言的斷詞
# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker(user="user_dict.txt", stop_word = "stop_words.txt")
tokenizer <- function(t) {
lapply(t, function(x){
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
#黑鏡留言斷詞
bm_comment <- bm_comment %>%
unnest_tokens(word, commentContent, token = tokenizer)
#怪奇物語留言斷詞
s_comment <- s_comment %>%
unnest_tokens(word, commentContent, token = tokenizer)
#kingdom留言斷詞
k_comment <- k_comment %>%
unnest_tokens(word, commentContent, token = tokenizer)
#愛的迫降留言斷詞
l_comment <- l_comment %>%
unnest_tokens(word, commentContent, token = tokenizer)
#罪夢者留言斷詞
nm_comment <- nm_comment %>%
unnest_tokens(word, commentContent, token = tokenizer)
黑鏡-貼文分析
黑鏡:文章數-日期
bm_date <- bm_post %>%
select(artDate, artUrl) %>%
distinct()
bm_article_count_by_date <- bm_date %>%
group_by(artDate) %>%
summarise(count = n())
黑鏡:文章數-月份
bm_post <- bm_post %>% mutate(ym = format(bm_post$artDate, format = "%Y/%m")) #抓出年/月份
bm_months <- bm_post %>%
select(artUrl, ym) %>%
distinct()
bm_article_count_by_month <- bm_months %>%
group_by(ym) %>%
summarise(count = n())
bm_months_plot <- bm_months %>%
ggplot(aes(x = ym)) +
geom_bar(color = "pink", size = 0.5) +
ggtitle("黑鏡討論文章數(月份)") +
geom_text(aes(label=..count..), stat="count", color = "black", vjust = -0.5, size = 3) +
theme(text = element_text(family = "Heiti TC Light"))+
xlab("y/m") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
bm_months_plot
bm_tokens_by_month <- bm_post %>%
count(ym, word, sort = T)
#2018年12月(最多文章月份)的詞頻
bm_1812 <- bm_tokens_by_month %>%
filter(ym == "2018/12") %>%
group_by(ym) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col(show.legend = FALSE) +
theme(text = element_text(family = "Heiti TC Light"))+
facet_wrap(~ym, scales="free", ncol = 2) +
coord_flip()
bm_1812
#2019年(三個文章數一樣多的月份)的詞頻
bm_19 <- bm_tokens_by_month %>%
filter(ym == "2019/01" | ym == "2019/06" | ym == "2019/07") %>%
group_by(ym) %>%
top_n(4, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col(show.legend = FALSE) +
theme(text = element_text(family = "Heiti TC Light"))+
facet_wrap(~ym, scales="free", ncol = 2) +
coord_flip()
bm_19
黑鏡-留言分析
bm_comment <- bm_comment %>%
mutate(ym = format(artDate, format = "%Y/%m"))
bm_ctokens_by_month <- bm_comment %>%
count(ym, word, sort = T)
bm_cmonths <- bm_comment %>%
select(ym, artUrl)
bm_cmonths_plot <- bm_cmonths %>%
ggplot(aes(x = ym)) +
geom_bar(color = "pink", size = 0.5) +
ggtitle("黑鏡留言數(月份)") +
geom_text(aes(label=..count..), stat="count", color = "black", vjust = -0.5, size = 3) +
theme(text = element_text(family = "Heiti TC Light"))+
xlab("y/m") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
bm_cmonths_plot
#最多文章月份的詞頻
bm_ctokens_by_month <- bm_comment %>%
count(word, ym, sort = T)
bm_c1907 <- bm_ctokens_by_month %>%
filter(ym == "2018/12" | ym == "2019/01" | ym == "2019/06" | ym == "2019/07") %>%
group_by(ym) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col(show.legend = FALSE) +
theme(text = element_text(family = "Heiti TC Light"))+
facet_wrap(~ym, scales="free", ncol = 2) +
coord_flip()
bm_c1907
#bm_cloud <- bm_comment %>%
# select(word, ym) %>%
# group_by(word) %>%
# filter(ym == "2018/12" | ym == "2019/01" | ym == "2019/06" | ym == "2019/07") %>%
# summarize(count = n()) %>%
# filter(count > 10) %>%
# wordcloud2()
#bm_cloud
bm_comment <- bm_comment %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
bm_c_count <- bm_comment %>% select(sentiment, word, artDate) %>%
group_by(word) %>%
summarise(count = n()) #加word的詞頻
bm_comment <- bm_comment %>% left_join(bm_c_count) #與s_comment合併
## Joining, by = "word"
bm_comment <- bm_comment %>% mutate(year = format(artDate, format = "%Y"))
bm_comment %>%
filter(year == "2016" | year == "2017" | year == "2018" | year == "2019" | year == "2020" ) %>%
ggplot() +
geom_line(aes(x = artDate, y = count, colour = sentiment)) +
scale_x_date(labels =date_format("%m/%d")) +
facet_wrap(~year, scales="free", ncol = 2)
#找出每篇貼文的total
bm_c_total <- bm_comment %>%
group_by(artUrl) %>%
summarize(total = n())
#bm_comment與total合併
bm_c_tfidf <- bm_comment %>%
select(word, artUrl, count) %>%
left_join(bm_c_total)
## Joining, by = "artUrl"
#製作tf-idf
bm_c_tfidf <- bm_c_tfidf %>%
bind_tf_idf(word, count, total) %>%
distinct()
## Warning in bind_tf_idf.data.frame(., word, count, total): A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
#找出前十tf-idf的詞
t <- bm_c_tfidf %>% group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(artUrl)) %>%
ungroup() %>%
count(word, sort=TRUE)
## Selecting by tf_idf
bm_c_cor <- bm_comment %>%
group_by(word) %>%
filter(n() > 3) %>%
pairwise_cor(word, artUrl, sort = T)
# 顯示相關性大於0.55的組合
set.seed(2020)
bm_c_cor %>%
filter(correlation > 0.55) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
theme(text = element_text(family = "Heiti TC Light"))+
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
bm_c_cor %>%
filter(item1 %in% c("支持", "絕望", "驚訝")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
theme(text = element_text(family = "Heiti TC Light"))+
facet_wrap(~ item1, scales = "free") +
coord_flip()
## Selecting by correlation
噁心跟支持都大量出現在7/11 @政治時事
怪奇物語: 文章數-日期
s_date <- s_post %>%
select(artDate, artUrl) %>%
distinct()
s_article_count_by_date <- s_date %>%
group_by(artDate) %>%
summarise(count = n())
怪奇物語: 文章數-月份
s_post <- s_post %>% mutate(ym = format(s_post$artDate, format = "%Y/%m")) #抓出年/月份
s_months <- s_post %>%
select(artUrl, ym) %>%
distinct()
s_article_count_by_month <- s_months %>%
group_by(ym) %>%
summarise(count = n())
s_months_plot <- s_months %>%
ggplot(aes(x = ym)) +
geom_bar(color = "pink", size = 0.5) +
ggtitle("Netflix討論文章數(月份)") +
geom_text(aes(label=..count..), stat="count", color = "black", vjust = -0.5, size = 3) +
xlab("y/m") +
theme(text = element_text(family = "Heiti TC Light")) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
s_months_plot
我們可以看到有三個貼文數數量最多的年分與月份落在2017/11、2019/07、2019/08 分別是11篇、24篇、8篇,相較於其他月份,這三月明顯高出許多,推測可能跟第二季、第三季發布時間有關 怪奇物語於2016年7月15日,第1季全8集故事在netflix上發布,2017年10月27日,netflix發布了第2季全季9集,第3季(共8集)則於於2019年7月4日)整季上線
#怪奇物語
s_tokens_by_month <- s_post %>%
count(ym, word, sort = T)
#(最多文章月份)的詞頻
s_1907 <- s_tokens_by_month %>%
filter(ym == "2019/07" |ym == "2017/11" | ym == "2019/08" | ym == "2017/10") %>%
group_by(ym) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ym, scales="free", ncol = 2) +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
s_1907
怪奇物語-留言分析
#怪奇物語
s_comment <- s_comment %>%
mutate(ym = format(artDate, format = "%Y/%m"))
s_ctokens_by_month <- s_comment %>%
count(ym, word, sort = T)
#怪奇物語
s_cmonths <- s_comment %>%
select(ym, artUrl)
s_cmonths_plot <- s_cmonths %>%
ggplot(aes(x = ym)) +
geom_bar(color = "pink", size = 0.5) +
ggtitle("怪奇物語討論文章數(月份)") +
geom_text(aes(label=..count..), stat="count", color = "black", vjust = -0.5, size = 3) +
xlab("y/m") +
theme(text = element_text(family = "Heiti TC Light")) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
s_cmonths_plot
我們發現留言的部分與貼文類似,在第二季、第三季發布日前後達到高峰
#怪奇物語
#(最多文章月份)的詞頻
s_ctokens_by_month <- s_comment %>%
count(word, ym, sort = T)
s_c1907 <- s_ctokens_by_month %>%
filter(ym == "2019/07" | ym == "2017/10" | ym == "2017/11" | ym == "2019/08") %>%
group_by(ym) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ym, scales="free", ncol = 2) +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
s_c1907
在2017/11中出現年代的詞彙,因為怪奇物語的年代是80年代,影集中也出現許多勾起回憶的場景,像是大型機台、音樂、服裝等等的東西
#怪奇物語
#s_ccloud <- s_comment %>%
# select(word, ym) %>%
# group_by(word) %>%
# filter(ym == "2019/07" | ym == "2017/10" | ym == "2017/11" | ym == "2019/08") %>%
# summarize(count = n()) %>%
# filter(count > 1) %>%
# wordcloud2()
#s_ccloud
#怪奇物語
s_comment <- s_comment %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
s_c_count <- s_comment %>% select(sentiment, word, artDate) %>%
group_by(word) %>%
summarise(count = n()) #加word的詞頻
s_comment <- s_comment %>% left_join(s_c_count) #與s_comment合併
## Joining, by = "word"
s_comment <- s_comment %>% mutate(year = format(artDate, format = "%Y"))
s_comment %>%
ggplot() +
geom_line(aes(x = artDate, y = count, colour = sentiment)) +
scale_x_date(labels =date_format("%Y/%m/%d")) +
facet_wrap(~year, scales="free", ncol = 2)
#怪奇物語
#找出每篇貼文的total
s_c_total <- s_comment %>%
group_by(artUrl) %>%
summarize(total = n())
#s_comment與total合併
s_c_tfidf <- s_comment %>%
select(word, artUrl, count) %>%
left_join(s_c_total)
## Joining, by = "artUrl"
#製作tf-idf
s_c_tfidf <- s_c_tfidf %>%
bind_tf_idf(word, count, total)
## Warning in bind_tf_idf.data.frame(., word, count, total): A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
#找出前十tf-idf的詞
t1 <- s_c_tfidf %>% group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(artUrl)) %>%
ungroup() %>%
count(word, sort=TRUE)
## Selecting by tf_idf
#怪奇物語
s_c_cor <- s_comment %>%
group_by(word) %>%
filter(n() > 3) %>%
pairwise_cor(word, artUrl, sort = T)
# 顯示相關性大於0.45的組合
set.seed(2020)
#怪奇物語
s_c_cor %>%
filter(correlation > 0.45) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + #加入中文字型設定,避免中文字顯示錯誤。
theme_void()
#怪奇物語
s_c_cor %>%
filter(item1 %in% c("成功","愛情","可悲")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。
## Selecting by correlation
愛情:在討論劇中小孩間的愛情,有人覺得多餘、討厭,有人覺得很棒、好笑
成功:討論怪奇物語成功的關鍵
可悲:是因為有些鄉民言論過激,且沒有邏輯的批評,引發論戰。有幾個鄉民討論的點有愛情、劇中媽媽會不會很自私等等
結論:我們發現Netflix相對於其他線上影音平台,在ptt上討論度高出非常多,因此推論出Netflix是大家看劇的首選NO.1,而在PTT上聲量較高的影集,風格也不盡相同,如果喜歡回味80年代復古味道可以看看「怪奇物語」,想要體驗互動劇情的話就可以看「黑鏡」。