Data sets
(1) PTT: Gossiping (八卦) and HatePolitics (政黑) boards
Keywords: 「阿滴」, 「都省瑞」, 「Ray Du」, 「紐時」, 「紐約時報」
Period: 2019-04-03 ~ 2020-04-17
(2) Dcard: Time (時事) and Wuhan pneumonia (武漢肺炎) boards
Keywords: 「阿滴」, 「都省瑞」, 「Ray Du」, 「紐時」, 「紐約時報」
Period: 2020-04-09 ~ 2020-04-18
As an internet celebrity with over 2.5 million subscribers, 阿滴 (Ray Du) has real influence on Taiwanese society, and his every move is scrutinized by the public, from the video encouraging young people to vote during the election season to the recent New York Times ad. This study uses text mining to analyze netizens' opinions of 阿滴 and interprets the results against current events.
## [1] ""
ad <- fread("阿滴八卦版ptt.csv", encoding = "UTF-8", header = TRUE)
ad1 <- fread("阿滴政黑板.csv", encoding = "UTF-8", header = TRUE)
Gossiping = fread("阿滴八卦版ptt.csv", header = T, encoding = "UTF-8") %>%
mutate(sentence = gsub("\n", "。", sentence))
Gossiping_comment = fread("阿滴八卦版ptt留言.csv", header = T, encoding = "UTF-8") %>%
mutate(commentContent = gsub(":", "", commentContent))
HatePolitics = fread("阿滴政黑板.csv", header = T, encoding = "UTF-8") %>%
mutate(sentence = gsub("\n", "。", sentence))
HatePolitics_comment = fread("阿滴政黑板留言.csv", header = T, encoding = "UTF-8") %>%
mutate(commentContent = gsub(":", "", commentContent))
# Combine the Gossiping and HatePolitics boards
ptt<-rbind(Gossiping, HatePolitics)
ptt_comment<-rbind(Gossiping_comment, HatePolitics_comment)
ptt$artDate= ptt$artDate %>% as.Date("%Y/%m/%d")
ptt$sentence <- gsub("https://[a-zA-Z0-9./_]+", "", ptt$sentence) # remove URLs from article text
dcard1 <- read_csv("dcard_time.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## sentence = col_character()
## )
dcard1$category = 'Time'
dcard2 <- read_csv("dcard_wuhan.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## sentence = col_character()
## )
dcard2$category = 'Wuhan'
dcard<-rbind(dcard1, dcard2)
dcard$artDate= dcard$artDate %>% as.Date("%Y/%m/%d")
dcard$sentence <- gsub("https://[a-zA-Z0-9./_]+", "", dcard$sentence) # remove URLs from article text
RD = ptt %>% group_by(artDate) %>% count()
plot(RD$artDate,RD$n,type='l',xlab = "",
ylab = "Number of articles",col='orange')
axis.Date(side=1,at=RD$artDate,format='%Y-%m-%d',labels=T,las=1) # add x-axis date labels
axis(side=2,las=2) # add y-axis labels (tick positions are counts, not dates)
abline(v=as.Date('2020-04-12'),col='gray',lty=3)
We can see that PTT users normally paid little attention to 阿滴; discussion volume peaked after 2020/4/11, clearly because of the NYT ad event.
# Next, check whether comments in these days were mostly 推 (push), 噓 (boo), or neutral (→)
ptt_comment %>%
filter(artDate > ymd("2020-04-11") & artDate < ymd("2020-04-15")) %>%
count(commentStatus)
## # A tibble: 3 x 2
## commentStatus n
## <chr> <int>
## 1 → 4560
## 2 推 6678
## 3 噓 2064
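Turning the counts into shares makes the split clearer (a quick sketch; 推 works out to roughly half, → to about a third, and 噓 to about 16%):
ptt_comment %>%
  filter(artDate > ymd("2020-04-11") & artDate < ymd("2020-04-15")) %>%
  count(commentStatus) %>%
  mutate(share = n / sum(n)) # share of each comment type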
end <- as.Date("2020-04-10")
RD = ptt %>% filter( artDate < end) %>% group_by(artDate) %>% count()
plot(RD$artDate,RD$n,type='l',xlab = "",
ylab = "Number of articles",col='orange')
abline(v=as.Date(c('2019-12-27')),col='gray',lty=3)
Before the NYT ad event, 阿滴 drew only about 1-4 posts per month. The December rise came from 阿滴 rallying other YouTubers to film the video urging young people to vote; below we use TF-IDF to examine what drove the other spikes.
RD = ptt
RD$artDate = format(RD$artDate,"%Y/%m")
RD = RD %>% group_by(artDate) %>% count()
ggplot(data=RD) +
# Draw the bar plot first
geom_bar(aes(x=factor(1),
y=n,
fill=artDate),
stat = "identity"
) +
# Then convert it into a pie chart by wrapping the y axis into polar coordinates
coord_polar("y", start=0)
Articles from April alone account for more than 75% of the total, showing that the NYT ad gave 阿滴 very high exposure.
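The claim can be checked directly from the monthly counts (a minimal sketch using the RD table built above):
RD %>%
  ungroup() %>%
  mutate(share = n / sum(n)) %>% # each month's share of all articles
  filter(artDate == "2020/04")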
# Initialize a tokenization engine with a user dictionary and stop-word list
jieba_tokenizer <- worker( user="user_dict.txt", stop_word = "stop_words.txt")
g_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1] # drop single-character tokens
return(tokens)
})
}
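A quick sanity check of the tokenizer (the input sentence is hypothetical; the exact tokens depend on user_dict.txt and stop_words.txt):
g_tokenizer("阿滴在紐約時報刊登廣告")
# expected: a list containing tokens such as "阿滴", "紐約時報", "刊登", "廣告"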
We want to see what dominated the discussion of 阿滴 at different points in time, so we group the data by time period.
hate <- HatePolitics
re_hate <- HatePolitics_comment
ptt_8 <- Gossiping
re_ptt_8 <- Gossiping_comment
body<-rbind(ptt_8,hate)
re<-rbind(re_ptt_8,re_hate)
# For each article, concatenate its comments into one string and keep it alongside the body
data<-as.data.frame(matrix(NA,nrow(body),3))
colnames(data)<-c("title","body","respon")
for(i in 1:nrow(data)){
tmp_title<-(body$artTitle)[i]
id<-which(re$artTitle==tmp_title) # comments belonging to this article
try(tmp_data<-re[id,], silent = T)
try(tmp_re_one<-paste2(tmp_data$commentContent, sep = " "), silent = T) # qdap::paste2 concatenates the comments
body$sentence[i]<-str_replace_all(body$sentence[i],"[[:punct:]]","") # strip punctuation
try(tmp_re_one<-str_replace_all(tmp_re_one,"[[:punct:]]",""), silent = T)
data[i,]<-c(tmp_title,body$sentence[i],tmp_re_one)
}
data$body<-paste(data$body,data$respon) # merge body and comments into one document
data<-data[-c(1:4),] # drop the first four rows
time<-ymd(body$artDate)[-c(1:4)]
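The same merge can be written without an explicit loop. A loop-free sketch (names are hypothetical; assumes one row per article in body, with coalesce() guarding articles that have no comments):
comments_merged <- re %>%
  group_by(artTitle) %>%
  summarise(respon = paste(commentContent, collapse = " "))
data_alt <- body %>%
  left_join(comments_merged, by = "artTitle") %>%
  mutate(body = paste(str_replace_all(sentence, "[[:punct:]]", ""),
                      str_replace_all(coalesce(respon, ""), "[[:punct:]]", "")))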
#
# Randomly sample 10 articles to spot-check the tokenization
jieba_tokenizer = worker(stop_word ="stop_words.txt")
set.seed(20200419)
(id<-sample(nrow(data),10))
## [1] 349 18 362 260 159 85 218 22 43 325
new_user_word(jieba_tokenizer,c("阿滴","譚德塞","病例","美國人","can","help"
,"who","落落長","taiwan","翻譯","台灣","文法","批評"))
## [1] TRUE
##
# Build a per-document term-frequency table (lowercased, numbers removed)
tmp<-as.data.frame(table(segment(removeNumbers(tolower(data$body[1])), jieba_tokenizer)))
tmp$id=1 # document id of the first article
tf<-tmp
for(i in 2:nrow(data)){
tmp<-as.data.frame(table(segment(removeNumbers(tolower(data$body[i])), jieba_tokenizer)))
tmp$id=i
tf<-rbind(tf,tmp)
}
library(tidytext)
dtm<-as.matrix(cast_sparse(tf,id, Var1, Freq)) # document-term matrix
#
time<-(month(time))
mid<-dtm[which(time%in%c(5,6,7,8,9,10)),] # 2019 May-Oct group
end<-dtm[which(time%in%c(11,12)),] # 2019 Nov-Dec group
begin<-dtm[which(time%in%c(1,2,3)),] # 2020 Jan-Mar group
m4<-dtm[which(time%in%c(4)),] # 2020 Apr group
mid<-apply(mid, 2, sum) # total term counts per period
end<-apply(end, 2, sum)
begin<-apply(begin, 2, sum)
m4<-apply(m4, 2, sum)
dtm2<-rbind(mid,end,begin,m4) # 4 periods x terms count matrix
tf<-dtm2
tf[,]<-NA
for(i in 1:4){
tf[i,]<-dtm2[i,]/sum(dtm2[i,]) # term frequency within each period
}
idf<-c()
for(i in 1:ncol(tf)){
idf[i]<-log10((4/sum(dtm2[,i]!=0))) # idf: log10(#periods / #periods containing the term)
}
tfidf<-dtm2
tfidf[,]<-NA
for(i in 1:ncol(tfidf)){
tfidf[,i]<-tf[,i]*idf[i]
}
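For reference, the same statistic can be computed with tidytext::bind_tf_idf from a tidy period-term table. A sketch (bind_tf_idf uses the natural log for idf rather than log10, so absolute values differ by a constant factor but the rankings agree):
period_counts <- as.data.frame(as.table(dtm2)) # one row per (period, term)
colnames(period_counts) <- c("period", "word", "n")
period_tfidf <- period_counts %>%
  filter(n > 0) %>% # keep only terms that actually occur in the period
  bind_tf_idf(word, period, n) %>%
  arrange(desc(tf_idf))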
n=10
p=data.frame(
word=names(tfidf[1,order(tfidf[1,], decreasing =T)][1:n]),
tfidf=tfidf[1,order(tfidf[1,], decreasing =T)][1:n]
)
p$time=1
for(i in 2:4){
tmp=data.frame(
word=names(tfidf[i,order(tfidf[i,], decreasing =T)][1:n]),
tfidf=tfidf[i,order(tfidf[i,], decreasing =T)][1:n]
)
tmp$time=i
p= rbind(p,tmp)
}
p$time=c(rep("mid-2019",n),rep("late 2019",n),rep("2020 Jan-Mar",n),rep("2020 Apr",n))
ggplot(p,aes(word, tfidf, fill =time)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tfidf") +
facet_wrap(~time, ncol = 2, scales = "free") +
coord_flip()
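One caveat: word is a single factor shared across facets, so bars within each panel are not sorted by tf-idf. tidytext's reorder_within/scale_x_reordered handle per-facet ordering; a sketch:
ggplot(p,aes(reorder_within(word, tfidf, time), tfidf, fill =time)) +
geom_col(show.legend = FALSE) +
scale_x_reordered() +
labs(x = NULL, y = "tfidf") +
facet_wrap(~time, ncol = 2, scales = "free") +
coord_flip()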
From this chart we can see what dominated PTT discussion of 阿滴 in each period:
1. Mid-2019: the presidential plane tour (https://www.youtube.com/watch?v=1YcBfJMGxqM)
2. Late 2019: rallying YouTubers for the voting video (https://www.youtube.com/watch?v=jGppF61TXsw) and the reported labor-law violation (https://star.ettoday.net/news/1600009)
3. Jan-Mar 2020: election-related discussion (https://www.youtube.com/watch?v=aZ5znkF_Umw)
4. April 2020: the NYT ad event (https://www.storm.mg/lifestyle/2512106)
Gossiping$artDate= Gossiping$artDate %>% as.Date("%Y/%m/%d")
dcard$artDate = dcard$artDate %>% as.Date("%Y/%m/%d")
ad_tokens <- Gossiping %>%
unnest_tokens(word, sentence, token=g_tokenizer) %>%
select(-artTime, -artUrl)
ad_tokens_count <- ad_tokens %>%
filter(!(word %in% c("阿滴", "http","com","jpg","imgur"))) %>%
group_by(artTitle,artDate,word) %>%
summarise(count = n()) %>%
arrange(desc(count))
ad_dcard_tokens <- dcard %>%
unnest_tokens(word, sentence, token=g_tokenizer) %>%
select(-artTime, -artUrl)
ad_dcard_tokens_count <- ad_dcard_tokens %>%
filter(!(word %in% c("阿滴", "http","com","jpg","imgur"))) %>%
group_by(artTitle,artDate,word) %>%
summarise(count = n()) %>%
arrange(desc(count))
ad_tokens_count
## # A tibble: 20,211 x 4
## # Groups: artTitle, artDate [280]
## artTitle artDate word count
## <chr> <date> <chr> <int>
## 1 Re:[新聞]阿滴曝光「台灣給世界的公開信」中英文 2020-04-11 台灣 82
## 2 [新聞]反擊譚德塞募資破千萬!阿滴刊《紐時》 2020-04-11 the 47
## 3 [新聞]阿滴曝光「台灣給世界的公開信」中英文 2020-04-11 the 47
## 4 [新聞]反擊譚德塞募資破千萬!阿滴刊《紐時》 2020-04-11 台灣 35
## 5 [新聞]阿滴開箱總統專機曝光空軍一號內裝 2019-07-13 總統 35
## 6 [新聞]阿滴曝光「台灣給世界的公開信」中英文 2020-04-11 台灣 34
## 7 Re:[問卦]認真?阿滴到底憑啥覺得他能代表台灣? 2020-04-11 台灣 34
## 8 [新聞]與阿滴英文打擂台?蔡英文今晚推Youtube 2019-08-18 英文 28
## 9 [新聞]反擊譚德塞募資破千萬!阿滴刊《紐時》 2020-04-11 and 24
## 10 [新聞]反擊譚德塞募資破千萬!阿滴刊《紐時》 2020-04-11 of 24
## # ... with 20,201 more rows
# Build the sentiment dictionary
P <- read_file("positive.txt")
N <- read_file("negative.txt")
# Split each string on ","; strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
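The repeated "coercing into character vector" warnings in the joins below come from data.frame() storing word as a factor (the default before R 4.0). A minimal tweak to the two data.frame() calls above would silence them:
P = data.frame(word = P, sentiment = "positive", stringsAsFactors = FALSE)
N = data.frame(word = N, sentiment = "negative", stringsAsFactors = FALSE)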
ad_sentiment_count <- ad_tokens_count %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
mutate(count=sum(count))
## Adding missing grouping variables: `artTitle`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
g=ad_sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,col=sentiment))+
scale_x_date(labels = date_format("%Y/%m/%d"))
p=ggplotly(g)
a1 <- list(x = as.numeric(ymd("2020-04-09")),y = 1,
text = "NYT ad event",ax = -80,ay = -100)
p%>%
layout(annotations = a1)
ad_sentiment_count <- ad_tokens_count %>%
select(artDate,word,count) %>%
group_by(artDate) %>%
mutate(day_count = sum(count)) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
mutate(count=sum(count)) %>%
mutate(day_mean = count / day_count)
## Adding missing grouping variables: `artTitle`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
g=ad_sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=day_mean,col=sentiment))+
scale_x_date(labels = date_format("%Y/%m/%d"))
p=ggplotly(g)
a1 <- list(x = as.numeric(ymd("2019-07-14")),y = 0.105,
text = "Presidential plane tour",ax = -40,ay = -50)
a2 <- list(x = as.numeric(ymd("2019-08-14")),y = 0.1333,
text = "Labor-law violation report",ax = -20,ay = -50)
a3 <- list(x = as.numeric(ymd("2019-12-26")),y = 0.1333,
text = "Voting-encouragement video released",ax = -20,ay = -0.05)
p%>%
layout(annotations = a1)%>%
layout(annotations = a2)%>%
layout(annotations = a3)
#Word Correlation
jieba_tokenizer = worker()
jieba_tokenizer1 = function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1] # drop single-character tokens
return(tokens)
}
return(character(0)) # return an empty vector (rather than NULL) for one-character inputs
})
}
# Tokenize the articles with jieba_tokenizer1
ptt_tokens <- ptt %>%
unnest_tokens(word, sentence, token = jieba_tokenizer1) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>% # drop tokens containing letters or digits
filter(!word %in% stop_words$word)
ptt_word_pair <- ptt_tokens %>%
pairwise_count(word, artUrl, sort = TRUE)
ptt_word_pair
## # A tibble: 2,132,932 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 台灣 阿滴 190
## 2 阿滴 台灣 190
## 3 英文 阿滴 143
## 4 阿滴 英文 143
## 5 什麼 阿滴 133
## 6 阿滴 什麼 133
## 7 可以 阿滴 128
## 8 阿滴 可以 128
## 9 就是 阿滴 124
## 10 阿滴 就是 124
## # ... with 2,132,922 more rows
ptt_word_cors <- ptt_tokens %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
ptt_word_cors
## # A tibble: 73,170 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 新聞標題 完整 0.929
## 2 完整 新聞標題 0.929
## 3 完整 署名 0.913
## 4 署名 完整 0.913
## 5 連結 新聞標題 0.908
## 6 網址 新聞標題 0.908
## 7 新聞標題 連結 0.908
## 8 新聞標題 網址 0.908
## 9 備註 新聞標題 0.901
## 10 新聞標題 備註 0.901
## # ... with 73,160 more rows
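pairwise_cor() computes the phi coefficient, i.e. the Pearson correlation between binary word-presence indicators across articles. A manual spot check on the top pair (a sketch; it should reproduce the 0.929 above):
presence <- ptt_tokens %>%
  distinct(artUrl, word) %>%
  mutate(present = 1) %>%
  spread(word, present, fill = 0) # one column per word, 1/0 presence per article
cor(presence$`新聞標題`, presence$`完整`)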
set.seed(2019)
ptt_word_cors %>%
filter(correlation > .5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so CJK labels render correctly
theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
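"Heiti TC Light" is a macOS font, and this document was knitted on Windows, hence the warnings. A possible workaround (assuming Microsoft JhengHei is installed) is to register the family name before plotting:
windowsFonts(`Heiti TC Light` = windowsFont("Microsoft JhengHei")) # Windows only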
Hm_tokens <- HatePolitics_comment %>%
unnest_tokens(word, commentContent, token = g_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>% # drop tokens containing letters or digits
filter(!word %in% stop_words$word)
Hm_word_pair <- Hm_tokens %>%
pairwise_count(word, artUrl, sort = TRUE)
Hm_word_cors <- Hm_tokens %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
set.seed(2019)
Hm_word_cors %>%
filter(correlation > .4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
tokens <- dcard %>% unnest_tokens(word, artTitle, token=g_tokenizer)
articles <- dcard %>% unnest_tokens(word, sentence, token=g_tokenizer)
## Word frequency in article content
articles_count <- articles %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
head(articles_count, 15)
## # A tibble: 15 x 2
## word sum
## <chr> <int>
## 1 台灣 216
## 2 我們 122
## 3 廣告 105
## 4 紐約時報 58
## 5 美國 57
## 6 who 56
## 7 阿滴 53
## 8 世界 49
## 9 taiwan 44
## 10 看到 43
## 11 the 40
## 12 一個 39
## 13 可以 39
## 14 集資 38
## 15 募資 36
articles_count[1:15,] %>%
arrange(sum) %>%
{barplot(.$sum, names=.$word,horiz=T,las=1,col=rainbow(15))}
## Word cloud of article content
articles_count %>% wordcloud2()
data_count_by_art <- articles %>%
group_by(word,artTitle, artDate, artTime, artUrl) %>%
count(word, sort = TRUE) %>%
ungroup()
data_count_by_art$artDate= data_count_by_art$artDate %>% as.Date("%Y/%m/%d")
word_count <- data_count_by_art %>%
select(word,n) %>%
group_by(word) %>%
summarise(count = sum(n)) %>%
filter(count>3) %>% # filter out words appearing too few times
arrange(desc(count))
word_count
## # A tibble: 516 x 2
## word count
## <chr> <int>
## 1 台灣 216
## 2 我們 122
## 3 廣告 105
## 4 紐約時報 58
## 5 美國 57
## 6 who 56
## 7 阿滴 53
## 8 世界 49
## 9 taiwan 44
## 10 看到 43
## # ... with 506 more rows
## Compare positive and negative word usage
word_count %>% inner_join(LIWC) %>%
arrange(desc(abs(count))) %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14,family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
From this we can see that Dcard articles leaned positive about 阿滴's purchase of the NYT ad.
RD = dcard %>% group_by(artDate) %>% count()
plot(RD$artDate,RD$n,type='l',xlab = "",
ylab = "Number of articles",col='orange',xlim=as.Date(c('2020-04-01','2020-04-21')),ylim=c(0,30))
axis.Date(side=1,at=RD$artDate,format='%Y-%m-%d',labels=T,las=1) # add x-axis date labels
axis(side=2,las=2) # add y-axis labels (tick positions are counts, not dates)
abline(v=as.Date('2020-04-11'),col='gray',lty=3)
#PTT
gossip <- read_csv("阿滴八卦版ptt.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
gossip$category = 'gossip'
hate <- read_csv("阿滴政黑板.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
hate$category = 'hate'
ptt<-rbind(gossip, hate)
ptt$artDate= ptt$artDate %>% as.Date("%Y/%m/%d")
ptt$sentence <- gsub("https://[a-zA-Z0-9./_]+", "", ptt$sentence) # remove URLs from article text
start <- as.Date("2020-04-09")
NewYork = ptt %>% filter( artDate> start)
# Tokenize
NewYork_tokens <- NewYork %>% unnest_tokens(word, artTitle, token=g_tokenizer)
NewYork_articles <- NewYork %>% unnest_tokens(word, sentence, token=g_tokenizer)
# Word frequency in article content
data_count_by_art <- NewYork_articles %>%
group_by(word,artTitle, artDate, artTime, artUrl) %>%
count(word, sort = TRUE) %>%
ungroup()
data_count_by_art$artDate= data_count_by_art$artDate %>% as.Date("%Y/%m/%d")
word_count <- data_count_by_art %>%
select(word,n) %>%
group_by(word) %>%
summarise(count = sum(n)) %>%
filter(count>3) %>% # filter out words appearing too few times
arrange(desc(count))
# Top 30 positive vs. negative words
word_count %>% inner_join(LIWC) %>%
arrange(desc(abs(count))) %>%
mutate(word = reorder(word,count)) %>%
top_n(30,wt = count) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14,family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
frequency <- NewYork_articles %>%
#mutate(word = str_extract(word, "[a-z']+")) %>% # works like grep
count(category, word) %>%
filter(nchar(.$word)>1) %>%
group_by(category) %>%
mutate(proportion = n / sum(n)) %>% # compute each word's proportion
select(-n) %>%
spread(category, proportion) %>%
gather(category, proportion, `gossip`)
ggplot(frequency, aes(x = proportion, y = `hate`, color = abs(`hate` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~category, ncol = 2) +
theme(legend.position="none") +
labs(y = "HatePolitics (政黑版)", x = "Gossiping (八卦版)")
## Warning: Removed 6465 rows containing missing values (geom_point).
## Warning: Removed 6465 rows containing missing values (geom_text).
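A correlation test puts a number on how similar the two boards' vocabularies are (a sketch following the standard tidytext approach; the same check applies to the PTT-vs-Dcard comparison at the end):
cor.test(~ proportion + hate, data = frequency)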
We can see that the HatePolitics board steered the NYT ad discussion toward framing it as a DPP political-manipulation tactic.
NewYork_articles2 = NewYork_articles
NewYork_articles2$category = 'ptt'
NewYork_articles2 = NewYork_articles2 %>%select(artTitle,artDate,artTime,artUrl,category,word)
Dcard_articles <- dcard %>% unnest_tokens(word, sentence, token=g_tokenizer)
Dcard_articles$category = 'Dcard'
All = rbind(NewYork_articles2,Dcard_articles)
All
## # A tibble: 39,554 x 6
## artTitle artDate artTime artUrl category word
## <chr> <date> <time> <chr> <chr> <chr>
## 1 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 1.
## 2 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 媒體
## 3 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 來源
## 4 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt etto~
## 5 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 2.
## 6 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 記者
## 7 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 署名
## 8 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 張筱涵~
## 9 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 3.
## 10 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt 完整
## # ... with 39,544 more rows
f <- as.Date("2020-01-07")
sentiment_count = All %>% filter(artDate != f) %>% inner_join(LIWC) %>%
group_by(category,artDate,sentiment) %>%
summarise(count=n()) %>%
spread(.,key = sentiment,value = count) %>%
mutate(negative = -negative)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
ggplot(sentiment_count,aes(x=artDate,y=count))+
geom_segment( aes(x=artDate, xend=artDate, y=`positive`, yend=`negative`),size=1,color="grey50") +
geom_point( aes(x=artDate, y=`positive`), color="#DE7E73", size=2 ) + # red = positive sentiment
geom_point( aes(x=artDate, y=`negative`), color="#84B1ED", size=2 ) + # blue = negative sentiment
coord_flip()+
facet_wrap(~ category, scales = "free")
All %>% filter(artDate != f) %>% inner_join(LIWC) %>%
group_by(category,artDate,sentiment) %>%
summarise(count=n()) %>%
#spread(sentiment, count, fill = 0) %>%
ggplot() +
geom_line(aes(x = artDate, y = count, colour = sentiment), size = 0.8) +
facet_wrap(~ category, scales = "free")
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Judging from these two charts, PTT discussion was split roughly evenly between positive and negative, while Dcard users appear friendlier toward 阿滴's ad purchase; in particular, after 阿滴 released the apology video on 4/12, sentiment leaned positive.
frequency <- All %>%
#mutate(word = str_extract(word, "[a-z']+")) %>% # works like grep
count(category, word) %>%
filter(nchar(.$word)>1) %>%
group_by(category) %>%
mutate(proportion = n / sum(n)) %>% # compute each word's proportion
select(-n) %>%
spread(category, proportion) %>%
gather(category, proportion, `Dcard`)
ggplot(frequency, aes(x = proportion, y = `ptt`, color = abs(`ptt` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~category, ncol = 2) +
theme(legend.position="none") +
labs(y = "ptt", x = "Dcard")
## Warning: Removed 6997 rows containing missing values (geom_point).
## Warning: Removed 6997 rows containing missing values (geom_text).
This comparison lines up with the sentiment scores above: PTT's word choices lean more negative (e.g. "風向", "道歉"), while Dcard's are neutral to positive (e.g. "taiwancanhelp", "防疫", "參與").
Both boards mention 台灣, 反擊, and 廣告, with 台灣 appearing at a very high rate.
阿滴's online volume comes mainly from controversies, most notably the New York Times ad event of April 2020. The TF-IDF results show the same pattern: each period's distinctive words trace back to that period's controversy. Sentiment analysis indicates that positive and negative evaluations are roughly balanced overall, diverging noticeably only around special events. Finally, comparing PTT and Dcard shows that Dcard is more positive toward 阿滴 than PTT, with clearly different word choices.