Data sources
(1) PTT: the Gossiping (八卦) and HatePolitics (政黑) boards
Keywords: 「阿滴」、「都省瑞」、「Ray Du」、「紐時」、「紐約時報」
Time range: 2019-04-03 ~ 2020-04-17
(2) Dcard: the Trending (時事) and Wuhan pneumonia (武漢肺炎) boards
Keywords: 「阿滴」、「都省瑞」、「Ray Du」、「紐時」、「紐約時報」
Time range: 2020-04-09 ~ 2020-04-18

Motivation and analysis goals:

As an internet celebrity with more than 2.5 million subscribers, 阿滴 (Ray Du) has a real influence on Taiwanese society, and his every move is examined under a magnifying glass by the public. From the video encouraging young people to vote during the election period to the recent New York Times ad event, his influence in Taiwan is evident. In this study we use text-mining techniques to analyze netizens' opinions and thoughts about 阿滴, and interpret the results in the context of current events.

System parameter settings

## [1] ""

Install and load the required packages
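The package-loading chunk is also hidden; judging from the functions called throughout the analysis, roughly the following packages are needed (a sketch inferred from the code below, not the author's exact setup):

library(data.table)  # fread
library(readr)       # read_csv, read_file
library(dplyr)
library(tidyr)       # spread, gather
library(stringr)     # str_replace_all, str_detect
library(lubridate)   # ymd, month
library(jiebaR)      # worker, segment, new_user_word
library(tidytext)    # unnest_tokens, cast_sparse, stop_words
library(widyr)       # pairwise_count, pairwise_cor
library(igraph)      # graph_from_data_frame
library(ggraph)
library(ggplot2)
library(scales)      # date_format, percent_format
library(plotly)      # ggplotly
library(wordcloud2)
library(tm)          # removeNumbers
library(qdap)        # paste2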

Read the data

ad <- fread("阿滴八卦版ptt.csv", encoding = "UTF-8", header = TRUE)
ad1 <- fread("阿滴政黑板.csv", encoding = "UTF-8", header = TRUE)
Gossiping = fread("阿滴八卦版ptt.csv", header = T, encoding = "UTF-8") %>%
              mutate(sentence = gsub("\n", "。", sentence))
Gossiping_comment = fread("阿滴八卦版ptt留言.csv", header = T, encoding = "UTF-8") %>%
              mutate(commentContent = gsub(":", "", commentContent))
HatePolitics = fread("阿滴政黑板.csv", header = T, encoding = "UTF-8") %>%
              mutate(sentence = gsub("\n", "。", sentence))
HatePolitics_comment = fread("阿滴政黑板留言.csv", header = T, encoding = "UTF-8") %>%
              mutate(commentContent = gsub(":", "", commentContent))

#Combine the Gossiping and HatePolitics boards
ptt<-rbind(Gossiping, HatePolitics)
ptt_comment<-rbind(Gossiping_comment, HatePolitics_comment)

ptt$artDate= ptt$artDate %>% as.Date("%Y/%m/%d")
ptt$sentence <- gsub(c("https://[a-zA-Z0-9./_]+"), "", ptt$sentence) # remove URLs from article text
dcard1 <- read_csv("dcard_time.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   sentence = col_character()
## )
dcard1$category = 'Time' 
dcard2 <- read_csv("dcard_wuhan.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   sentence = col_character()
## )
dcard2$category = 'Wuhan'
dcard<-rbind(dcard1, dcard2)
dcard$artDate= dcard$artDate %>% as.Date("%Y/%m/%d")
dcard$sentence <- gsub(c("https://[a-zA-Z0-9./_]+"), "", dcard$sentence) # remove URLs from article text

PTT discussion volume chart - all posts

RD = ptt %>% group_by(artDate) %>% count() 
plot(RD$artDate,RD$n,type='l',xlab = "",
  ylab = "文章數",col='orange')
axis.Date(side=1,at=RD$artDate,format='%Y-%m-%d',labels=T,las=1) # x-axis date labels in "YYYY-MM-DD" format
axis(side=2,las=2) # y-axis labels
abline(v=as.Date(c('2020-04-12')),col='gray',lty=3)


We can see that PTT users normally did not discuss 阿滴 much; the volume peaks after 2020/4/11, clearly because of the New York Times ad event.

#Next, check whether the comments during those days are mostly 推 (upvotes), 噓 (downvotes), or neutral (→)

ptt_comment %>% 
  filter(artDate > ymd("2020-04-11") &  artDate < ymd("2020-04-15")) %>% 
  count(commentStatus)
## # A tibble: 3 x 2
##   commentStatus     n
##   <chr>         <int>
## 1 →              4560
## 2 推             6678
## 3 噓             2064
  • Slightly more of the comments are 推 (upvotes)



PTT discussion volume chart - before 4/10

end <- as.Date("2020-04-10")
RD = ptt %>% filter( artDate < end) %>% group_by(artDate) %>% count() 
plot(RD$artDate,RD$n,type='l',xlab = "",
  ylab = "文章數",col='orange')
abline(v=as.Date(c('2019-12-27')),col='gray',lty=3)


Before the NYT ad event, there were only about 1-4 posts about 阿滴 per month. The spike in December came from 阿滴 recruiting other YouTubers to film the video encouraging young people to vote. Below, we use TF-IDF to examine what drove the other spikes in discussion.

Pie chart

RD = ptt 
RD$artDate = format(RD$artDate,"%Y/%m")
RD = RD %>% group_by(artDate) %>% count()
ggplot(data=RD) +
    # first draw a bar plot
    geom_bar(aes(x=factor(1),
                 y=n,
                 fill=artDate),
             stat = "identity"
             ) +
    # then wrap it around the y axis to form a pie chart
    coord_polar("y", start=0)


Articles from April alone account for more than 75% of the total, showing that the NYT ad gave 阿滴 a great deal of exposure.

jieba

# Initialize a jieba tokenizer with a custom user dictionary and stop-word list
jieba_tokenizer <- worker( user="user_dict.txt", stop_word = "stop_words.txt")

g_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]   # keep only tokens longer than one character
    return(tokens)
  })
}
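A quick sanity check of the tokenizer on a single sentence (illustrative only; the exact segmentation depends on the contents of user_dict.txt and stop_words.txt):

g_tokenizer("阿滴在紐約時報刊登台灣的廣告")
# returns a list with one character vector of the tokens longer than one character,
# e.g. something like c("阿滴", "紐約時報", "刊登", "台灣", "廣告")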

TF-IDF (by time period)


We want to see what the discussion about 阿滴 focused on at different points in time, so we group the data by time period.
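The manual computation below implements the standard TF-IDF weighting, treating each of the four time groups as a single document ($N = 4$):

$$\mathrm{tf}_{d,w}=\frac{n_{d,w}}{\sum_{w'} n_{d,w'}},\qquad \mathrm{idf}_{w}=\log_{10}\frac{N}{\mathrm{df}_{w}},\qquad \mathrm{tfidf}_{d,w}=\mathrm{tf}_{d,w}\times\mathrm{idf}_{w}$$

where $n_{d,w}$ is the count of word $w$ in group $d$ and $\mathrm{df}_{w}$ is the number of groups in which $w$ appears.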

hate <- HatePolitics
re_hate <- HatePolitics_comment
ptt_8 <- Gossiping
re_ptt_8 <- Gossiping_comment

# Combine each article with its comments, matching by article title
body<-rbind(ptt_8,hate)
re<-rbind(re_ptt_8,re_hate)
data<-as.data.frame(matrix(NA,nrow(body),3))
colnames(data)<-c("title","body","respon")

for(i in 1:nrow(data)){
  tmp_title<-(body$artTitle)[i]
  id<-which(re$artTitle==tmp_title)
  try(tmp_data<-re[id,], silent = T)                                        # comments belonging to this article
  try(tmp_re_one<-paste2(tmp_data$commentContent, sep = " "), silent = T)   # paste this article's comments into one string (qdap::paste2)
  body$sentence[i]<-str_replace_all(body$sentence[i],"[[:punct:]]","")      # strip punctuation
  try(tmp_re_one<-str_replace_all(tmp_re_one,"[[:punct:]]",""), silent = T)
  data[i,]<-c(tmp_title,body$sentence[i],tmp_re_one)
}

data$body<-paste(data$body,data$respon)   # merge article body and comments into one text
data<-data[-c(1:4),]                      # drop the first four articles
time<-ymd(body$artDate)[-c(1:4)]

#Randomly sample 10 articles to spot-check the tokenization
jieba_tokenizer = worker(stop_word ="stop_words.txt")
set.seed(20200419)
(id<-sample(nrow(data),10))
##  [1] 349  18 362 260 159  85 218  22  43 325
new_user_word(jieba_tokenizer,c("阿滴","譚德塞","病例","美國人","can","help"
                                ,"who","落落長","taiwan","翻譯","台灣","文法","批評"))
## [1] TRUE
#Build a term-frequency table: one row per (word, article id)
tmp<-as.data.frame(table(segment(removeNumbers(tolower(data$body[1])), jieba_tokenizer)))
tmp$id=1
tf<-tmp
for(i in 2:nrow(data)){
  tmp<-as.data.frame(table(segment(removeNumbers(tolower(data$body[i])), jieba_tokenizer)))
  tmp$id=i
  tf<-rbind(tf,tmp)
}
library(tidytext)
dtm<-as.matrix(cast_sparse(tf,id, Var1, Freq)) # document-term matrix (articles x words)

#Group the articles into four time periods
time<-(month(time))
mid<-dtm[which(time%in%c(5,6,7,8,9,10)),] # May-Oct 2019
end<-dtm[which(time%in%c(11,12)),]        # Nov-Dec 2019
begin<-dtm[which(time%in%c(1,2,3)),]      # Jan-Mar 2020
m4<-dtm[which(time%in%c(4)),]             # Apr 2020
mid<-apply(mid, 2, sum)
end<-apply(end, 2, sum)
begin<-apply(begin, 2, sum)
m4<-apply(m4, 2, sum)
dtm2<-rbind(mid,end,begin,m4)
tf<-dtm2
tf[,]<-NA
for(i in 1:4){
  tf[i,]<-dtm2[i,]/sum(dtm2[i,])
}
idf<-c()
for(i in 1:ncol(tf)){
  idf[i]<-log10((4/sum(dtm2[,i]!=0)))
}
tfidf<-dtm2
tfidf[,]<-NA
for(i in 1:ncol(tfidf)){
  tfidf[,i]<-tf[,i]*idf[i]
}
n=10
p=data.frame(
  word=names(tfidf[1,order(tfidf[1,], decreasing =T)][1:n]),
  tfidf=tfidf[1,order(tfidf[1,], decreasing =T)][1:n]
)
p$time=1
for(i in 2:4){
  tmp=data.frame(
    word=names(tfidf[i,order(tfidf[i,], decreasing =T)][1:n]),
    tfidf=tfidf[i,order(tfidf[i,], decreasing =T)][1:n]
  )
  tmp$time=i
  p= rbind(p,tmp)
}
p$time=c(rep("2019 年中",n),rep("2019 年底",n),rep("2020 1~3月",n),rep("2020 4月",n))
ggplot(p,aes(word, tfidf, fill =time)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tfidf") +
  facet_wrap(~time, ncol = 2, scales = "free") +
  coord_flip()


From this chart we can see what PTT discussion about 阿滴 centred on in each period:
1. Mid-2019: the presidential plane tour video (https://www.youtube.com/watch?v=1YcBfJMGxqM)
2. Late 2019: recruiting YouTubers for the get-out-the-vote video (https://www.youtube.com/watch?v=jGppF61TXsw) and the alleged Labor Standards Act violation (https://star.ettoday.net/news/1600009)
3. Jan-Mar 2020: election-related discussion (https://www.youtube.com/watch?v=aZ5znkF_Umw)
4. Apr 2020: the New York Times ad event (https://www.storm.mg/lifestyle/2512106)

Overall sentiment analysis

Gossiping$artDate= Gossiping$artDate %>% as.Date("%Y/%m/%d")
dcard$artDate = dcard$artDate %>% as.Date("%Y/%m/%d")

ad_tokens <- Gossiping %>% 
  unnest_tokens(word, sentence, token=g_tokenizer) %>% 
  select(-artTime, -artUrl)

ad_tokens_count <- ad_tokens %>% 
  filter(!(word %in%  c("阿滴", "http","com","jpg","imgur"))) %>% 
  group_by(artTitle,artDate,word) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count))


ad_dcard_tokens <- dcard %>% 
  unnest_tokens(word, sentence, token=g_tokenizer) %>% 
  select(-artTime, -artUrl)

ad_dcard_tokens_count <- ad_dcard_tokens %>% 
  filter(!(word %in%  c("阿滴", "http","com","jpg","imgur"))) %>% 
  group_by(artTitle,artDate,word) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count))

ad_tokens_count
## # A tibble: 20,211 x 4
## # Groups:   artTitle, artDate [280]
##    artTitle                                      artDate    word  count
##    <chr>                                         <date>     <chr> <int>
##  1 Re:[新聞]阿滴曝光「台灣給世界的公開信」中英文 2020-04-11 台灣     82
##  2 [新聞]反擊譚德塞募資破千萬!阿滴刊《紐時》    2020-04-11 the      47
##  3 [新聞]阿滴曝光「台灣給世界的公開信」中英文    2020-04-11 the      47
##  4 [新聞]反擊譚德塞募資破千萬!阿滴刊《紐時》    2020-04-11 台灣     35
##  5 [新聞]阿滴開箱總統專機曝光空軍一號內裝        2019-07-13 總統     35
##  6 [新聞]阿滴曝光「台灣給世界的公開信」中英文    2020-04-11 台灣     34
##  7 Re:[問卦]認真?阿滴到底憑啥覺得他能代表台灣? 2020-04-11 台灣     34
##  8 [新聞]與阿滴英文打擂台?蔡英文今晚推Youtube   2019-08-18 英文     28
##  9 [新聞]反擊譚德塞募資破千萬!阿滴刊《紐時》    2020-04-11 and      24
## 10 [新聞]反擊譚德塞募資破千萬!阿滴刊《紐時》    2020-04-11 of       24
## # ... with 20,201 more rows

#Build the sentiment dictionary

P <- read_file("positive.txt")

N <- read_file("negative.txt")

#Split the strings on ","
#strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build a data frame with two columns, word and sentiment; word holds the dictionary terms
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)

Trend of sentiment word counts - PTT

ad_sentiment_count <- ad_tokens_count %>%
  select(artDate,word,count) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  mutate(count=sum(count))
## Adding missing grouping variables: `artTitle`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
g=ad_sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,col=sentiment))+
  scale_x_date(labels = date_format("%Y/%m/%d")) 
p=ggplotly(g)
 a1 <- list(x = as.numeric(ymd("2020-04-9")),y = 1,
          text = "紐約時報事件發生",ax = -80,ay = -100)
p%>%
  layout(annotations = a1)


Because the number of articles surges in April, the sentiment word counts surge along with it.

Sentiment words / total words - PTT

ad_sentiment_count <- ad_tokens_count %>%
  select(artDate,word,count) %>%
  group_by(artDate) %>%
  mutate(day_count = sum(count)) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  mutate(count=sum(count)) %>%
  mutate(day_mean = count / day_count)
## Adding missing grouping variables: `artTitle`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
g=ad_sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=day_mean,col=sentiment))+
  scale_x_date(labels = date_format("%Y/%m/%d")) 
p=ggplotly(g)
a1 <- list(x = as.numeric(ymd("2019-07-14")),y = 0.105,
          text = "參訪總統專機",ax = -40,ay = -50)
a2 <- list(x = as.numeric(ymd("2019-08-14")),y = 0.1333,
          text = "爆料違反勞基法",ax = -20,ay = -50)
a3 <- list(x = as.numeric(ymd("2019-12-26")),y = 0.1333,
          text = "鼓勵投票影片發布",ax = -20,ay = -0.05)
p%>%
  layout(annotations = a1)%>%
  layout(annotations = a2)%>%
  layout(annotations = a3)


To remove the effect of uneven sample sizes across days, we divide by each day's total word count and redraw the sentiment chart.

#Word Correlation

jieba_tokenizer = worker()

jieba_tokenizer1 = function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]   # drop single-character tokens
      return(tokens)
    }
  })
}

Count how many times each pair of words co-occurs in the same article

#Tokenize the articles with jieba_tokenizer1
ptt_tokens <- ptt %>%   
    unnest_tokens(word, sentence, token = jieba_tokenizer1) %>%
    filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%    # remove tokens containing digits or Latin letters
    filter(!word %in% stop_words$word) 

ptt_word_pair <- ptt_tokens %>%
  pairwise_count(word, artUrl, sort = TRUE)
ptt_word_pair
## # A tibble: 2,132,932 x 3
##    item1 item2     n
##    <chr> <chr> <dbl>
##  1 台灣  阿滴    190
##  2 阿滴  台灣    190
##  3 英文  阿滴    143
##  4 阿滴  英文    143
##  5 什麼  阿滴    133
##  6 阿滴  什麼    133
##  7 可以  阿滴    128
##  8 阿滴  可以    128
##  9 就是  阿滴    124
## 10 阿滴  就是    124
## # ... with 2,132,922 more rows

Compute the correlation between pairs of words

ptt_word_cors <- ptt_tokens %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, artUrl, sort = TRUE)
ptt_word_cors
## # A tibble: 73,170 x 3
##    item1    item2    correlation
##    <chr>    <chr>          <dbl>
##  1 新聞標題 完整           0.929
##  2 完整     新聞標題       0.929
##  3 完整     署名           0.913
##  4 署名     完整           0.913
##  5 連結     新聞標題       0.908
##  6 網址     新聞標題       0.908
##  7 新聞標題 連結           0.908
##  8 新聞標題 網址           0.908
##  9 備註     新聞標題       0.901
## 10 新聞標題 備註           0.901
## # ... with 73,160 more rows

Show word pairs with correlation greater than 0.5

set.seed(2019)
ptt_word_cors %>%
  filter(correlation > .5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so Chinese labels render correctly
  theme_void()

Hm_tokens <- HatePolitics_comment %>%
    unnest_tokens(word, commentContent, token = g_tokenizer) %>%
    filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%    # remove tokens containing digits or Latin letters
    filter(!word %in% stop_words$word) 

Hm_word_pair <- Hm_tokens %>%
  pairwise_count(word, artUrl, sort = TRUE)

Hm_word_cors <- Hm_tokens %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

set.seed(2019)
Hm_word_cors %>%
  filter(correlation > .4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + 
  theme_void()

Focusing on the April NYT ad event

Dcard

ad <- fread("阿滴八卦版ptt.csv", encoding = "UTF-8", header = TRUE)
ad1 <- fread("阿滴政黑板.csv", encoding = "UTF-8", header = TRUE)
dcard1 <- read_csv("dcard_time.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   sentence = col_character()
## )
dcard1$category = 'Time' 
dcard2 <- read_csv("dcard_wuhan.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   sentence = col_character()
## )
dcard2$category = 'Wuhan'
dcard<-rbind(dcard1, dcard2)
dcard$artDate= dcard$artDate %>% as.Date("%Y/%m/%d")
dcard$sentence <- gsub(c("https://[a-zA-Z0-9./_]+"), "", dcard$sentence) # remove URLs from article text
tokens <- dcard %>% unnest_tokens(word, artTitle, token=g_tokenizer)
articles <- dcard %>% unnest_tokens(word, sentence, token=g_tokenizer)

##Word frequencies in article bodies

articles_count <- articles %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

head(articles_count, 15)
## # A tibble: 15 x 2
##    word       sum
##    <chr>    <int>
##  1 台灣       216
##  2 我們       122
##  3 廣告       105
##  4 紐約時報    58
##  5 美國        57
##  6 who         56
##  7 阿滴        53
##  8 世界        49
##  9 taiwan      44
## 10 看到        43
## 11 the         40
## 12 一個        39
## 13 可以        39
## 14 集資        38
## 15 募資        36

Plot as a bar chart

articles_count[1:15,] %>%
  arrange(sum) %>% 
  {barplot(.$sum, names=.$word,horiz=T,las=1,col=rainbow(15))}

##Word cloud of article bodies

articles_count %>% wordcloud2()


We find that on Dcard, posts on both the Trending and Wuhan pneumonia boards concentrate on 阿滴's crowdfunding campaign to buy a New York Times ad.

data_count_by_art <- articles %>% 
  group_by(word,artTitle, artDate, artTime, artUrl) %>% 
  count(word, sort = TRUE) %>%
  ungroup()

data_count_by_art$artDate= data_count_by_art$artDate %>% as.Date("%Y/%m/%d")

word_count <- data_count_by_art %>%
  select(word,n) %>% 
  group_by(word) %>% 
  summarise(count = sum(n))  %>%
  filter(count>3) %>%  # drop words that appear too rarely
  arrange(desc(count))
word_count
## # A tibble: 516 x 2
##    word     count
##    <chr>    <int>
##  1 台灣       216
##  2 我們       122
##  3 廣告       105
##  4 紐約時報    58
##  5 美國        57
##  6 who         56
##  7 阿滴        53
##  8 世界        49
##  9 taiwan      44
## 10 看到        43
## # ... with 506 more rows

##Positive vs. negative word usage

  word_count %>% inner_join(LIWC) %>% 
  arrange(desc(abs(count))) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = "Heiti TC Light"))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database


From this we can see that Dcard posts lean positive toward 阿滴's purchase of the NYT ad.

Discussion volume chart (Dcard)

RD = dcard %>% group_by(artDate) %>% count() 
plot(RD$artDate,RD$n,type='l',xlab = "",
  ylab = "文章數",col='orange',xlim=as.Date(c('2020-04-01','2020-04-21')),ylim=c(0,30))
axis.Date(side=1,at=RD$artDate,format='%Y-%m-%d',labels=T,las=1) # x-axis date labels in "YYYY-MM-DD" format
axis(side=2,las=2) # y-axis labels
abline(v=as.Date('2020-04-11'),col='gray',lty=3)

#PTT

gossip <- read_csv("阿滴八卦版ptt.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )
gossip$category = 'gossip'
hate <- read_csv("阿滴政黑板.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )
hate$category = 'hate'
ptt<-rbind(gossip, hate)
ptt$artDate= ptt$artDate %>% as.Date("%Y/%m/%d")
ptt$sentence <- gsub(c("https://[a-zA-Z0-9./_]+"), "", ptt$sentence) # remove URLs from article text

Extract the articles posted after 4/10 for discussion and comparison

start <- as.Date("2020-04-09")
NewYork = ptt %>% filter( artDate> start)

#Tokenization

NewYork_tokens <- NewYork %>% unnest_tokens(word, artTitle, token=g_tokenizer)
NewYork_articles <- NewYork %>% unnest_tokens(word, sentence, token=g_tokenizer)

#Word frequencies in article bodies

data_count_by_art <- NewYork_articles %>% 
  group_by(word,artTitle, artDate, artTime, artUrl) %>% 
  count(word, sort = TRUE) %>%
  ungroup()

data_count_by_art$artDate= data_count_by_art$artDate %>% as.Date("%Y/%m/%d")

word_count <- data_count_by_art %>%
  select(word,n) %>% 
  group_by(word) %>% 
  summarise(count = sum(n))  %>%
  filter(count>3) %>%  # drop words that appear too rarely
  arrange(desc(count))

#Positive vs. negative word usage, top 30

  word_count %>% inner_join(LIWC) %>% 
  arrange(desc(abs(count))) %>%
  mutate(word = reorder(word,count)) %>%
  top_n(30,wt = count) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = "Heiti TC Light"))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

Comparing word usage between the PTT HatePolitics and Gossiping boards

frequency <- NewYork_articles %>%  
  #mutate(word = str_extract(word, "[a-z']+")) %>% # works like grep
  count(category, word) %>%
  filter(nchar(.$word)>1) %>%
  group_by(category) %>%
  mutate(proportion = n / sum(n)) %>%  # compute each word's proportion within its board
  select(-n) %>% 
  spread(category, proportion) %>%  
  gather(category, proportion, `gossip`) 

ggplot(frequency, aes(x = proportion, y = `hate`, color = abs(`hate` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~category, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "政黑版", x = "八卦版")
## Warning: Removed 6465 rows containing missing values (geom_point).
## Warning: Removed 6465 rows containing missing values (geom_text).


We can see that the HatePolitics board frames the NYT ad as an act of political manipulation by the DPP.

Preparing the combined PTT and Dcard data

NewYork_articles2 = NewYork_articles
NewYork_articles2$category = 'ptt'
NewYork_articles2 = NewYork_articles2 %>%select(artTitle,artDate,artTime,artUrl,category,word)
Dcard_articles <- dcard %>% unnest_tokens(word, sentence, token=g_tokenizer)
Dcard_articles$category = 'Dcard'
All = rbind(NewYork_articles2,Dcard_articles)
All
## # A tibble: 39,554 x 6
##    artTitle          artDate    artTime  artUrl              category word 
##    <chr>             <date>     <time>   <chr>               <chr>    <chr>
##  1 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      1.   
##  2 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      媒體 
##  3 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      來源 
##  4 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      etto~
##  5 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      2.   
##  6 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      記者 
##  7 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      署名 
##  8 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      張筱涵~
##  9 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      3.   
## 10 [新聞]阿滴拆台譚德塞「1小時募~ 2020-04-10 07:45:01 https://www.ptt.cc~ ptt      完整 
## # ... with 39,544 more rows

Comparing positive and negative word counts: PTT vs. Dcard

f <- as.Date("2020-01-07") # a single date excluded from the comparison below
sentiment_count = All %>% filter(artDate != f) %>% inner_join(LIWC) %>%
  group_by(category,artDate,sentiment) %>%
  summarise(count=n()) %>% 
  spread(.,key = sentiment,value = count) %>% 
  mutate(negative = -negative)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
ggplot(sentiment_count,aes(x=artDate,y=count))+
  geom_segment( aes(x=artDate, xend=artDate, y=`positive`, yend=`negative`),size=1,color="grey50") +
  geom_point( aes(x=artDate, y=`positive`), color="#DE7E73", size=2 ) + # red = positive sentiment
  geom_point( aes(x=artDate, y=`negative`), color="#84B1ED", size=2 ) + # blue = negative sentiment
  coord_flip()+
  facet_wrap(~ category, scales = "free") 

Sentiment over time - line chart comparison

All %>% filter(artDate != f) %>% inner_join(LIWC) %>% 
  group_by(category,artDate,sentiment) %>%
  summarise(count=n()) %>%
  #spread(sentiment, count, fill = 0) %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment), size = 0.8) +
  facet_wrap(~ category, scales = "free") 
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector


Judging from these two charts, PTT discussion is roughly split between positive and negative, while Dcard users seem friendlier toward 阿滴's ad purchase; in particular, after 阿滴 posted the apology video on 4/12, sentiment there leans positive.

Comparing word usage: PTT vs. Dcard

frequency <- All %>%  
  #mutate(word = str_extract(word, "[a-z']+")) %>% # works like grep
  count(category, word) %>%
  filter(nchar(.$word)>1) %>%
  group_by(category) %>%
  mutate(proportion = n / sum(n)) %>%  # compute each word's proportion within its platform
  select(-n) %>% 
  spread(category, proportion) %>% 
  gather(category, proportion, `Dcard`) 

ggplot(frequency, aes(x = proportion, y = `ptt`, color = abs(`ptt` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~category, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "ptt", x = "Dcard")
## Warning: Removed 6997 rows containing missing values (geom_point).
## Warning: Removed 6997 rows containing missing values (geom_text).


This comparison echoes the sentiment results above: the words skewed toward PTT are somewhat more negative, such as 「風向」 (spin) and 「道歉」 (apology), while Dcard's words are neutral to positive, such as 「taiwancanhelp」, 「防疫」 (epidemic prevention), and 「參與」 (participation).
Both platforms mention 台灣, 反擊 (fight back), and 廣告 (ad), with 台灣 appearing at a very high rate.

Conclusion:

阿滴's online discussion volume comes mainly from controversial events, most notably the New York Times ad event of April 2020. The TF-IDF results likewise show that each period's key words come from that period's controversies. The sentiment analysis shows that positive and negative opinion is roughly balanced overall, with larger swings only when a major event occurs. Comparing PTT and Dcard, Dcard is clearly more positive toward 阿滴 than PTT, and the two platforms' word choices differ noticeably.