基本繪圖分析

這邊處理的資料集,是Dcard股票看板中,以特斯拉為關鍵字進行討論的結果

# 載入packages
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
require(ggplot2)
## Loading required package: ggplot2
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
# Load the per-article word-frequency table.
# The CSV paths in this script are relative, so the working directory has to
# be the folder that holds the data. Switch to the original author's folder
# only when it exists; otherwise keep the current working directory so the
# script can run on other machines instead of stopping at setwd().
data_dir <- "/Users/a1234/Desktop/SCHOOL/碩班第二學期/社群媒體分析/0316"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
file <- read.csv("dcard_stock_artWordFreq.csv")
# Parse artDate (year/month/day text) into Date so it can be grouped,
# filtered and drawn on a date axis.
file$artDate <- as.Date(file$artDate, format = "%Y/%m/%d")
# Original author's note: the dates run from 2021-01-01 to 2021-03-02.
head(file)
##                                artTitle    artDate  artTime
## 1 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 2 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 3 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 4 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 5 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 6 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
##                                     artUrl word count
## 1 https://www.dcard.tw/f/stock/p/235099491 突破     9
## 2 https://www.dcard.tw/f/stock/p/235099491 類股     9
## 3 https://www.dcard.tw/f/stock/p/235099491 指數     7
## 4 https://www.dcard.tw/f/stock/p/235099491 趨勢     7
## 5 https://www.dcard.tw/f/stock/p/235099491 本週     6
## 6 https://www.dcard.tw/f/stock/p/235099491 創高     5
# Show the last six rows of the word-frequency table.
tail(file, n = 6L)
##                                artTitle    artDate  artTime
## 4066 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4067 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4068 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4069 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4070 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4071 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
##                                        artUrl   word count
## 4066 https://www.dcard.tw/f/stock/p/235461196 到時候     1
## 4067 https://www.dcard.tw/f/stock/p/235461196   車輛     1
## 4068 https://www.dcard.tw/f/stock/p/235461196 想像中     1
## 4069 https://www.dcard.tw/f/stock/p/235461196   信仰     1
## 4070 https://www.dcard.tw/f/stock/p/235461196   值會     1
## 4071 https://www.dcard.tw/f/stock/p/235461196   暴跌     1
# Every article occupies many rows (one per word). Keep only the date and
# URL columns and collapse to one row per distinct date/URL pair.
data <- distinct(select(file, artDate, artUrl))

# Number of those rows (articles) on each date.
article_count_by_date <- summarise(group_by(data, artDate), count = n())

head(article_count_by_date, n = 20)
## # A tibble: 20 x 2
##    artDate    count
##    <date>     <int>
##  1 2021-01-02     1
##  2 2021-01-04     1
##  3 2021-01-05     1
##  4 2021-01-07     1
##  5 2021-01-08     1
##  6 2021-01-10     1
##  7 2021-01-12     3
##  8 2021-01-14     2
##  9 2021-01-16     2
## 10 2021-01-18     1
## 11 2021-01-21     1
## 12 2021-01-25     1
## 13 2021-01-26     3
## 14 2021-01-27     3
## 15 2021-02-03     1
## 16 2021-02-05     1
## 17 2021-02-07     1
## 18 2021-02-09     2
## 19 2021-02-14     1
## 20 2021-02-22     1
# Line chart of the number of articles per day.
plot_date <- ggplot(article_count_by_date, aes(x = artDate, y = count)) +
  geom_line(color = "#00AFBB", size = 1) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  labs(x = "日期", y = "數量") +
  # Set a font family with Chinese glyphs so the Chinese labels render.
  theme(text = element_text(family = "Heiti TC Light"))

plot_date

# Word cloud
# Total frequency of every word across all articles, most frequent first.
# `.groups` is deliberately not passed to summarise(): dplyr releases that
# predate the argument treat it as a new column filled with "drop". With a
# single grouping variable summarise() already returns an ungrouped table;
# ungroup() only makes that explicit.
sum_data <- file %>%
  group_by(word) %>%
  summarise(sum = sum(count)) %>%
  ungroup() %>%
  arrange(desc(sum))
head(sum_data)
## # A tibble: 6 x 3
##   word     sum .groups
##   <fct>  <int> <chr>  
## 1 特斯拉    74 drop   
## 2 台積電    39 drop   
## 3 投資      38 drop   
## 4 股價      37 drop   
## 5 股票      30 drop   
## 6 公司      29 drop
# Build the word cloud from the overall word totals.
# (An optional cut-off, filter(sum >= 50), is left disabled.)
wcloud <- wordcloud2(sum_data)
wcloud
# Word rows of the articles dated 2021-01-08 and 2021-03-02.
data_0108 <- filter(file, artDate == as.Date("2021-01-08"))
data_0302 <- filter(file, artDate == as.Date("2021-03-02"))
head(data_0302)
##                             artTitle    artDate  artTime
## 1 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 2 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 3 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 5 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 6 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
##                                     artUrl   word count
## 1 https://www.dcard.tw/f/stock/p/235461196   歐洲    15
## 2 https://www.dcard.tw/f/stock/p/235461196 電動車    12
## 3 https://www.dcard.tw/f/stock/p/235461196 特斯拉    11
## 4 https://www.dcard.tw/f/stock/p/235461196   市場     8
## 5 https://www.dcard.tw/f/stock/p/235461196   集團     5
## 6 https://www.dcard.tw/f/stock/p/235461196 交車量     5
# Per-day word totals, most frequent first.
# `.groups` is deliberately not passed to summarise(): dplyr releases that
# predate the argument treat it as a new column filled with "drop". With a
# single grouping variable the result is already ungrouped; ungroup() only
# makes that explicit.
data_0108 <- data_0108 %>%
  group_by(word) %>%
  summarise(sum = sum(count)) %>%
  ungroup() %>%
  arrange(desc(sum))

data_0302 <- data_0302 %>%
  group_by(word) %>%
  summarise(sum = sum(count)) %>%
  ungroup() %>%
  arrange(desc(sum))
# One word cloud for each of the two selected days.
plot_0108 <- wordcloud2(data_0108)
plot_0108
plot_0302 <- wordcloud2(data_0302)
plot_0302
# Sentiment line chart
# Load the emotion-score table and parse artDate (year/month/day text)
# into Date.
csv_sen <- read.csv("dcard_stock_artSen_emotion.csv", encoding = "UTF-8")
csv_sen$artDate <- as.Date(csv_sen$artDate, format = "%Y/%m/%d")
head(csv_sen)
##                                     artTitle    artDate  artTime
## 1              #分享 成交量-看書學習進場竟慘 2021-01-01 11:17:30
## 2                                       榮運 2021-01-01 15:30:05
## 3               #其他 2020股市最慘的賠多少? 2021-01-01 15:49:04
## 4               大家有慣用的資訊獲取軟體嗎? 2021-01-01 16:50:49
## 5                               分點疑問請益 2021-01-02 00:38:36
## 6 #分享 除了漲翻天個股 更要留心那些弱勢股... 2021-01-02 10:19:29
##                                     artUrl positive_emotion_grade
## 1 https://www.dcard.tw/f/stock/p/235090575                     10
## 2 https://www.dcard.tw/f/stock/p/235092009                      1
## 3 https://www.dcard.tw/f/stock/p/235092121                      1
## 4 https://www.dcard.tw/f/stock/p/235092427                      0
## 5 https://www.dcard.tw/f/stock/p/235094975                      1
## 6 https://www.dcard.tw/f/stock/p/235096130                      8
##   negative_emotion_grade neutral_emotion_grade
## 1                      7                     1
## 2                      1                     0
## 3                      1                     0
## 4                      2                     1
## 5                      0                     0
## 6                      3                     1
# Sum the positive, negative and neutral emotion grades for each date.
data_sen <- summarise(
  group_by(csv_sen, artDate),
  positive = sum(positive_emotion_grade),
  negative = sum(negative_emotion_grade),
  neutral = sum(neutral_emotion_grade)
)
head(data_sen)
## # A tibble: 6 x 4
##   artDate    positive negative neutral
##   <date>        <int>    <int>   <int>
## 1 2021-01-01       12       11       2
## 2 2021-01-02       32        5       2
## 3 2021-01-03       11        5       3
## 4 2021-01-04       47       35      10
## 5 2021-01-05       36       19       8
## 6 2021-01-06       23        8       3
# Daily sentiment totals, one line per emotion.
# Colour is mapped to the emotion's own name, and the manual scale sets the
# line colour and legend order by that name. Mapping colour to "red", "blue"
# and "yellow" inside aes() turns those strings into category values that the
# discrete scale sorts alphabetically (blue, red, yellow), so positional
# labels c("positive", "negative", "neutral") label the negative line
# "positive" and the positive line "negative".
data_sen %>%
  ggplot(aes(x = artDate)) +
  geom_line(aes(y = positive, colour = "positive")) +
  geom_line(aes(y = negative, colour = "negative")) +
  geom_line(aes(y = neutral, colour = "neutral")) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  scale_colour_manual(
    name = "情緒種類",
    values = c(positive = "red", negative = "blue", neutral = "yellow"),
    breaks = c("positive", "negative", "neutral")
  ) +
  ggtitle("Dcard 股票看板 關鍵詞「特斯拉」的討論情緒") +
  xlab("日期") +
  ylab("分數") +
  # Set a font family with Chinese glyphs so the Chinese labels render.
  theme(text = element_text(family = "Heiti TC Light"))