這邊處理的資料集,是Dcard股票看板中,以特斯拉為關鍵字進行討論的結果
# 載入packages
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
require(ggplot2)
## Loading required package: ggplot2
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
# Read the word-frequency table.
# The data directory below is specific to the original author's machine, so
# only switch to it when it exists. Elsewhere the CSV files are resolved against
# the current working directory instead of aborting with a setwd() error.
data_dir <- "/Users/a1234/Desktop/SCHOOL/碩班第二學期/社群媒體分析/0316"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
# Mark the text columns as UTF-8, as is done for the sentiment CSV that is
# read later in this script.
file <- read.csv("dcard_stock_artWordFreq.csv", encoding = "UTF-8")
# Parse artDate with an explicit year/month/day format.
file$artDate <- as.Date(file$artDate, format = "%Y/%m/%d")
# Author's note: the dates run from 2021-01-01 to 2021-03-02.
head(file)
## artTitle artDate artTime
## 1 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 2 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 3 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 4 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 5 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## 6 #標的 202012W5 臺股週報暨趨勢(年結) 2021-01-02 22:57:13
## artUrl word count
## 1 https://www.dcard.tw/f/stock/p/235099491 突破 9
## 2 https://www.dcard.tw/f/stock/p/235099491 類股 9
## 3 https://www.dcard.tw/f/stock/p/235099491 指數 7
## 4 https://www.dcard.tw/f/stock/p/235099491 趨勢 7
## 5 https://www.dcard.tw/f/stock/p/235099491 本週 6
## 6 https://www.dcard.tw/f/stock/p/235099491 創高 5
# Show the last rows of the word-frequency table as well.
file %>% tail()
## artTitle artDate artTime
## 4066 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4067 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4068 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4069 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4070 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4071 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## artUrl word count
## 4066 https://www.dcard.tw/f/stock/p/235461196 到時候 1
## 4067 https://www.dcard.tw/f/stock/p/235461196 車輛 1
## 4068 https://www.dcard.tw/f/stock/p/235461196 想像中 1
## 4069 https://www.dcard.tw/f/stock/p/235461196 信仰 1
## 4070 https://www.dcard.tw/f/stock/p/235461196 值會 1
## 4071 https://www.dcard.tw/f/stock/p/235461196 暴跌 1
# Only the article URL and its date are needed here. Each article occupies one
# row per word in the source table, so collapse it to one row per article.
data <- distinct(select(file, artDate, artUrl))

# Number of distinct articles for each date.
article_count_by_date <- summarise(group_by(data, artDate), count = n())

head(article_count_by_date, n = 20)
## # A tibble: 20 x 2
## artDate count
## <date> <int>
## 1 2021-01-02 1
## 2 2021-01-04 1
## 3 2021-01-05 1
## 4 2021-01-07 1
## 5 2021-01-08 1
## 6 2021-01-10 1
## 7 2021-01-12 3
## 8 2021-01-14 2
## 9 2021-01-16 2
## 10 2021-01-18 1
## 11 2021-01-21 1
## 12 2021-01-25 1
## 13 2021-01-26 3
## 14 2021-01-27 3
## 15 2021-02-03 1
## 16 2021-02-05 1
## 17 2021-02-07 1
## 18 2021-02-09 2
## 19 2021-02-14 1
## 20 2021-02-22 1
# Line chart of the number of articles discussed per day.
plot_date <- ggplot(article_count_by_date, aes(x = artDate, y = count)) +
  geom_line(color = "#00AFBB", size = 1) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  labs(x = "日期", y = "數量") +
  # Set a font with Chinese glyphs so the axis labels render correctly.
  theme(text = element_text(family = "Heiti TC Light"))
plot_date
# Word cloud
# Group every row of the data set by word and compute each word's total
# frequency, most frequent first.
# `.groups` is deliberately not passed to summarise(): on dplyr releases that
# predate that argument it is taken as a new column (a literal `.groups`
# column filled with "drop"). An explicit ungroup() gives the same ungrouped
# result on every dplyr version.
sum_data <- file %>%
  group_by(word) %>%
  summarise(sum = sum(count)) %>%
  ungroup() %>%
  arrange(desc(sum))
head(sum_data)
## # A tibble: 6 x 2
## word sum
## <fct> <int>
## 1 特斯拉 74
## 2 台積電 39
## 3 投資 38
## 4 股價 37
## 5 股票 30
## 6 公司 29
# Draw the word cloud from the overall word totals.
# (A disabled step in the original kept only words with sum >= 50.)
wcloud <- wordcloud2(sum_data)
wcloud
# Word-frequency rows for two individual days.
data_0108 <- filter(file, artDate == as.Date("2021-01-08"))
data_0302 <- filter(file, artDate == as.Date("2021-03-02"))
head(data_0302)
## artTitle artDate artTime
## 1 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 2 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 3 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 4 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 5 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## 6 #新聞 特斯拉歐洲一月市占率跌出前三 2021-03-02 16:47:29
## artUrl word count
## 1 https://www.dcard.tw/f/stock/p/235461196 歐洲 15
## 2 https://www.dcard.tw/f/stock/p/235461196 電動車 12
## 3 https://www.dcard.tw/f/stock/p/235461196 特斯拉 11
## 4 https://www.dcard.tw/f/stock/p/235461196 市場 8
## 5 https://www.dcard.tw/f/stock/p/235461196 集團 5
## 6 https://www.dcard.tw/f/stock/p/235461196 交車量 5
# Collapse word-frequency rows into one total per word, most frequent first.
# `.groups` is deliberately not passed to summarise(): on dplyr releases that
# predate that argument it is taken as a new column named `.groups`. An
# explicit ungroup() gives the same ungrouped result on every dplyr version.
total_by_word <- function(word_rows) {
  word_rows %>%
    group_by(word) %>%
    summarise(sum = sum(count)) %>%
    ungroup() %>%
    arrange(desc(sum))
}

# Per-word totals for each of the two selected days.
data_0108 <- total_by_word(data_0108)
data_0302 <- total_by_word(data_0302)

# One word cloud per day.
plot_0108 <- wordcloud2(data_0108)
plot_0108
plot_0302 <- wordcloud2(data_0302)
plot_0302
# Sentiment line chart
# Load the per-article sentiment scores and parse the date column.
csv_sen <- read.csv("dcard_stock_artSen_emotion.csv", encoding = "UTF-8")
csv_sen$artDate <- as.Date(csv_sen$artDate, format = "%Y/%m/%d")
head(csv_sen)
## artTitle artDate artTime
## 1 #分享 成交量-看書學習進場竟慘 2021-01-01 11:17:30
## 2 榮運 2021-01-01 15:30:05
## 3 #其他 2020股市最慘的賠多少? 2021-01-01 15:49:04
## 4 大家有慣用的資訊獲取軟體嗎? 2021-01-01 16:50:49
## 5 分點疑問請益 2021-01-02 00:38:36
## 6 #分享 除了漲翻天個股 更要留心那些弱勢股... 2021-01-02 10:19:29
## artUrl positive_emotion_grade
## 1 https://www.dcard.tw/f/stock/p/235090575 10
## 2 https://www.dcard.tw/f/stock/p/235092009 1
## 3 https://www.dcard.tw/f/stock/p/235092121 1
## 4 https://www.dcard.tw/f/stock/p/235092427 0
## 5 https://www.dcard.tw/f/stock/p/235094975 1
## 6 https://www.dcard.tw/f/stock/p/235096130 8
## negative_emotion_grade neutral_emotion_grade
## 1 7 1
## 2 1 0
## 3 1 0
## 4 2 1
## 5 0 0
## 6 3 1
# Group by date and total the positive, negative and neutral scores per day.
data_sen <- summarise(
  group_by(csv_sen, artDate),
  positive = sum(positive_emotion_grade),
  negative = sum(negative_emotion_grade),
  neutral = sum(neutral_emotion_grade)
)
head(data_sen)
## # A tibble: 6 x 4
## artDate positive negative neutral
## <date> <int> <int> <int>
## 1 2021-01-01 12 11 2
## 2 2021-01-02 32 5 2
## 3 2021-01-03 11 5 3
## 4 2021-01-04 47 35 10
## 5 2021-01-05 36 19 8
## 6 2021-01-06 23 8 3
# Daily sentiment totals as three lines, one per sentiment type.
# Inside aes(), `colour` is a label for the legend, not a colour value. Each
# line is therefore tagged with its series name and the actual colours are set
# in scale_colour_manual(). Tagging the lines "red"/"blue"/"yellow" and
# relabelling them by position does not work: the discrete scale orders those
# tags alphabetically (blue, red, yellow), which attaches the "positive" label
# to the negative line and the "negative" label to the positive line.
sentiment_colours <- c(positive = "red", negative = "blue", neutral = "yellow")

data_sen %>%
  ggplot(aes(x = artDate)) +
  geom_line(aes(y = positive, colour = "positive")) +
  geom_line(aes(y = negative, colour = "negative")) +
  geom_line(aes(y = neutral, colour = "neutral")) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  # `breaks` fixes the legend order; `values` is matched to the tags by name.
  scale_colour_manual(
    name = "情緒種類",
    values = sentiment_colours,
    breaks = names(sentiment_colours)
  ) +
  ggtitle("Dcard 股票看板 關鍵詞「特斯拉」的討論情緒") +
  xlab("日期") +
  ylab("分數") +
  # Set a font with Chinese glyphs so the title and labels render correctly.
  theme(text = element_text(family = "Heiti TC Light"))