基本介紹

  • 目的:使用coreNLP與sentimentr分析PTT上華航機師染疫的文字資料
  • 概述:4月23日中央流行疫情指揮中心公布華航3機師新冠肺炎確診,當日華航股價開高走低,最後以小跌1%收場。後續雖然新聞持續發酵,但股價仍受惠於近期航運價格上揚而走高,本案例是希望借助PTT的討論分析新聞的熱度,討論板對於新聞的情緒,最後希望利用tf-idf與情緒起伏來推估與股價。
  • 資料來源:PTT的八卦、航空、股票板,4/01~5/01,139筆,中文

1.載入工具包並且將PTT資料匯入

packages = c("pacman")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Sys.setlocale(category = "LC_ALL", locale = "Chinese (Traditional)_Taiwan.950") 
## [1] "LC_COLLATE=Chinese (Traditional)_Taiwan.950;LC_CTYPE=Chinese (Traditional)_Taiwan.950;LC_MONETARY=Chinese (Traditional)_Taiwan.950;LC_NUMERIC=C;LC_TIME=Chinese (Traditional)_Taiwan.950"
# 避免中文亂碼zh_TW.UTF-8

載入需要的工具包

pacman::p_load("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales","widyr","ggraph", "igraph","cnSentimentR","data.table","glmnet","broom","rsample","caTools","caret","rpart","rpart.plot","e1071","textstem","tm", 'readr','reshape2','wordcloud',"corrplot","Hmisc","fmsb","GGally","ggrepel","BiocManager")
## Installing package into 'C:/Users/Davis Liu/Documents/R/win-library/4.0'
## (as 'lib' is unspecified)
## Warning: package 'cnSentimentR' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.0:
##   無法開啟 URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.0/PACKAGES'
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'cnSentimentR'
## Warning in pacman::p_load("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", : Failed to install/load:
## cnSentimentR
  • 以關鍵字“華航”搜尋PTT八卦版、航空、股票版,載入CSV檔
  • 搜尋日期2021/04/01~2021/05/01,評估近期因為華航機師感染事件的新聞熱度
  • 結合華航近一個月的股票價格,比對文字情緒對於股價的影響,並預測隔日股價
stock=fread("C:\\Users\\Davis Liu\\Documents\\R\\SOCIAL MEDIA ANALYSIS\\midterm\\articleMetaData1.csv",header=TRUE,sep=",")
head(stock)
# 格式化日期欄位
stock$artDate= stock$artDate %>% as.Date("%Y/%m/%d")
stock

每日文章趨勢量

  • 文章的討論量趨勢,可以發現相關文章從4/23開始明顯增加
  • 4/23也是新聞公布機師確診
stock_n_bydate<-stock %>%
  transform(artCat = as.factor(artCat)) %>%
  group_by(artDate,artCat) %>%
  summarise(count=n(), total_comment=sum(commentNum)) 
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
stock_n_bydate
stock_n_bydate_plot<-stock_n_bydate %>%
  ggplot(aes(x=reorder(artDate, artDate),y=count, fill=artCat)) +
  geom_bar(position = 'identity', stat = "identity")+
  #geom_label_repel(min.segment.length = 0, box.padding = 0.5)+
  theme(axis.text.x = element_text(angle = 90))+
  ggtitle("每日文章量")
stock_n_bydate_plot

  • 以jieba進行斷詞
jieba_tokenizer <- worker(stop_word = "C:\\Users\\Davis Liu\\Documents\\R\\win-library\\4.0\\jiebaRD\\dict\\stop_words.utf8")
new_user_word(jieba_tokenizer, c("諾富特","柯文哲"))
## [1] TRUE
# 設定斷詞function
stock_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
#filter<-c("the","and","you")

tokens_stock <- stock %>% unnest_tokens(word, sentence, token=stock_tokenizer) #%>%
#filter_segment(filter) %>%
#str() 
tokens_stock
  • 清洗word中的文字,過濾特殊字元,算每天不同字的詞頻,依照各版分類計算字詞出現次數
tokens_stock_filter <- tokens_stock %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
  filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
  filter(nchar(.$word)>1) 

stock_word_count <- tokens_stock_filter %>%
  group_by(word,artDate,artCat) %>%
  summarise(word_count=n()) %>%  # 算字詞單篇總數用summarise
  filter(word_count>3) %>%  # 過濾出現太少次的字
  arrange(desc(word_count))
## `summarise()` has grouped output by 'word', 'artDate'. You can override using the `.groups` argument.
stock_word_count

作圖排序 top10 word

  • 作圖排序後前三名依序是華航、機師、確診,文章也多數來自於八卦版
stock_word_count_plot<-stock_word_count %>%
  head(20) %>%
  arrange(desc(word_count)) %>%
  ggplot(aes(x=reorder(word, word_count),y=word_count, fill=artCat)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()+
  ggtitle("top10 word")
stock_word_count_plot

不同版在這個議題的常用字

stock_word_count_plot_bystock<-stock_word_count %>%
  arrange(desc(word_count)) %>%
  head(30) %>%
  ggplot(aes(x=reorder(word, word_count),y=word_count)) +
  facet_wrap(~artCat, scales = "free") +
  geom_col() +
  xlab(NULL) +
  coord_flip()
  ggtitle("top10 word")
## $title
## [1] "top10 word"
## 
## attr(,"class")
## [1] "labels"
stock_word_count_plot_bystock

2.全部文章的文字雲

  • 文字雲中出現澳洲、印尼等詞彙,研判係本土第一例(案1090)之傳染者為印尼籍貨 機師且於澳洲確診
#install.packages("wordcloud2")
library(wordcloud2)
cloud_word_count <- tokens_stock_filter %>%
  group_by(word) %>%
  summarise(word_count=n()) %>%  
  filter(word_count>3) %>%
  arrange(desc(word_count))
wordcloud2(cloud_word_count)

3.分析資料中的共現相關與TF-idf

  • 全部文章以天製作top10文字DTM(document term matrix)
dtm = stock_word_count %>% 
      cast_dtm(artDate,word,word_count)

inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 43/57
## Sparsity           : 57%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##             Terms
## Docs         台北 印尼 抗體 活動 採檢 清真寺 華航 確診 機師 澳洲
##   2021-04-15    0    0    0   10    0      0   26    0    0    0
##   2021-04-21    0    0    0    0    0      0    0    0    0    0
##   2021-04-22    7    0    0    0    0      0   22    0    0    0
##   2021-04-23   36   32    0   46   38     50   29   58   65   36
##   2021-04-24    0    5    0    0   14      0   30   15   28    6
##   2021-04-25    0    0    7    0   18      0   29   11   19    0
##   2021-04-27    0    0    5    0    6      0   15   13   31    5
##   2021-04-28    0    0   30    0   10      0   27   20   47    0
##   2021-04-29    0    0    0    0    5      0   26   12   33    0
##   2021-04-30    9    0    0    0    0      0   35    5   30    0
word_cors <- stock_word_count%>%
  group_by(word) %>%
  filter(n() >= 3) %>%
  pairwise_cor(word, artDate, sort = TRUE)
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_cors

共現相關分析

  • 共現相關分析(加入中文字型設定,避免中文字顯示錯誤。)
  • 使用詞彙關係圖畫出相關性大於0.6的組合
set.seed(2021)

word_cors %>%
  filter(correlation > 0.7) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + #加入中文字型設定,避免中文字顯示錯誤。
  theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

  • 加上每天top10 word的tfidf
tokens_stock_addtfidf<- stock_word_count%>%
  bind_tf_idf(word,artCat,word_count)%>%
  arrange(desc(tf_idf))  
## Warning: A value for tf_idf is negative:
##  Input should have exactly one row per document-term combination.
tokens_stock_addtfidf

tfidf的top10 word排序

  • 發現八卦版中清真寺及柯文哲的權重最大
  • 調查本土案例感染源時,其中3名確診者曾到台北清真寺參加宗教活動,因 此引起熱烈討論
tokens_stock_addtfidf_plot <- tokens_stock_addtfidf %>% 
  head(15) %>%
  ggplot(aes(x=reorder(word, tf_idf),y=tf_idf, fill=artCat)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

tokens_stock_addtfidf_plot

  • 全部文章以天計算top10 tfidf DTM
dtm_tfidf<-tokens_stock_addtfidf %>%
  cast_dtm(artDate,word,tf_idf)

inspect(dtm_tfidf[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 10/90
## Sparsity           : 90%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##             Terms
## Docs              波音       空中        空間      柯文哲       飛機
##   2021-04-12 0.0101382 0.00000000 0.000000000 0.000000000 0.00000000
##   2021-04-13 0.0000000 0.01290316 0.000000000 0.000000000 0.00000000
##   2021-04-15 0.0000000 0.00000000 0.008294891 0.000000000 0.01198151
##   2021-04-16 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
##   2021-04-20 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
##   2021-04-21 0.0000000 0.00000000 0.000000000 0.007676409 0.00000000
##   2021-04-22 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
##   2021-04-23 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
##   2021-04-24 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
##   2021-04-30 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
##             Terms
## Docs                參加        商務     清真寺      經濟艙        轉換
##   2021-04-12 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
##   2021-04-13 0.007373237 0.000000000 0.00000000 0.000000000 0.000000000
##   2021-04-15 0.000000000 0.009216546 0.00000000 0.008294891 0.000000000
##   2021-04-16 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
##   2021-04-20 0.000000000 0.000000000 0.00000000 0.000000000 0.007934422
##   2021-04-21 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
##   2021-04-22 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
##   2021-04-23 0.000000000 0.000000000 0.01476233 0.000000000 0.000000000
##   2021-04-24 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
##   2021-04-30 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000

4.以LIWC情緒字典分析

  • R建立情緒字典,以情緒字典的情緒字top20
P <- read_file("C:\\Users\\Davis Liu\\Documents\\R\\SOCIAL MEDIA ANALYSIS\\liwc\\positive.txt") # 正向字典txt檔
N <- read_file("C:\\Users\\Davis Liu\\Documents\\R\\SOCIAL MEDIA ANALYSIS\\liwc\\negative.txt") # 負向字典txt檔
# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047

# 把兩個字典拼在一起
LIWC = rbind(P, N)

stock_sentiment_count <-stock_word_count %>% 
  #select(word,chapter) %>%
  inner_join(LIWC) %>% 
  group_by(sentiment,artDate) %>% 
  summarise(sentiment_number = n())
## Joining, by = "word"
## `summarise()` has grouped output by 'sentiment'. You can override using the `.groups` argument.
stock_sentiment_count$sentiment_total = stock_sentiment_count$sentiment_number * ifelse(stock_sentiment_count$sentiment == "positive", 1, -1)

stock_sentiment_count

情緒分析圖

  • 利用情緒字典分析4號到30號的情緒長方圖
stock_sentiment_count_plot<-stock_sentiment_count %>%
  ggplot(aes(x=reorder(artDate, artDate),y=sentiment_total, fill=sentiment)) +
  geom_bar(position = 'identity', stat = "identity")+
  theme(axis.text.x = element_text(angle = 90))
stock_sentiment_count_plot

情緒分析加總圖

  • 情緒字典加總後以4號到30號的情緒長方圖,情緒由23號開始出現轉折
stock_sentiment_count_total<-stock_sentiment_count %>%
  group_by(artDate) %>% 
  summarise(sentiment_sum=sum(sentiment_total))
stock_sentiment_count_total
stock_sentiment_count_total_plot<-stock_sentiment_count_total %>%
  ggplot(aes(x=reorder(artDate, artDate),y=sentiment_sum)) +
  geom_bar(position = 'identity', stat = "identity")+
  theme(axis.text.x = element_text(angle = 90))
stock_sentiment_count_total_plot

  • 觀察4號到30號情緒字典加總後的情緒長方圖,發現情緒由4/23開始出現轉折變成負 面,4/24至4/25又短暫回升至正面,此後皆為負面
  • 4/23為此次群聚案例本土第一例,因此情緒轉折明顯,而4/24至4/25之3確診案例皆 屬境外,民眾還未感覺已演變成本土群聚事件

5.分析華航股價與推測隔日收盤股價

  • 將全部以artDate日期合併
dtm_mod <- as.data.frame(as.matrix(dtm)) %>%
  cbind(artDate = rownames(.), .) %>%
  transform(artDate = as.Date(artDate))
rownames(dtm_mod) <- 1:nrow(dtm_mod)
dtm_mod
dtm_tfidf_mod = as.data.frame(as.matrix(dtm_tfidf)) %>%
  cbind(artDate = rownames(.), .) %>%
  transform(artDate = as.Date(artDate))
rownames(dtm_tfidf_mod) <- 1:nrow(dtm_tfidf_mod)

dtm_tfidf_mod

匯入華航4/1~5/1股價

  • 匯入華航股價資料CSV,並且合併資料
history=fread("C:\\Users\\Davis Liu\\Documents\\R\\SOCIAL MEDIA ANALYSIS\\midterm\\2610_history.csv",header=TRUE,sep=",")
head(history)
# 格式化日期欄位
history$targetDate= history$Date %>% as.Date("%Y/%m/%d") %>% -1
history$Date= history$Date %>% as.Date("%Y/%m/%d")
#合併股票資料歷史資料
combine<- history %>%
  left_join(.,dtm_mod, by=c("Date"="artDate")) %>%
  left_join(.,dtm_tfidf_mod, by=c("Date"="artDate")) %>%
  left_join(.,stock_sentiment_count_total, by=c("Date"="artDate")) %>%
  arrange(desc((Date)))
  • 合併股票資料、字頻TF-IDF、DTM資料與歷史資料
target<-combine %>%
  select(targetDate,Close,Change) %>%
  rename(target_price = Close,predict_change=Change)

 combine_addtarget<-combine %>%
   inner_join(target,by=c("Date"="targetDate")) %>%
   replace(is.na(.), 0)
combine_addtarget

與股價較相關的字詞

  • 與股價呈現負相關的字詞為“機師”,“採檢”
combine_addtarget_input<-combine_addtarget %>%
  subset(.,select=-c(Date,targetDate,predict_change,target_price)) 
combine_addtarget_input
lm_model_all <- glm(formula=combine_addtarget$target_price~., data = combine_addtarget_input) 

combine_addtarget$pred_outcome<-predict(lm_model_all,newdata = combine_addtarget_input)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
error_plot_all<- combine_addtarget %>%
  ggplot(aes(x=target_price,y=pred_outcome )) +
  geom_point(color = 'red') +
  geom_smooth(method="auto", se=TRUE, fullrange=FALSE, level=0.95) 

glm.summary = summary(lm_model_all)
coef = as.data.frame(glm.summary$coefficients)
coef$term = row.names(coef)
coef %>%
  group_by(Estimate > 0) %>% # group_by兩類:Estimate > 0 或 Estimate <= 0
  top_n(10, abs(Estimate)) %>% #abs:絕對值
  ungroup() %>%
  ggplot(aes(reorder(term, Estimate), Estimate, fill = Estimate > 0)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  labs(
    x = NULL,
    title = "Coefficients that increase/decrease odds ratio of 股價")

預測股價與實際股價關係

  • 利用“確診”為依據來預測股價,比對實際股價,發現p-value: 0.02117、R-squared: 0.3692
lm_model_confirm <- lm(formula=combine_addtarget$target_price~確診.x, data = combine_addtarget_input) 
summary(lm_model_confirm)
## 
## Call:
## lm(formula = combine_addtarget$target_price ~ 確診.x, data = combine_addtarget_input)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8995 -1.0217 -0.2245  1.3035  2.4005 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.19949    0.53425   34.07  2.6e-13 ***
## 確診.x       0.20482    0.07728    2.65   0.0212 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.791 on 12 degrees of freedom
## Multiple R-squared:  0.3692, Adjusted R-squared:  0.3166 
## F-statistic: 7.023 on 1 and 12 DF,  p-value: 0.02117
combine_addtarget$pred_outcome<-predict(lm_model_confirm,newdata = combine_addtarget_input)


error_plot_confirm<- combine_addtarget %>%
  ggplot(aes(x=target_price,y=pred_outcome )) +
  geom_point(color = 'red') +
  geom_smooth(method=lm)
error_plot_confirm
## `geom_smooth()` using formula 'y ~ x'

  • 利用情緒為依據來預測股價,比對實際股價,發現p-value: 0.02674、R-squared: 0.3467
lm_model_confirm <- lm(formula=combine_addtarget$target_price~sentiment_sum, data = combine_addtarget_input) 
summary(lm_model_confirm)
## 
## Call:
## lm(formula = combine_addtarget$target_price ~ sentiment_sum, 
##     data = combine_addtarget_input)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5286 -1.0901  0.3176  1.5705  2.1772 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    18.8286     0.4871  38.656 5.79e-14 ***
## sentiment_sum  -0.9019     0.3574  -2.523   0.0267 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.823 on 12 degrees of freedom
## Multiple R-squared:  0.3467, Adjusted R-squared:  0.2922 
## F-statistic: 6.368 on 1 and 12 DF,  p-value: 0.02674
combine_addtarget$pred_outcome<-predict(lm_model_confirm,newdata = combine_addtarget_input)


error_plot_confirm<- combine_addtarget %>%
  ggplot(aes(x=target_price,y=pred_outcome )) +
  geom_point(color = 'red') +
  geom_smooth(method=lm)
error_plot_confirm
## `geom_smooth()` using formula 'y ~ x'

+ 預測股價部分

combine_addtarget_input<-combine_addtarget %>%
  subset(.,select=-c(Date,targetDate,predict_change)) 

lm_model <- glm(formula=target_price~., data = combine_addtarget_input) 
#summary(lm_model)
  • 股票漲跌分類預測
combine_upordown_input<-combine_addtarget %>%
    mutate(up_down = ifelse(predict_change > 0, 1, 0)) %>%
  subset(.,select=-c(Date,targetDate,target_price,predict_change)) 
  
combine_upordown_input$up_down
##  [1] 0 1 1 0 1 1 0 1 1 1 1 0 1 1
lm_model_pn_binary <- glm(formula=up_down~., data = combine_upordown_input,family = "binomial") 
#summary(lm_model_pn_binary)

6.總結

文章趨勢

  1. 文章從4/23開始明顯增加,也是新聞公布機師確診時間
  2. 文字雲中出現澳洲、印尼等詞彙,研判係本土第一例(案1090)之傳染者為印尼籍貨機師且於澳洲確診
  3. tfidf的top10 word發現八卦版中清真寺及柯文哲的權重最大

情緒分析

  1. 情緒字典加總後以4號到30號的情緒長方圖,情緒由23號開始出現轉折,4/24至4/25又短暫回升至正面,此後皆為負面
  2. 對應股價23號股價波動向下,後續幾天逆勢上揚

股價與字詞相關性

  1. 與股價呈現負相關的字詞為“機師”,“採檢”,也就是說有機師有確診股價會掉
  2. 利用字詞“確診”與情緒分數為依據來預測股價得到結果p-value< 0.05