packages = c("pacman")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Sys.setlocale(category = "LC_ALL", locale = "Chinese (Traditional)_Taiwan.950")
## [1] "LC_COLLATE=Chinese (Traditional)_Taiwan.950;LC_CTYPE=Chinese (Traditional)_Taiwan.950;LC_MONETARY=Chinese (Traditional)_Taiwan.950;LC_NUMERIC=C;LC_TIME=Chinese (Traditional)_Taiwan.950"
# Set a Traditional Chinese locale to avoid garbled characters (use zh_TW.UTF-8 on non-Windows systems)
pacman::p_load("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales","widyr","ggraph", "igraph","cnSentimentR","data.table","glmnet","broom","rsample","caTools","caret","rpart","rpart.plot","e1071","textstem","tm", 'readr','reshape2','wordcloud',"corrplot","Hmisc","fmsb","GGally","ggrepel","BiocManager")
## Installing package into 'C:/Users/Davis Liu/Documents/R/win-library/4.0'
## (as 'lib' is unspecified)
## Warning: package 'cnSentimentR' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.0:
## 無法開啟 URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.0/PACKAGES'
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'cnSentimentR'
## Warning in pacman::p_load("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", : Failed to install/load:
## cnSentimentR
stock=fread("C:\\Users\\Davis Liu\\Documents\\R\\SOCIAL MEDIA ANALYSIS\\midterm\\articleMetaData1.csv",header=TRUE,sep=",")
head(stock)
# 格式化日期欄位
stock$artDate= stock$artDate %>% as.Date("%Y/%m/%d")
stock
stock_n_bydate<-stock %>%
transform(artCat = as.factor(artCat)) %>%
group_by(artDate,artCat) %>%
summarise(count=n(), total_comment=sum(commentNum))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
stock_n_bydate
stock_n_bydate_plot<-stock_n_bydate %>%
ggplot(aes(x=reorder(artDate, artDate),y=count, fill=artCat)) +
geom_bar(position = 'identity', stat = "identity")+
theme(axis.text.x = element_text(angle = 90))+
ggtitle("Daily article count")
stock_n_bydate_plot
jieba_tokenizer <- worker(stop_word = "C:\\Users\\Davis Liu\\Documents\\R\\win-library\\4.0\\jiebaRD\\dict\\stop_words.utf8")
new_user_word(jieba_tokenizer, c("諾富特","柯文哲"))
## [1] TRUE
# Define the segmentation (tokenizer) function
stock_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
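As a quick sanity check, the tokenizer can be called directly on a character vector (the sentence below is made up for illustration; the exact segmentation depends on the loaded dictionary):
sample_text <- c("華航機師確診人數增加")
stock_tokenizer(sample_text)
# Expected shape: a list of length 1 holding a character vector of tokens,
# e.g. roughly c("華航", "機師", "確診", "人數", "增加")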
#filter<-c("the","and","you")
tokens_stock <- stock %>% unnest_tokens(word, sentence, token=stock_tokenizer) #%>%
#filter_segment(filter) %>%
#str()
tokens_stock
tokens_stock_filter <- tokens_stock %>%
filter(!grepl('[[:punct:]]',word)) %>% # remove words containing punctuation
filter(!grepl("[0-9a-zA-Z]",word)) %>% # remove words containing English letters or digits
filter(nchar(word)>1) # drop single-character tokens
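A quick illustration of what each filter drops (toy inputs, not from the data):
grepl('[[:punct:]]', c("台灣,", "華航"))   # TRUE FALSE -> words with punctuation are removed
grepl("[0-9a-zA-Z]", c("covid19", "疫苗")) # TRUE FALSE -> words with ASCII letters/digits are removed
nchar(c("的", "疫苗")) > 1                 # FALSE TRUE -> single-character tokens are removed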
stock_word_count <- tokens_stock_filter %>%
group_by(word,artDate,artCat) %>%
summarise(word_count=n()) %>% # count each word's occurrences per day and board
filter(word_count>3) %>% # drop words that appear too rarely
arrange(desc(word_count))
## `summarise()` has grouped output by 'word', 'artDate'. You can override using the `.groups` argument.
stock_word_count
stock_word_count_plot<-stock_word_count %>%
arrange(desc(word_count)) %>%
head(20) %>%
ggplot(aes(x=reorder(word, word_count),y=word_count, fill=artCat)) +
geom_col() +
xlab(NULL) +
coord_flip()+
ggtitle("Top 20 words")
stock_word_count_plot
stock_word_count_plot_bystock<-stock_word_count %>%
arrange(desc(word_count)) %>%
head(30) %>%
ggplot(aes(x=reorder(word, word_count),y=word_count)) +
facet_wrap(~artCat, scales = "free") +
geom_col() +
xlab(NULL) +
coord_flip()+
ggtitle("Top 30 words by board")
stock_word_count_plot_bystock
#install.packages("wordcloud2")
library(wordcloud2)
# Aggregate word counts over the whole corpus for the word cloud
cloud_word_count <- tokens_stock_filter %>%
group_by(word) %>%
summarise(word_count=n()) %>%
filter(word_count>3) %>%
arrange(desc(word_count))
wordcloud2(cloud_word_count)
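wordcloud2 treats the first column as the word and the second as its frequency, so the summarised table can be passed in as-is. With a large vocabulary, capping the input keeps the HTML widget responsive; a minimal variant:
# Optional: render only the 200 most frequent words
wordcloud2(head(cloud_word_count, 200))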
# Cast the daily word counts into a DocumentTermMatrix (documents = dates)
dtm = stock_word_count %>%
cast_dtm(artDate,word,word_count)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 43/57
## Sparsity : 57%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 台北 印尼 抗體 活動 採檢 清真寺 華航 確診 機師 澳洲
## 2021-04-15 0 0 0 10 0 0 26 0 0 0
## 2021-04-21 0 0 0 0 0 0 0 0 0 0
## 2021-04-22 7 0 0 0 0 0 22 0 0 0
## 2021-04-23 36 32 0 46 38 50 29 58 65 36
## 2021-04-24 0 5 0 0 14 0 30 15 28 6
## 2021-04-25 0 0 7 0 18 0 29 11 19 0
## 2021-04-27 0 0 5 0 6 0 15 13 31 5
## 2021-04-28 0 0 30 0 10 0 27 20 47 0
## 2021-04-29 0 0 0 0 5 0 26 12 33 0
## 2021-04-30 9 0 0 0 0 0 35 5 30 0
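For spot checks, the matrix can be converted back to long form with tidy() (the DocumentTermMatrix method comes from tidytext, already loaded):
# Round-trip check: one row per non-zero (document, term) cell
tidy(dtm) %>% head()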
# Pairwise correlations between words, based on how their daily counts co-vary across dates
word_cors <- stock_word_count%>%
group_by(word) %>%
filter(n() >= 3) %>%
pairwise_cor(word, artDate, sort = TRUE)
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_cors
set.seed(2021)
word_cors %>%
filter(correlation > 0.7) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # use a CJK font so Chinese labels render; "Heiti TC Light" is a macOS font, hence the Windows warning below
theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
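The correlation table can also be queried for a single seed word, for example (assuming the word survives the frequency filter above):
# Terms whose daily counts move most closely with "華航"
word_cors %>%
filter(item1 == "華航") %>%
head(10)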
# Note: stock_word_count has one row per (word, artDate, artCat), so a word can repeat
# within an artCat document; bind_tf_idf expects one row per document-term pair,
# which is what the warning below flags.
tokens_stock_addtfidf<- stock_word_count %>%
bind_tf_idf(word,artCat,word_count) %>%
arrange(desc(tf_idf))
## Warning: A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
tokens_stock_addtfidf
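For reference, bind_tf_idf follows the standard definitions: tf is a term's share of its document's token count, and idf is log(total documents / documents containing the term). A toy check of that convention (made-up counts):
# "x" appears in both documents, so idf = log(2/2) = 0 and its tf_idf is 0;
# "y" appears in one of two documents, so idf = log(2/1)
toy <- tibble::tibble(doc = c("A", "A", "B"), word = c("x", "y", "x"), n = c(2, 1, 3))
bind_tf_idf(toy, word, doc, n)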
tokens_stock_addtfidf_plot <- tokens_stock_addtfidf %>%
head(15) %>%
ggplot(aes(x=reorder(word, tf_idf),y=tf_idf, fill=artCat)) +
geom_col() +
xlab(NULL) +
coord_flip()
tokens_stock_addtfidf_plot
dtm_tfidf<-tokens_stock_addtfidf %>%
cast_dtm(artDate,word,tf_idf)
inspect(dtm_tfidf[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 10/90
## Sparsity : 90%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 波音 空中 空間 柯文哲 飛機
## 2021-04-12 0.0101382 0.00000000 0.000000000 0.000000000 0.00000000
## 2021-04-13 0.0000000 0.01290316 0.000000000 0.000000000 0.00000000
## 2021-04-15 0.0000000 0.00000000 0.008294891 0.000000000 0.01198151
## 2021-04-16 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
## 2021-04-20 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
## 2021-04-21 0.0000000 0.00000000 0.000000000 0.007676409 0.00000000
## 2021-04-22 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
## 2021-04-23 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
## 2021-04-24 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
## 2021-04-30 0.0000000 0.00000000 0.000000000 0.000000000 0.00000000
## Terms
## Docs 參加 商務 清真寺 經濟艙 轉換
## 2021-04-12 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 2021-04-13 0.007373237 0.000000000 0.00000000 0.000000000 0.000000000
## 2021-04-15 0.000000000 0.009216546 0.00000000 0.008294891 0.000000000
## 2021-04-16 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 2021-04-20 0.000000000 0.000000000 0.00000000 0.000000000 0.007934422
## 2021-04-21 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 2021-04-22 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 2021-04-23 0.000000000 0.000000000 0.01476233 0.000000000 0.000000000
## 2021-04-24 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 2021-04-30 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
P <- read_file("C:\\Users\\Davis Liu\\Documents\\R\\SOCIAL MEDIA ANALYSIS\\liwc\\positive.txt") # 正向字典txt檔
N <- read_file("C:\\Users\\Davis Liu\\Documents\\R\\SOCIAL MEDIA ANALYSIS\\liwc\\negative.txt") # 負向字典txt檔
# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047
# 把兩個字典拼在一起
LIWC = rbind(P, N)
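A quick structure check on the combined dictionary (per the counts noted above, roughly 664 positive and 1047 negative entries):
str(LIWC)              # a data.frame with word and sentiment columns
table(LIWC$sentiment)  # expected: negative 1047, positive 664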
stock_sentiment_count <-stock_word_count %>%
inner_join(LIWC) %>%
group_by(sentiment,artDate) %>%
summarise(sentiment_number = n()) # counts matched dictionary words per day, not occurrence-weighted
## Joining, by = "word"
## `summarise()` has grouped output by 'sentiment'. You can override using the `.groups` argument.
# Sign the counts: each positive word counts +1, each negative word -1
stock_sentiment_count$sentiment_total = stock_sentiment_count$sentiment_number * ifelse(stock_sentiment_count$sentiment == "positive", 1, -1)
stock_sentiment_count
stock_sentiment_count_plot<-stock_sentiment_count %>%
ggplot(aes(x=reorder(artDate, artDate),y=sentiment_total, fill=sentiment)) +
geom_bar(position = 'identity', stat = "identity")+
theme(axis.text.x = element_text(angle = 90))
stock_sentiment_count_plot
stock_sentiment_count_total<-stock_sentiment_count %>%
group_by(artDate) %>%
summarise(sentiment_sum=sum(sentiment_total))
stock_sentiment_count_total
stock_sentiment_count_total_plot<-stock_sentiment_count_total %>%
ggplot(aes(x=reorder(artDate, artDate),y=sentiment_sum)) +
geom_bar(position = 'identity', stat = "identity")+
theme(axis.text.x = element_text(angle = 90))
stock_sentiment_count_total_plot
# Convert the count DTM to a data frame with an artDate key column for joining
dtm_mod <- as.data.frame(as.matrix(dtm)) %>%
cbind(artDate = rownames(.), .) %>%
transform(artDate = as.Date(artDate))
rownames(dtm_mod) <- 1:nrow(dtm_mod)
dtm_mod
# Same conversion for the tf-idf DTM
dtm_tfidf_mod = as.data.frame(as.matrix(dtm_tfidf)) %>%
cbind(artDate = rownames(.), .) %>%
transform(artDate = as.Date(artDate))
rownames(dtm_tfidf_mod) <- 1:nrow(dtm_tfidf_mod)
dtm_tfidf_mod
# Load the daily price history for ticker 2610
history=fread("C:\\Users\\Davis Liu\\Documents\\R\\SOCIAL MEDIA ANALYSIS\\midterm\\2610_history.csv",header=TRUE,sep=",")
head(history)
# Format the date columns; targetDate is shifted back one day so that, after the join
# below, each article day gets paired with the NEXT trading day's price.
history$targetDate= history$Date %>% as.Date("%Y/%m/%d") - 1
history$Date= history$Date %>% as.Date("%Y/%m/%d")
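A minimal illustration of the one-day shift, on toy dates:
d <- as.Date(c("2021-04-23", "2021-04-26"))
d - 1
# [1] "2021-04-22" "2021-04-25"
# The row dated 2021-04-23 thus exposes targetDate 2021-04-22, so the join below
# attaches 2021-04-23's Close to the 2021-04-22 feature row (i.e. a next-day target).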
# Merge the article features (term counts, tf-idf, sentiment) with the price history
combine<- history %>%
left_join(.,dtm_mod, by=c("Date"="artDate")) %>%
left_join(.,dtm_tfidf_mod, by=c("Date"="artDate")) %>%
left_join(.,stock_sentiment_count_total, by=c("Date"="artDate")) %>%
arrange(desc((Date)))
# target holds the next trading day's price info, keyed by targetDate (= Date - 1)
target<-combine %>%
select(targetDate,Close,Change) %>%
rename(target_price = Close,predict_change=Change)
combine_addtarget<-combine %>%
inner_join(target,by=c("Date"="targetDate")) %>%
replace(is.na(.), 0)
combine_addtarget
# Drop identifiers and target-related columns before fitting
combine_addtarget_input<-combine_addtarget %>%
subset(.,select=-c(Date,targetDate,predict_change,target_price))
combine_addtarget_input
lm_model_all <- glm(formula=combine_addtarget$target_price~., data = combine_addtarget_input)
combine_addtarget$pred_outcome<-predict(lm_model_all,newdata = combine_addtarget_input)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
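With far more term columns than daily rows (14), the unpenalized fit is rank-deficient, which is what the warning above flags. A penalized fit via glmnet (already loaded) is one standard remedy; a minimal sketch, assuming all input columns are numeric:
# Sketch: lasso on the same predictors; glmnet requires a numeric matrix
x <- as.matrix(combine_addtarget_input)
y <- combine_addtarget$target_price
lasso_fit <- glmnet::glmnet(x, y, alpha = 1) # alpha = 1 -> lasso
# coef(lasso_fit, s = 0.1) lists the terms that survive at penalty s = 0.1 (illustrative value)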
error_plot_all<- combine_addtarget %>%
ggplot(aes(x=target_price,y=pred_outcome )) +
geom_point(color = 'red') +
geom_smooth(method="auto", se=TRUE, fullrange=FALSE, level=0.95)
glm.summary = summary(lm_model_all)
coef = as.data.frame(glm.summary$coefficients)
coef$term = row.names(coef)
coef %>%
group_by(Estimate > 0) %>% # two groups: positive vs. non-positive estimates
top_n(10, abs(Estimate)) %>% # top 10 by absolute effect size in each group
ungroup() %>%
ggplot(aes(reorder(term, Estimate), Estimate, fill = Estimate > 0)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
labs(
x = NULL,
title = "Coefficients with the largest positive/negative effect on the stock price")
lm_model_confirm <- lm(formula=combine_addtarget$target_price~確診.x, data = combine_addtarget_input)
summary(lm_model_confirm)
##
## Call:
## lm(formula = combine_addtarget$target_price ~ 確診.x, data = combine_addtarget_input)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8995 -1.0217 -0.2245 1.3035 2.4005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.19949 0.53425 34.07 2.6e-13 ***
## 確診.x 0.20482 0.07728 2.65 0.0212 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.791 on 12 degrees of freedom
## Multiple R-squared: 0.3692, Adjusted R-squared: 0.3166
## F-statistic: 7.023 on 1 and 12 DF, p-value: 0.02117
combine_addtarget$pred_outcome<-predict(lm_model_confirm,newdata = combine_addtarget_input)
error_plot_confirm<- combine_addtarget %>%
ggplot(aes(x=target_price,y=pred_outcome )) +
geom_point(color = 'red') +
geom_smooth(method=lm)
error_plot_confirm
## `geom_smooth()` using formula 'y ~ x'
lm_model_confirm <- lm(formula=combine_addtarget$target_price~sentiment_sum, data = combine_addtarget_input)
summary(lm_model_confirm)
##
## Call:
## lm(formula = combine_addtarget$target_price ~ sentiment_sum,
## data = combine_addtarget_input)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5286 -1.0901 0.3176 1.5705 2.1772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.8286 0.4871 38.656 5.79e-14 ***
## sentiment_sum -0.9019 0.3574 -2.523 0.0267 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.823 on 12 degrees of freedom
## Multiple R-squared: 0.3467, Adjusted R-squared: 0.2922
## F-statistic: 6.368 on 1 and 12 DF, p-value: 0.02674
combine_addtarget$pred_outcome<-predict(lm_model_confirm,newdata = combine_addtarget_input)
error_plot_confirm<- combine_addtarget %>%
ggplot(aes(x=target_price,y=pred_outcome )) +
geom_point(color = 'red') +
geom_smooth(method=lm)
error_plot_confirm
## `geom_smooth()` using formula 'y ~ x'
+ Stock price prediction
# Rebuild the input for price-level prediction, keeping target_price so the formula can
# reference it; pred_outcome (fitted values from above) is dropped to avoid leakage
combine_addtarget_input<-combine_addtarget %>%
subset(.,select=-c(Date,targetDate,predict_change,pred_outcome))
lm_model <- glm(formula=target_price~., data = combine_addtarget_input)
#summary(lm_model)
combine_upordown_input<-combine_addtarget %>%
mutate(up_down = ifelse(predict_change > 0, 1, 0)) %>% # 1 = next-day price rose
subset(.,select=-c(Date,targetDate,target_price,predict_change,pred_outcome))
combine_upordown_input$up_down
## [1] 0 1 1 0 1 1 0 1 1 1 1 0 1 1
lm_model_pn_binary <- glm(formula=up_down~., data = combine_upordown_input,family = "binomial")
#summary(lm_model_pn_binary)
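With only 14 observations a train/test split is not meaningful, but the in-sample fit of the up/down model can still be tabulated as a sanity check:
# In-sample confusion table only; not an out-of-sample evaluation
pred_prob <- predict(lm_model_pn_binary, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
table(predicted = pred_class, actual = combine_upordown_input$up_down)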
+ Article volume trends
+ Sentiment analysis
+ Correlation between stock price and terms